Strange problems with CUDA, cudaMemcpy, etc.

Hi.

I want to write a simple test program that computes the scalar product W.V, where V = A.W and A is a matrix. I don't know if I'm doing something wrong or whether these are nvcc bugs, but:

  1. Let h_W = {0, 0, …, 0} = 0. But after cutilSafeCall(cudaMemcpy(d_W, h_W, MAT_DIM, cudaMemcpyHostToDevice)); I find d_W != 0. Why?

  2. Say my block dimension is (16,32) (16*32 = 512, so that should be fine). I receive a different result when I take e.g. (16,16). Why? (The program shouldn't care about the block dimensions, as long as they are powers of 2.)

  3. For larger dimensions the results differ from those computed on the CPU by about 50% (for small dimensions they are equal). Is something wrong with the program, or with nvcc?

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <cutil_inline.h>

#define BLOCK_SIZE_x 16
#define BLOCK_SIZE_y 16

// CPU reference: accumulates h_c[0] = W^T * A * W
void SumProdCPU(float *h_A, float *h_W, float *h_c, int mat_dim)
{
	h_c[0] = 0;
	for(int x = 0; x < mat_dim; x++)
		for(int y = 0; y < mat_dim; y++)
			h_c[0] += h_W[y] * h_A[y*mat_dim + x] * h_W[x];
}

__global__ void SumProdGPU(float *d_A, float *d_W, float *d_c, int mat_dim)
{
	// Accumulators cache
	__shared__ float accumResult[BLOCK_SIZE_y][BLOCK_SIZE_x];

	float sum = 0;
	int pos;
	float temp;

	// Each thread accumulates a strided partial sum of W[y]*A[y][x]*W[x]
	for(unsigned int pos_y = threadIdx.y; pos_y < mat_dim; pos_y += blockDim.y)
		for(unsigned int pos_x = threadIdx.x; pos_x < mat_dim; pos_x += blockDim.x){
			pos  = pos_y*mat_dim + pos_x;
			temp = d_A[pos]*d_W[pos_x];
			sum += d_W[pos_y]*temp;
		}

	accumResult[threadIdx.y][threadIdx.x] = sum;

	// Reduce along x within each row of the shared-memory tile
	for(unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1){
		__syncthreads();
		if(threadIdx.x < stride)
			accumResult[threadIdx.y][threadIdx.x] += accumResult[threadIdx.y][stride + threadIdx.x];
	}

	// Reduce column 0 along y; thread (0,0) writes the final result
	if(threadIdx.x == 0){
		for(int stride = blockDim.y / 2; stride > 0; stride >>= 1){
			__syncthreads();
			if(threadIdx.y < stride)
				accumResult[threadIdx.y][0] += accumResult[stride + threadIdx.y][0];
		}
		if(threadIdx.y == 0) d_c[0] = accumResult[0][0];
	}
}

const int MAT_DIM   = 2000;
const int MAT_SZ    = MAT_DIM*MAT_DIM*sizeof(float);
const int WEKT_SZ   = MAT_DIM*sizeof(float);
const int RESULT_SZ = sizeof(float);

int main()
{
///////////////////////////////////////////////////////////////////////////////
// Main program
///////////////////////////////////////////////////////////////////////////////
	float *h_A, *h_W, *h_c;
	float *d_A, *d_W, *d_c;
	float wart = 0.1;
	// double delta, ref, sum_delta, sum_ref, L1norm;
	int i;

	// Allocate host and device buffers
	h_A = (float *)malloc(MAT_SZ);
	h_W = (float *)malloc(WEKT_SZ);
	h_c = (float *)malloc(RESULT_SZ);
	cudaMalloc((void **)&d_A, MAT_SZ);
	cudaMalloc((void **)&d_W, WEKT_SZ);
	cudaMalloc((void **)&d_c, RESULT_SZ);

	// Initialize host data and copy it to the device
	for(i = 0; i < MAT_DIM*MAT_DIM; i++)
		h_A[i] = wart;
	cutilSafeCall(cudaMemcpy(d_A, h_A, MAT_SZ, cudaMemcpyHostToDevice));
	for(i = 0; i < MAT_DIM; i++)
		h_W[i] = 0.0;
	cutilSafeCall(cudaMemcpy(d_W, h_W, MAT_DIM, cudaMemcpyHostToDevice));
	cutilSafeCall(cudaThreadSynchronize());

	// Kernel invocation: a single block
	dim3 DimBlock(BLOCK_SIZE_y, BLOCK_SIZE_x);
	SumProdGPU<<<1, DimBlock>>>(d_A, d_W, d_c, MAT_DIM);
	cutilSafeCall(cudaThreadSynchronize());

	printf("Reading back GPU result...\n");
	cudaMemcpy(h_c, d_c, RESULT_SZ, cudaMemcpyDeviceToHost);
	cudaMemcpy(h_W, d_W, WEKT_SZ, cudaMemcpyDeviceToHost);
	printf("Checking GPU results...=%f \n", h_c[0]);
	printf("Checking GPU results... vector =%f \n", h_W[MAT_DIM-5]);

	SumProdCPU(h_A, h_W, h_c, MAT_DIM);
	printf("Checking CPU results...=%f \n", *h_c);

	cudaFree(d_A);
	cudaFree(d_W);
	cudaFree(d_c);
	free(h_A);
	free(h_W);
	free(h_c);
	return 0;
}

cudaMemcpy(d_W, h_W, MAT_DIM, cudaMemcpyHostToDevice)

Do you mean WEKT_SZ instead of MAT_DIM?
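
The third argument of cudaMemcpy is a byte count, so passing MAT_DIM (2000) copies only the first 500 floats of h_W; the rest of d_W is left uninitialized, which is why you read back non-zero values. A minimal sketch of what I think you meant (assuming WEKT_SZ is the size you intended):

	// cudaMemcpy takes the size in bytes, not the number of elements
	cutilSafeCall(cudaMemcpy(d_W, h_W, WEKT_SZ, cudaMemcpyHostToDevice));  // WEKT_SZ = MAT_DIM*sizeof(float)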

Yes, of course, WEKT_SZ; my mistake :).

But there is still a problem with accuracy: the result differs from the real result by about 0.001%.

How is the "real result" calculated? You are doing single precision arithmetic, which gives you only about 6-7 significant decimal digits of repeatable accuracy. Below that you start to get into implementation-specific behaviour and/or deviations from the IEEE 754 specification. The accuracy and rounding modes which CUDA supports are listed in an appendix of the programming guide.
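
To get a feel for how quickly that repeatable accuracy is used up when you accumulate millions of terms, here is a small host-only sketch (plain C, not your kernel) that sums MAT_DIM*MAT_DIM copies of 0.1 in single and in double precision:

	#include <stdio.h>

	int main(void)
	{
		const int n = 2000 * 2000;   // as many terms as a 2000x2000 matrix
		float  sum_f = 0.0f;
		double sum_d = 0.0;
		for (int i = 0; i < n; i++) {
			sum_f += 0.1f;           // single precision: rounding error accumulates with every addition
			sum_d += 0.1;            // double precision: stays very close to the exact answer
		}
		printf("float sum : %f\n", sum_f);
		printf("double sum: %f\n", sum_d);
		printf("exact     : %f\n", n / 10.0);  // n/10.0 is exactly 400000 here
		return 0;
	}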

OK, slowly. First, I'm using compute capability 1.1; 64-bit "double" precision was only introduced in 1.3. 32-bit "float" should give about 10^-6 relative precision, so where does the 0.001% come from, and why does the result change if I switch the block dimensions from e.g. (16,32) to (16,16)? The "real" value is computed easily: in this case the GPU is summing all N*N matrix elements. If all the matrix elements are the same and equal to 'a', then SUM = N*N*a, so I know the EXACT result.

I think you might want to revise your understanding of how IEEE floating point arithmetic works. The relative accuracy you can expect to achieve will be very dependent on the value of your ‘a’.
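
On top of that, the order of the additions matters in floating point: your kernel reduces the partial sums in a different order than the sequential CPU loop, and a different block shape gives yet another order, so small differences in the last digits are exactly what you should expect. A tiny host-only illustration of this order dependence (plain C, not your kernel):

	#include <stdio.h>

	#define N (1 << 20)   /* about a million terms; power of two for the tree reduction */

	static float buf[N];

	int main(void)
	{
		/* sequential accumulation, like the CPU loop */
		float seq = 0.0f;
		for (int i = 0; i < N; i++)
			seq += 0.1f;

		/* pairwise (tree) accumulation, roughly the order a block-wide reduction uses */
		for (int i = 0; i < N; i++)
			buf[i] = 0.1f;
		for (int len = N; len > 1; len /= 2)
			for (int i = 0; i < len / 2; i++)
				buf[i] += buf[i + len / 2];

		printf("sequential: %f\n", seq);
		printf("pairwise  : %f\n", buf[0]);
		printf("exact     : %f\n", N / 10.0);  /* 104857.6 */
		return 0;
	}

On most machines the two printed sums differ visibly; neither one is "wrong", they just accumulate rounding error in a different order.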