Hi,
I’m new to CUDA programming so this might seem trivial. I am running CUDA on MacOSX with Geforce 8800GT GPU. I have posted my code below. It is a simple program to copy one array into another with an offset. My problem is when I increase the number of threads per block from 128 to 256 or 512, I start to get incorrect results. For example, if number of array elements (N) = 1537 I get correct results with 128 or 256 threads per block but not with 512 threads per block. As far as I know, 512 threads per block is valid.
Can anyone please tell me why I’m getting this kind of behavior in the results and if possible how to resolve it.
__global__ void foo(float* d_A, float* d_B, int offset)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
//int i = threadIdx.x;
d_A[i] = d_B[i+offset];
}
int main(int argc, char **argv)
{
float *A, *B;
float *d_A, *d_B;
unsigned int i, N, cnt, num_grids;
unsigned int timer;
size_t DATA_SIZE;
int offset = 1;
CUT_DEVICE_INIT(argc, argv);
CUT_SAFE_CALL( cutCreateTimer(&timer) );
//N = 1536; //9999872;
if (argc == 2)
{
N = atoi(argv[1]);
}
else
{
N = 0;
}
DATA_SIZE = N * sizeof(float);
printf("Allocating Memory in Host ...\n");
A = (float *)malloc(DATA_SIZE);
B = (float *)malloc(DATA_SIZE);
printf("Allocating Memory in Device (GPU) ... \n");
CUDA_SAFE_CALL( cudaMalloc((void **)&d_A, DATA_SIZE));
CUDA_SAFE_CALL( cudaMalloc((void **)&d_B, DATA_SIZE));
printf("Generating Input data in Host memory ...\n");
for (i = 0; i< N; i++)
{
B[i] = (float) i;
}
printf("Copying Input data to GPU memory ...\n");
CUDA_SAFE_CALL( cudaMemcpy(d_B, B, DATA_SIZE, cudaMemcpyHostToDevice));
printf("Data Initialize done ...\n");
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL( cutResetTimer(timer) );
CUT_SAFE_CALL( cutStartTimer(timer) );
dim3 dimBlock(128);
num_grids = ((int)(N/dimBlock.x) + (N%dimBlock.x == 0?0:1));
dim3 dimGrid(num_grids);
printf("\n Number of elements = %d \n num_grids = %d \n dimGrid => %d * %d = %d \n dimBlock => %d * %d * %d = %d \n", N, num_grids, dimGrid.x, dimGrid.y, dimGrid.x*dimGrid.y, dimBlock.x, dimBlock.y, dimBlock.z, dimBlock.x*dimBlock.y*dimBlock.z);
foo<<<dimGrid, dimBlock>>>(d_A, d_B, offset);
CUT_CHECK_ERROR("GPU execution failed\n");
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL( cutStopTimer(timer) );
printf("GPU time: %f msecs.\n", cutGetTimerValue(timer));
printf("Reading back result from GPU ... \n");
CUDA_SAFE_CALL( cudaMemcpy(A , d_A, DATA_SIZE, cudaMemcpyDeviceToHost));
printf("Free GPU memory ...\n");
CUDA_SAFE_CALL( cudaFree(d_A));
CUDA_SAFE_CALL( cudaFree(d_B));
printf("\n RESULTS:\n");
// Verify results
if(1)
{
printf("\n Correct results:\n");
cnt =0;
for (i = 0; i<N; i++)
{
if (A[i] == B[i+offset])
{
cnt ++;
//printf("\n A[%d] = %f :: B[%d] = %f", i, A[i], i, B[i]);
}
}
printf("\n Total number of correct results = %d\n", cnt);
}
if(1)
{
printf("\n Incorrect results:");
cnt = 0;
for (i = 0; i<N; i++)
{
if (A[i] != B[i+offset])
{
cnt ++;
//printf("\n A[%d] = %f :: B[%d] = %f", i, A[i], i, B[i]);
}
}
printf("\n Total number of incorrect results = %d\n", cnt);
}
return 0;
}