Unexpected behavior with varying number of threads per block

Hi,

I’m new to CUDA programming so this might seem trivial. I am running CUDA on MacOSX with Geforce 8800GT GPU. I have posted my code below. It is a simple program to copy one array into another with an offset. My problem is when I increase the number of threads per block from 128 to 256 or 512, I start to get incorrect results. For example, if number of array elements (N) = 1537 I get correct results with 128 or 256 threads per block but not with 512 threads per block. As far as I know, 512 threads per block is valid.

Can anyone please tell me why I’m getting this kind of behavior in the results and if possible how to resolve it.

__global__ void foo(float* d_A, float* d_B, int offset)

{

  int i = threadIdx.x + blockDim.x * blockIdx.x;

  //int i = threadIdx.x;

  d_A[i] = d_B[i+offset];

}

int main(int argc, char **argv)

{

  float *A, *B;

  float *d_A, *d_B;

  unsigned int i, N, cnt, num_grids;

  unsigned int timer;

  size_t DATA_SIZE;

  int offset = 1;

CUT_DEVICE_INIT(argc, argv);

  CUT_SAFE_CALL( cutCreateTimer(&timer) );

	  

  //N = 1536; //9999872;

  if (argc == 2)

  {

	N = atoi(argv[1]);

  }

  else

  {

	N = 0;

  }

  DATA_SIZE = N * sizeof(float);

	 

  printf("Allocating Memory in Host ...\n");

  A = (float *)malloc(DATA_SIZE);

  B = (float *)malloc(DATA_SIZE);

printf("Allocating Memory in Device (GPU) ... \n");

  CUDA_SAFE_CALL( cudaMalloc((void **)&d_A, DATA_SIZE));

  CUDA_SAFE_CALL( cudaMalloc((void **)&d_B, DATA_SIZE));

printf("Generating Input data in Host memory ...\n");

  for (i = 0; i< N; i++)

  {

	B[i] = (float) i;

  }

printf("Copying Input data to GPU memory ...\n");

  CUDA_SAFE_CALL( cudaMemcpy(d_B, B, DATA_SIZE, cudaMemcpyHostToDevice));

  printf("Data Initialize done ...\n");

CUDA_SAFE_CALL( cudaThreadSynchronize() );

  CUT_SAFE_CALL( cutResetTimer(timer) );

  CUT_SAFE_CALL( cutStartTimer(timer) );

dim3 dimBlock(128);

  num_grids = ((int)(N/dimBlock.x) + (N%dimBlock.x == 0?0:1));

	

  dim3 dimGrid(num_grids);

printf("\n Number of elements = %d \n num_grids = %d \n dimGrid => %d * %d = %d \n dimBlock => %d * %d * %d = %d \n", N, num_grids, dimGrid.x, dimGrid.y, dimGrid.x*dimGrid.y, dimBlock.x, dimBlock.y, dimBlock.z, dimBlock.x*dimBlock.y*dimBlock.z);

foo<<<dimGrid, dimBlock>>>(d_A, d_B, offset);

CUT_CHECK_ERROR("GPU execution failed\n");

  CUDA_SAFE_CALL( cudaThreadSynchronize() );

  CUT_SAFE_CALL( cutStopTimer(timer) );

  printf("GPU time: %f msecs.\n", cutGetTimerValue(timer));

printf("Reading back result from GPU ... \n");

  CUDA_SAFE_CALL( cudaMemcpy(A , d_A, DATA_SIZE, cudaMemcpyDeviceToHost));

printf("Free GPU memory ...\n");

  CUDA_SAFE_CALL( cudaFree(d_A));

  CUDA_SAFE_CALL( cudaFree(d_B));

printf("\n RESULTS:\n");

// Verify results

  if(1)

  {

	 printf("\n Correct results:\n");

	 cnt =0;

	 for (i = 0; i<N; i++)

	 {

		if (A[i] == B[i+offset])

		{

		  cnt ++;

		  //printf("\n A[%d] = %f :: B[%d] = %f", i, A[i], i, B[i]);

		}

	}

	printf("\n Total number of correct results = %d\n", cnt);

  }

if(1)

  {

	printf("\n Incorrect results:");

	cnt = 0;

	for (i = 0; i<N; i++)

	{

	   if (A[i] != B[i+offset])

	  {

		  cnt ++;

		  //printf("\n A[%d] = %f :: B[%d] = %f", i, A[i], i, B[i]);

	  }

	}

	printf("\n Total number of incorrect results = %d\n", cnt);

  }	

return 0;

}

Dont know if this will fix your problem, but im thinking it might

First, when calculating the size of the grid, what i do is : dim= ceil((float)N/(float)dimblock) that way youre sure to launch enough threads.

Next, when writing to global memory, make sure you dont go out of bounds by doing:
if(i<N)
d_A[i]=whatever

thanks for the reply… checking for the bounds [ if (i<N) ]helped…