Cannot use more than 16*256 threads!

Hi everyone,

I have a problem with CUDA: I cannot use more than 16*256 threads on my system!

Please take a look at this:

dim3 grid(16);
dim3 num_threads(256);

test_kernel<<<grid, num_threads, memsize>>>(pA, pB);

It seems that “test_kernel” does not run at all! But when I reduce “grid” to 15, the app runs normally with the correct result T_T!

(I have tried other values for num_threads and grid; “test_kernel” does not run whenever grid * num_threads >= 16 * 256.)

Is this a bug or something??? Please help me!

Thanks!!!

You might be running out of shared memory. Can’t say for sure without seeing your kernel and how you compute memsize.

What return code does the kernel launch give you? Use the same debugging macros as in the SDK template and sample projects to see…
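For example, something along these lines right after the launch (a minimal sketch using the runtime API directly; an oversized shared memory request shows up as a launch configuration error):

// cudaGetLastError() reports launch configuration problems, such as
// requesting more shared memory than the device provides per block.
test_kernel<<<grid, num_threads, memsize>>>(pA, pB);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    printf("Launch failed: %s\n", cudaGetErrorString(err));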

Christian

Perhaps your kernel writes past the end of an allocated array when grid > 15.
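If so, a bounds guard in the kernel is the usual fix. A sketch (the element count n here is hypothetical, not from your code):

__global__ void test_kernel(float* pA, float* pB, unsigned int n)
{
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)   // skip threads that would write past the end of the array
        pB[tid] = pA[tid];
}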

I’m not sure whether this is a memory problem or not! I’m new to CUDA and don’t know much about it yet (so don’t blame me if I ask some silly questions ^_^).

I’m testing with the “template” project from the CUDA SDK! I’ve changed a few lines of code in the “runTest” function:

void
runTest( int argc, char** argv)
{
    CUT_DEVICE_INIT(argc, argv);

    unsigned int timer = 0;
    CUT_SAFE_CALL( cutCreateTimer( &timer));
    CUT_SAFE_CALL( cutStartTimer( timer));

    // ***************************************************
    unsigned int num_threads = 256;
    UINT num_blocks = 15;
    UINT buffersize = num_blocks * num_threads;
    unsigned int mem_size = sizeof( float) * num_blocks * num_threads;
    // ***************************************************

    // allocate host memory
    float* h_idata = (float*) malloc( mem_size);
    // initialize the memory
    for( unsigned int i = 0; i < buffersize; ++i)
    {
        h_idata[i] = (float) i;
    }

    // allocate device memory
    float* d_idata;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, mem_size));
    // copy host memory to device
    CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, mem_size,
                                cudaMemcpyHostToDevice) );

    // allocate device memory for result
    float* d_odata;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_odata, mem_size));

    // setup execution parameters
    dim3 grid( num_blocks);
    dim3 threads( num_threads);

    // execute the kernel
    testKernel<<< grid, threads, mem_size >>>( d_idata, d_odata);

    // check if kernel execution generated an error
    CUT_CHECK_ERROR("Kernel execution failed");

    // allocate mem for the result on host side
    float* h_odata = (float*) malloc( mem_size);
    // copy result from device to host
    CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_odata, sizeof( float) * num_threads * num_blocks,
                                cudaMemcpyDeviceToHost) );

    CUT_SAFE_CALL( cutStopTimer( timer));
    printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));
    CUT_SAFE_CALL( cutDeleteTimer( timer));

    // ***************************************************
    // For testing only
    puts("\nData:\n");
    for( UINT i = 0; i < buffersize; ++i)
    {
        printf("%f ", h_odata[i]);
    }
    // ***************************************************

    // compute reference solution
    float* reference = (float*) malloc( mem_size);
    computeGold( reference, h_idata, num_threads);

    // check result
    if( cutCheckCmdLineFlag( argc, (const char**) argv, "regression"))
    {
        // write file for regression test
        CUT_SAFE_CALL( cutWriteFilef( "./data/regression.dat",
                                      h_odata, num_threads, 0.0));
    }
    else
    {
        // custom output handling when no regression test is running;
        // in this case check if the result is equivalent to the expected solution
        CUTBoolean res = cutComparef( reference, h_odata, num_threads);
        printf( "Test %s\n", (1 == res) ? "PASSED" : "FAILED");
    }

    // cleanup memory
    free( h_idata);
    free( h_odata);
    free( reference);
    CUDA_SAFE_CALL( cudaFree( d_idata));
    CUDA_SAFE_CALL( cudaFree( d_odata));
}

With the above code, the app runs normally (“Test PASSED”)! But if I set “num_blocks” to 16, it prints “Test FAILED”… T_T!

These are my graphics card properties:

name : GeForce 9300M G
totalGlobalMem : 268435456
sharedMemPerBlock : 16384
regsPerBlock : 8192
warpSize : 32
memPitch : 262144
maxThreadsPerBlock : 512
maxThreadsDim : 512x512x64
maxGridSize : 65535x65535x1
totalConstMem : 65536
major : 1
minor : 1
clockRate : 800000
textureAlignment : 256
deviceOverlap : 0
multiProcessorCount : 2

Thanks for your attention ^_^!!!

In debug mode it will show you the computed values and compare them against the values computed on the CPU. Maybe that will give an indication of what went wrong.

Christian

You ask for num_blocks * num_threads * sizeof(float) bytes of shared memory.

15 * 256 * 4 = 15360 bytes of shared memory.

16 * 256 * 4 = 16384 bytes of shared memory. And since kernel parameters also take up shared memory space, this request is too high: you only have 16384 bytes of shared memory available per block.
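You can confirm this at runtime with cudaGetDeviceProperties (a sketch, assuming device 0 and the variable names from your code):

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
size_t requested = num_blocks * num_threads * sizeof(float);  // the launch's 3rd parameter
if (requested > prop.sharedMemPerBlock)
    printf("Requesting %u bytes of shared memory per block, only %u available\n",
           (unsigned) requested, (unsigned) prop.sharedMemPerBlock);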

If you’re dynamically allocating shared memory at kernel launch (with the third launch parameter), you should request the number of bytes of shared memory for one block, not for the entire grid.
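In the template that would look roughly like this (a sketch; the SDK’s testKernel declares its dynamic shared memory with extern __shared__):

// The kernel's unsized shared array gets its byte count from the third
// launch parameter, and every block receives its own copy.
extern __shared__ float sdata[];

// So each block only needs room for its own num_threads floats:
unsigned int smem_size = num_threads * sizeof(float);  // 256 * 4 = 1024 bytes per block
testKernel<<< grid, threads, smem_size >>>( d_idata, d_odata);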

  • Thanks everyone, I’ve solved the problem ^^! I’ve removed “mem_size” from the code and… it worked ^^!
  • The third parameter is the memory per block, not the memory of the full array… T_T!!! How careless I am… T_T!

Thanks again!!!