I’m not sure this is memory problem or not! I’m new to CUDA and i haven’t known about it much (so don’t blame me if i aske some silly questions ^_^)
I’m testing with “template” project in CUDA SDK! I’ve changed a few line of code in “runTest” function!
void
runTest( int argc, char** argv)
{
�   CUT_DEVICE_INIT(argc, argv);
�   unsigned int timer = 0;
�   CUT_SAFE_CALL( cutCreateTimer( &timer));
�   CUT_SAFE_CALL( cutStartTimer( timer));
�   // ***************************************************
�   unsigned int num_threads = 256;
�   UINT num_blocks = 15;
�   UINT buffersize = num_blocks*num_threads;
�   unsigned int mem_size = sizeof( float)*num_blocks * num_threads;
�   // ***************************************************
�   // allocate host memory
�   float* h_idata = (float*) malloc( mem_size);
�   // initalize the memory
�   for( unsigned int i = 0; i < buffersize; ++i)
�   {
�  �  �   h_idata[i] = (float) i;
�   }
�   // allocate device memory
�   float* d_idata;
�   CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, mem_size));
�   // copy host memory to device
�   CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, mem_size,
�  �  �  �  �  �  �  �  �  �  �  �  �  �  �   cudaMemcpyHostToDevice) );
�   // allocate device memory for result
�   float* d_odata;
�   CUDA_SAFE_CALL( cudaMalloc( (void**) &d_odata, mem_size));
�   // setup execution parameters
�   dim3  grid(num_blocks);
�   dim3  threads( num_threads);
�   // execute the kernel
�   testKernel<<< grid, threads, mem_size >>>( d_idata, d_odata);
�   // check if kernel execution generated and error
�   CUT_CHECK_ERROR("Kernel execution failed");
�   // allocate mem for the result on host side
�   float* h_odata = (float*) malloc( mem_size);
�   // copy result from device to host
�   CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_odata, sizeof( float) * num_threads*num_blocks,
�  �  �  �  �  �  �  �  �  �  �  �  �  �  �   cudaMemcpyDeviceToHost) );
�   CUT_SAFE_CALL( cutStopTimer( timer));
�   printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));
�   CUT_SAFE_CALL( cutDeleteTimer( timer));
�   // ***************************************************
�   // For testing only
�   puts("\nData:\n");
�   for(UINT i = 0;i < buffersize;++i)
�   {
�  �  �   printf("%f ",h_odata[i]);
�   }
�   // ***************************************************
�   // compute reference solution
�   float* reference = (float*) malloc( mem_size);
�   computeGold( reference, h_idata, num_threads);
�   // check result
�   if( cutCheckCmdLineFlag( argc, (const char**) argv, "regression"))
�   {
�  �  �   // write file for regression test
�  �  �   CUT_SAFE_CALL( cutWriteFilef( "./data/regression.dat",
�  �  �  �  �  �  �  �  �  �  �  �  �  �  �  �  �  �   h_odata, num_threads, 0.0));
�   }
�   else
�   {
�  �  �   // custom output handling when no regression test running
�  �  �   // in this case check if the result is equivalent to the expected soluion
�  �  �   CUTBoolean res = cutComparef( reference, h_odata, num_threads);
�  �  �   printf( "Test %s\n", (1 == res) ? "PASSED" : "FAILED");
�   }
�   // cleanup memory
�   free( h_idata);
�   free( h_odata);
�   free( reference);
�   CUDA_SAFE_CALL(cudaFree(d_idata));
�   CUDA_SAFE_CALL(cudaFree(d_odata));
}
With the above code, the app run normally (“Test PASSED”)! But, if i set “num_blocks” to 16, the app will prompt “Test FAILED”… T_T!
This is my graphics card properties:
name : GeForce 9300M G
totalGlobalMem 268435456
sharedMemPerBlock : 16384
regsPerBlock : 8192
warpSize : 32
memPitch : 262144
maxThreadsPerBlock : 512
maxThreadsDim : 512x512x64
maxGridSize : 65535x65535x1
totalConstMem : 65536
major : 1
minor : 1
clockRate 800000
textureAligment : 256
deviceOverlap : 0
multiProcessorCount : 2
Thank for your attention ^_^ !!!