Limitations on blocks and threads

Hi,

I’m new to CUDA and still confused about the limits on the number of threads and blocks we can define. I have a GeForce 9800 GT, and I want to run the following code as a test:

#include <stdio.h>
#include <iostream>
#include "cuPrintf.cu"

#define imin(a,b) (a<b?a:b)

const int N = 33 * 1024;
const int threadsPerBlock = 256;
const int blocksPerGrid = imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );

// dot-product kernel, with its body stripped down to a single greeting for this test
__global__ void dot( float *a, float *b, float *c )
{
    cuPrintf("Hello, world from the device!\n");
}

int main( void )
{
    // print the limits each device reports
    cudaDeviceProp prop;
    int count;
    cudaGetDeviceCount( &count );
    for (int i=0; i<count; i++) {
        cudaGetDeviceProperties( &prop, i );
        std::cout << "General info for device " << i << std::endl;
        std::cout << "Name: " << prop.name << std::endl;
        std::cout << "Clock rate: " << prop.clockRate << std::endl;
        std::cout << "Total global mem: " << prop.totalGlobalMem << std::endl;
        std::cout << "Max grid size (x): " << prop.maxGridSize[0] << std::endl;
        std::cout << "Max threads per block: " << prop.maxThreadsPerBlock << std::endl;
    }

    float *a, *b, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the CPU side
    a = (float*)malloc( N*sizeof(float) );
    b = (float*)malloc( N*sizeof(float) );
    partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );

    // allocate the memory on the GPU
    cudaMalloc( (void**)&dev_a, N*sizeof(float) );
    cudaMalloc( (void**)&dev_b, N*sizeof(float) );
    cudaMalloc( (void**)&dev_partial_c, blocksPerGrid*sizeof(float) );

    // fill in the host memory with data
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    // copy the arrays 'a' and 'b' to the GPU
    cudaMemcpy( dev_a, a, N*sizeof(float), cudaMemcpyHostToDevice );
    cudaMemcpy( dev_b, b, N*sizeof(float), cudaMemcpyHostToDevice );

    // greet from the host
    printf("Hello, world from the host!\n");
    std::cout << threadsPerBlock << std::endl;

    // initialize cuPrintf
    cudaPrintfInit();

    // launch the kernel with 16 blocks of 256 threads each
    dot<<<16,256>>>( dev_a, dev_b, dev_partial_c );

    // display the device's greetings, then clean up after cuPrintf
    cudaPrintfDisplay();
    cudaPrintfEnd();

    // free the memory on the GPU and CPU side
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_partial_c );
    free( a );
    free( b );
    free( partial_c );

    return 0;
}
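
I compile it with something like the line below (sm_11 is my guess at the 9800 GT's compute capability; the file name is just a placeholder, and cuPrintf.cu from the SDK has to be in the include path):

    nvcc -arch=sm_11 -I. dot_test.cu -o dot_test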

When I run it, the kernel doesn't seem to execute at all: nothing is printed from the device. But if I change the grid size from 16 to 8, the device greeting does print. As far as I understand, it shouldn't behave like this. Can you help me?
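
In case it matters, this is how I was planning to check whether the launch itself fails (a minimal sketch using the runtime's standard error-reporting calls; on older toolkits cudaThreadSynchronize would take the place of cudaDeviceSynchronize):

    // right after the kernel launch in main():
    dot<<<16,256>>>( dev_a, dev_b, dev_partial_c );

    // did the runtime reject the launch (e.g. a bad configuration)?
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf( "launch error: %s\n", cudaGetErrorString(err) );

    // wait for the kernel to finish and check for execution errors
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        printf( "execution error: %s\n", cudaGetErrorString(err) );

I also wonder whether the cuPrintf buffer could simply be filling up, since 16*256 = 4096 threads are all printing. If I read cuPrintf.cu correctly, cudaPrintfInit takes an optional buffer length (the default seems to be about 1 MB), so something like this should enlarge it:

    cudaPrintfInit( 4 * 1024 * 1024 );   // assumption: optional bufferLen argument

Is that the right way to think about it?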