Hi,
I’m new to CUDA and still have problems with the number of threads and blocks that we’re defining, I have Geforce 9800 GT and I want to run the following code for testing:
#include <stdio.h>
#include <iostream>
#include "cuPrintf.cu"
#define imin(a,b) (a<b?a:b)
const int N = 33 * 1024;
const int threadsPerBlock = 256;
const int blocksPerGrid =
imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
__global__ void dot(float *a, float *b, float *c )
{
cuPrintf("Hello, world from the device!\n");
}
int main(void)
{
cudaDeviceProp prop;
int count;
cudaGetDeviceCount(&count);
for(int i=0; i<count; i++)
{
cudaGetDeviceProperties(&prop, i);
std::cout<< "General info for device"<< std::endl;
std::cout << "Name:" << prop.name <<std::endl;
std::cout << "Clock rate:" << prop.clockRate <<std::endl;
std::cout << "Total global mem:" << prop.totalGlobalMem <<std::endl;
std::cout << "MaxGridSize:" << prop.maxGridSize[0] <<std::endl;
std::cout << "maxThreadsPerBlock" << prop.maxThreadsPerBlock <<std::endl;
}
float *a, *b, c, *partial_c;
float *dev_a, *dev_b, *dev_partial_c;
// allocate memory on the cpu side
a = (float*)malloc( N*sizeof(float) );
b = (float*)malloc( N*sizeof(float) );
partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );
// allocate the memory on the GPU
cudaMalloc( (void**)&dev_a,
N*sizeof(float) ) ;
cudaMalloc( (void**)&dev_b,
N*sizeof(float) ) ;
cudaMalloc( (void**)&dev_partial_c,
blocksPerGrid*sizeof(float) ) ;
// fill in the host memory with data
for (int i=0; i<N; i++) {
a[i] = i;
b[i] = i*2;
}
// copy the arrays 'a' and 'b' to the GPU
cudaMemcpy( dev_a, a, N*sizeof(float),
cudaMemcpyHostToDevice ) ;
cudaMemcpy( dev_b, b, N*sizeof(float),
cudaMemcpyHostToDevice ) ;
// greet from the host
printf("Hello, world from the host!\n");
std::cout << threadsPerBlock<< std::endl;
// initialize cuPrintf
cudaPrintfInit();
// launch a kernel with a single thread to greet from the device
dot<<<16,256>>>(dev_a,dev_b,dev_partial_c);
// display the device's greeting
cudaPrintfDisplay();
// clean up after cuPrintf
cudaPrintfEnd();
return 0;
}
but it seems that it’s not running the kernel code at all, but if I change the Grid size to 8 then it will print from the device code. but as I know it shouldn’t be like this, can you help me?