Hello everyone,
I am new to CUDA C (GPGPU) programming and found an example in the following PDF:
http://heim.ifi.uio.no/~knutm/geilo2008/seland.pdf (the example is on page 22).
I have also copied the code into this thread:
#include <cstdlib>
#include <cuda_runtime.h>

const int N = 1024;
const int blocksize = 16;

__global__ void add_matrix( float* a, float* b, float* c, int N )
{
    // global 2D index of this thread
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int index = i + j * N;
    if ( i < N && j < N )
        c[index] = a[index] + b[index];
}

int main()
{
    float *a = new float[N*N];
    float *b = new float[N*N];
    float *c = new float[N*N];
    for ( int i = 0; i < N*N; ++i ) {
        a[i] = 1.0f; b[i] = 3.5f;
    }

    float *ad, *bd, *cd;
    const int size = N*N*sizeof(float);
    cudaMalloc( (void**)&ad, size );
    cudaMalloc( (void**)&bd, size );
    cudaMalloc( (void**)&cd, size );
    cudaMemcpy( ad, a, size, cudaMemcpyHostToDevice );
    cudaMemcpy( bd, b, size, cudaMemcpyHostToDevice );

    dim3 dimBlock( blocksize, blocksize );
    dim3 dimGrid( N/dimBlock.x, N/dimBlock.y );
    add_matrix<<<dimGrid, dimBlock>>>( ad, bd, cd, N );

    cudaMemcpy( c, cd, size, cudaMemcpyDeviceToHost );

    cudaFree( ad ); cudaFree( bd ); cudaFree( cd );
    delete[] a; delete[] b; delete[] c;
    return EXIT_SUCCESS;
}
I understand everything except the given block and grid parameters.
dim3 dimBlock( blocksize, blocksize ); // Means to me: 16 * 16 = 256 threads per block. Is that right?
dim3 dimGrid( N/dimBlock.x, N/dimBlock.y ); // Means to me: N/dimBlock.x = 1024/16 = 64 and N/dimBlock.y = 1024/16 = 64, so 64 * 64 = 4096 blocks per grid. Right?
But why such a big 2D grid? With that configuration, 256 threads/block * 4096 blocks = 1,048,576 threads are launched, yet my problem size is only 1024. So why did the author choose such a big grid, or did I misunderstand something in his calculation?
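To make my arithmetic explicit, here is a small host-side sanity check I put together (just a sketch that redoes the numbers above with the dim3 type; it only prints the launch configuration and does not run anything on the GPU):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    const int N = 1024;
    const int blocksize = 16;

    dim3 dimBlock( blocksize, blocksize );        // 16 x 16 = 256 threads per block
    dim3 dimGrid( N/dimBlock.x, N/dimBlock.y );   // 64 x 64 = 4096 blocks in the grid

    unsigned int threadsPerBlock = dimBlock.x * dimBlock.y;
    unsigned int blocksPerGrid   = dimGrid.x  * dimGrid.y;
    unsigned int totalThreads    = threadsPerBlock * blocksPerGrid;

    printf( "threads per block: %u\n", threadsPerBlock );   // 256
    printf( "blocks per grid:   %u\n", blocksPerGrid );     // 4096
    printf( "total threads:     %u\n", totalThreads );      // 1048576
    printf( "N:                 %d\n", N );                 // 1024
    return 0;
}

So the launch covers 1,048,576 threads in total, which is exactly where my confusion about the grid size comes from.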
I hope you can shed some light on this darkness.