Understanding the number of blocks and threads

Hi,

I am a beginner in CUDA programming…

I need a few clarifications to understand the number of blocks and threads in CUDA…

As of now, my understanding is as follows:

[codebox]//---------------------kernel----------------------------

// Copies a[i] into b[i], one element per thread (1D launch).
//
// Each thread builds its global index i from its block index, the block
// size and its position inside the block.
//
// a : device pointer to the source elements
// b : device pointer to the destination elements
// n : treated here as the number of elements in a and b
//     (assumption -- the original body never read n; confirm with the caller)
//
// Threads whose index is >= n do nothing, so a launch that starts more
// threads than there are elements cannot read or write out of bounds.
__global__ void abc(int *a, int *b, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
        b[i] = a[i];
}

//-------------------------------------------------------

// Execution configuration: 1 block containing 16 threads.
numberofblock = 1;
numberofthreadsperblock = 16;

// The closing bracket of an execution configuration is >>>, and the kernel
// takes three arguments (a, b, n), so n has to be passed as well.
// n is presumably the number of elements to copy -- confirm where it is set.
abc<<<numberofblock, numberofthreadsperblock>>>(a, b, n);

[/codebox]

  1. With the launch above, 16 threads are used in block 1 — is that right or wrong?

Another example…

[codebox]//---------------------kernel----------------------------

// Copies a[index] into b[index], one element per thread (2D launch).
//
// i is the thread's global x index and j its global y index; the element
// handled is index = i + j*n, i.e. n is the length of one row.
//
// a : device pointer to the source elements
// b : device pointer to the destination elements
// n : row length; also used as the number of rows
//     (assumption: the data is n x n -- confirm with the caller)
//
// Threads with i >= n or j >= n return without touching memory. Without
// this guard a thread with i >= n would land in a different row, and a
// thread with a large j would index past the end of the arrays.
__global__ void abc(int *a, int *b, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    if (i >= n || j >= n)
        return;

    int index = i + j * n;

    b[index] = a[index];
}

//-------------------------------------------------------

// These two values are not used by the launch below; the launch takes its
// configuration from the dim3 variables.
numberofblock = 2;
numberofthreadsperblock = 16;

// Grid of 2 x 2 blocks, each block holding 16 x 16 threads.
dim3 dimblock(2, 2);
dim3 nthread(16, 16);

// First configuration argument is the grid (blocks), second is the block
// (threads per block); the configuration is closed with >>>.
abc<<<dimblock, nthread>>>(a, b, n);

[/codebox]

  1. How should I think about the blocks here?

    Do I need to think of them as blocks (0,0), (0,1), (1,0), (1,1)…?

    If I think of them like that, then 16 threads are executed in block (0,0)… is that right or wrong?

  2. If my understanding above is wrong, how do dimblock(2,2) and nthread(16,16) relate to each other…?

Thanks in advance,

s.sudhagar. :mellow:

Right/Correct/Yes.

This will launch 1 block which contains 16 threads.

If you have not read the Programming Guide I’d suggest you do so.

thanks zeus13i,

Can you help me understand this…

dim3 dimblock(2,2);

dim3 nthread(16,16);

abc<<<dimblock, nthread>>>(a,b,n);

This launches 2x2=4 blocks with 16x16=256 threads each, such that:

gridDim.x = 2, gridDim.y = 2, gridDim.z = 1

blockDim.x = 16, blockDim.y = 16, blockDim.z = 1

It is almost the same as abc<<<4,256>>>(a,b,n), except for the extra abstraction of dimensionality in your code.

Let me know if that doesn’t fully answer your query.

Again, however, this information can all be found in the Programming Guide.

thanks again…

Have a look at the code below…

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <conio.h>

// Launch geometry for the N x N problem.
const int N = 5;            // indices run from 0 to N-1 in both x and y
const int blocksize = 16;   // threads per block along x

// Every block is blocksize x 1 x 1 threads.
dim3 dimBlock( blocksize, 1, 1 );

// Number of blocks needed to cover N indices in each direction: integer
// division rounded up. With N = 5 and a 16 x 1 block this gives
// (5 - 1 + 16)/16 = 1 block in x and (5 - 1 + 1)/1 = 5 blocks in y.
dim3 dimGrid(
    ( N - 1 + dimBlock.x ) / dimBlock.x,
    ( N - 1 + dimBlock.y ) / dimBlock.y,
    1 );

// compute kernel

// Element-wise sum of two N x N arrays: c[index] = a[index] + b[index].
//
// Each thread handles one element. i is the thread's global x index and
// j its global y index; the element is stored at index = i + j*N, so N is
// the length of one row.
//
// a, b : device pointers to the inputs
// c    : device pointer to the output
// N    : number of rows and of columns
//
// Threads with i >= N or j >= N return without touching memory, so the
// launch may start more threads than there are elements.
__global__
void add_matrix( float* a, float *b, float *c, int N )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    // Surplus threads created by rounding the grid size up have no element.
    if ( i >= N || j >= N )
        return;

    int index = i + j*N;

    c[index] = a[index] + b[index];
}

// Reports a failed CUDA runtime call on stderr and terminates the program.
static void checkCuda( cudaError_t status, const char *what )
{
    if ( status != cudaSuccess ) {
        fprintf( stderr, "%s failed: %s\n", what, cudaGetErrorString( status ) );
        exit( EXIT_FAILURE );
    }
}

// Fills two host arrays of N*N floats, adds them on the GPU with
// add_matrix, copies the result back and prints every element of it.
// Returns 0 on success and 1 if a host allocation fails; any failing CUDA
// call ends the program through checkCuda.
int main() {
    printf("%d\n",N);
    printf("%d\n",dimBlock.x);
    printf("%d\n",dimBlock.y);
    getch();

    // CPU memory allocation: N*N floats per array
    const int count = N * N;
    const size_t size = (size_t)count * sizeof(float);

    float *a = (float *)malloc(size);
    float *b = (float *)malloc(size);
    float *c = (float *)malloc(size);
    if ( a == NULL || b == NULL || c == NULL ) {
        fprintf( stderr, "host allocation failed\n" );
        free( a ); free( b ); free( c );
        return 1;
    }

    // initialize the a and b arrays
    for ( int i = 0; i < count; ++i ) {
        a[i] = 1.0f;
        b[i] = 3.5f;
    }

    // GPU memory allocation
    float *ad = NULL, *bd = NULL, *cd = NULL;
    checkCuda( cudaMalloc( (void**)&ad, size ), "cudaMalloc(ad)" );
    checkCuda( cudaMalloc( (void**)&bd, size ), "cudaMalloc(bd)" );
    checkCuda( cudaMalloc( (void**)&cd, size ), "cudaMalloc(cd)" );

    // copy data to GPU
    checkCuda( cudaMemcpy( ad, a, size, cudaMemcpyHostToDevice ), "cudaMemcpy(ad)" );
    checkCuda( cudaMemcpy( bd, b, size, cudaMemcpyHostToDevice ), "cudaMemcpy(bd)" );

    // execute kernel
    add_matrix<<<dimGrid, dimBlock>>>( ad, bd, cd, N );

    // A launch returns no status itself: an invalid configuration is
    // reported by cudaGetLastError, a fault during execution by the
    // synchronizing call that follows.
    checkCuda( cudaGetLastError(), "add_matrix launch" );

    // block until the device has completed
    // (cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize)
    checkCuda( cudaDeviceSynchronize(), "add_matrix execution" );

    checkCuda( cudaMemcpy( c, cd, size, cudaMemcpyDeviceToHost ), "cudaMemcpy(c)" );

    // Print the data returned to the host. Only count (= N*N) elements
    // exist; the earlier fixed 2 x 16 loop read 32 values and so ran past
    // the end of the 25-element buffer.
    for ( int idx = 0; idx < count; ++idx )
    {
        printf("%d => %f\n", idx, c[idx]);
    }

    getch();

    cudaFree( ad ); cudaFree( bd ); cudaFree( cd );
    free( a ); free( b ); free( c );

    return 0;
}

[/codebox]

Kindly check the screenshot of the above program's output attached to this post…

tcheck.bmp (945 KB)

my confusion starts here…

[codebox] const int N = 5;

const int blocksize = 16;

// dimBlock.x = 16 and dimBlock.y = 1: 16 threads per block, all along x.
dim3 dimBlock( blocksize, 1 );

// Integer division: (5 + 16 - 1)/16 = 1 block in x and (5 + 1 - 1)/1 = 5
// blocks in y, so the grid is 1 x 5 = 5 blocks of 16 threads each.
dim3 dimGrid( (N + dimBlock.x - 1)/dimBlock.x, (N + dimBlock.y - 1)/dimBlock.y );[/codebox]

[codebox] add_matrix<<<dimGrid, dimBlock>>>( ad, bd, cd, N );[/codebox]

[codebox] __global__

void add_matrix( float* a, float *b, float *c, int N )

{

int i = blockIdx.x * blockDim.x + threadIdx.x;

int j = blockIdx.y * blockDim.y + threadIdx.y;

int index = i + j*N;

if ( i < N && j < N )

 c[index] = a[index] + b[index];

}

[/codebox]

as per my understanding,

n=25

dimBlock( blocksize, 1 );=> 16*1= 16 threads per block

dimGrid( (N + dimBlock.x - 1)/dimBlock.x, (N + dimBlock.y - 1)/dimBlock.y ); => (1.25 *1.25)=1.56 block

If I take the number of blocks to be 1, then 16 threads are executed in that one block, so we should get output for c[0]…c[15] only, right? But I got output for c[0]…c[24]…?

I can't find where I made a mistake… kindly help me figure it out…

Thanks in advance…

Problem solved — I can figure out the flow now…

thanks …