Hi,

I am a beginner in CUDA programming…

I need a few clarifications to understand the number of blocks and threads in CUDA…

As of now, my understanding is as follows:

[codebox]//---------------------kernel----------------------------

// Copies a[i] into b[i], one element per thread.
// Expected launch: 1D grid of 1D blocks; each thread handles the element at
// blockIdx.x * blockDim.x + threadIdx.x.
// NOTE(review): n is assumed to be the number of elements in a and b - confirm with the caller.
__global__ void abc(int *a, int *b, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // The launch may contain more threads than elements; surplus threads must
    // not read or write past the end of the arrays.
    if (i < n)
        b[i] = a[i];
}

//-------------------------------------------------------

numberofblock=1;

// Execution configuration is kernel<<<blocks, threadsPerBlock>>>(args...);
// the kernel is declared with three parameters (a, b, n), so all three are passed.
abc<<<numberofblock, numberofthreadsperblock>>>(a, b, n);

[/codebox]

1. When launched as above, 16 threads are used in block 1. Is that right or wrong?

Another example…

[codebox]//---------------------kernel----------------------------

// Copies a into b element by element using a 2D launch.
// The arrays are addressed as index = i + j * n, i.e. n is the row length,
// i (x direction) is the column and j (y direction) is the row.
// NOTE(review): only n is passed, so the row guard below assumes a square
// n x n array - confirm with the caller.
__global__ void abc(int *a, int *b, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // column
    int j = blockIdx.y * blockDim.y + threadIdx.y;  // row

    // Without this guard a thread with i >= n would alias an element of the
    // next row, and threads past the last row would access memory out of bounds.
    if (i < n && j < n) {
        int index = i + j * n;
        b[index] = a[index];
    }
}

//-------------------------------------------------------

// Plain variable; nothing in this snippet passes it to a kernel launch.
numberofblock=2;

// dim3 with x = 2 and y = 2; the unspecified z component defaults to 1.
// NOTE(review): the launch statement is not shown here - confirm whether this
// value is used as the grid (number of blocks) or the block (threads) dimension.
dim3 dimblock(2,2);

[/codebox]

1. How should I think about the blocks here?

Do I need to consider them as blocks (0,0), (0,1), (1,0), (1,1) …?

If I consider them as above…

in block (0,0), 16 threads are executed… Is that right or wrong?

2. If my understanding above is wrong, how do dimblock(2,2) and nthread(16,16) relate…?

s.sudhagar. :mellow:

Right/Correct/Yes.

This will launch 1 block which contains 16 threads.

If you have not read the Programming Guide I’d suggest you do so.

thanks zeus13i,

Can you help me understand this…

dim3 dimblock(2,2);

This launches 2x2=4 blocks, each with 16x16=256 threads, such that:

gridDim.x = 2, gridDim.y = 2, gridDim.z = 1

blockDim.x = 16, blockDim.y = 16, blockDim.z = 1

It is almost the same as abc<<<4,256>>>(a,b,n), except for the extra abstraction of dimensionality in your code.

Again, however, this information can all be found in the Programming Guide.

thanks again…

Have a look at the code below…

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <conio.h>

// Launch configuration for an N x N problem.

// Edge length of the square matrices.
const int N = 5;

// Number of threads along x in each block.
const int blocksize = 16;

// Block shape: blocksize threads in x, 1 thread in y.
dim3 dimBlock(blocksize, 1);

// Grid shape: per-axis ceiling division of N by the block extent. The
// arithmetic is integer, so a partially filled block still counts as one block.
dim3 dimGrid(
    (N + dimBlock.x - 1) / dimBlock.x,
    (N + dimBlock.y - 1) / dimBlock.y
);

// compute kernel

// Element-wise sum of two N x N matrices stored row by row: c = a + b.
// Expected launch: 2D grid; the x direction maps to column i and the
// y direction to row j. Threads that fall outside the N x N range do nothing,
// so the grid may be larger than the matrix.
__global__
void add_matrix( float* a, float *b, float *c, int N )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // column
    int j = blockIdx.y * blockDim.y + threadIdx.y;  // row

    int index = i + j*N;

    // Guard against the surplus threads created by rounding the grid up.
    if ( i < N && j < N )
        c[index] = a[index] + b[index];
}

// Prints a message and terminates if a CUDA runtime call did not succeed.
static void checkCuda( cudaError_t status, const char *what )
{
    if ( status != cudaSuccess ) {
        fprintf( stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString( status ) );
        exit( EXIT_FAILURE );
    }
}

// Adds two N x N matrices on the GPU with add_matrix and prints every element
// of the result. Returns 0 on success; exits with a failure status if host
// allocation or any CUDA call fails.
int main() {
    printf("%d\n",N);
    printf("%u\n",dimBlock.x);   // dim3 members are unsigned
    printf("%u\n",dimBlock.y);
    getch();                     // wait for a key press (conio.h)

    // CPU memory allocation
    const int count = N * N;                     // elements per matrix
    const size_t size = count * sizeof(float);   // bytes per matrix

    float *a = (float *)malloc(size);
    float *b = (float *)malloc(size);
    float *c = (float *)malloc(size);
    if ( a == NULL || b == NULL || c == NULL ) {
        fprintf( stderr, "host allocation failed\n" );
        free( a ); free( b ); free( c );
        return EXIT_FAILURE;
    }

    // initialize the a and b arrays
    for ( int i = 0; i < count; ++i ) {
        a[i] = 1.0f;
        b[i] = 3.5f;
    }

    // GPU memory allocation (one device buffer per host array)
    float *ad = NULL, *bd = NULL, *cd = NULL;
    checkCuda( cudaMalloc( (void**)&ad, size ), "cudaMalloc(ad)" );
    checkCuda( cudaMalloc( (void**)&bd, size ), "cudaMalloc(bd)" );
    checkCuda( cudaMalloc( (void**)&cd, size ), "cudaMalloc(cd)" );

    // copy data to GPU
    checkCuda( cudaMemcpy( ad, a, size, cudaMemcpyHostToDevice ), "cudaMemcpy(ad)" );
    checkCuda( cudaMemcpy( bd, b, size, cudaMemcpyHostToDevice ), "cudaMemcpy(bd)" );

    // execute kernel; launch-configuration errors are only visible through
    // cudaGetLastError(), not through a return value of the launch itself
    add_matrix<<< dimGrid, dimBlock >>>( ad, bd, cd, N );
    checkCuda( cudaGetLastError(), "add_matrix launch" );

    // this copy blocks until the device has completed the kernel
    checkCuda( cudaMemcpy( c, cd, size, cudaMemcpyDeviceToHost ), "cudaMemcpy(c)" );

    // print the data returned to the host; only N*N elements exist, so the
    // loop must not run past count
    for ( int k = 0; k < count; k++ ) {
        printf("%d => %f\n", k, c[k]);
    }

    getch();

    cudaFree( ad ); cudaFree( bd ); cudaFree( cd );
    free( a ); free( b ); free( c );

    return 0;
}

[/codebox]

Kindly check the screenshot of the above program attached to this post…

tcheck.bmp (945 KB)

my confusion starts here…

[codebox] const int N = 5;

const int blocksize = 16;

// Block shape: 16 threads in x, 1 thread in y.
dim3 dimBlock( blocksize, 1 );

// Each grid axis is computed separately with integer (truncating) division:
//   x = (5 + 16 - 1) / 16 = 20 / 16 = 1
//   y = (5 +  1 - 1) /  1 =  5 /  1 = 5
dim3 dimGrid( (N + dimBlock.x - 1)/dimBlock.x, (N + dimBlock.y - 1)/dimBlock.y );[/codebox]

[codebox] // Element-wise sum of two N x N matrices stored row by row: c = a + b.
// The x direction of the launch maps to column i, the y direction to row j;
// threads outside the N x N range do nothing.
__global__
void add_matrix( float* a, float *b, float *c, int N )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // column
    int j = blockIdx.y * blockDim.y + threadIdx.y;  // row

    int index = i + j*N;

    // Guard against the surplus threads created by rounding the grid up.
    if ( i < N && j < N )
        c[index] = a[index] + b[index];
}

[/codebox]

as per my understanding,

n=25

dimBlock( blocksize, 1 );=> 16*1= 16 threads per block

dimGrid( (N + dimBlock.x - 1)/dimBlock.x, (N + dimBlock.y - 1)/dimBlock.y ); => (1.25 *1.25)=1.56 block

If I consider block = 1,

16 threads are executed in 1 block, so we should get output for c[0]…c[15] only, right? But I got output for c[0]…c[24]…?

I can't find out where I made a mistake… kindly help me figure it out…