Problem with 2-dimensional thread blocks

Hi all!

I'm having some trouble with 2-dimensional thread blocks. In fact, it seems that I can’t use the threadIdx.y coordinate. The following example shows my issue.

bi_thread_block.cu:

#define SIZE 10

#include <stdio.h>

// Kernel definition

// Each thread computes an index from its threadIdx coordinates only
// (row width hard-coded to 5; blockIdx is not used) and stores that
// index at the matching position of the buffer.
//
// device: pointer to an int buffer in device memory; it must hold at
//         least as many elements as the largest index produced.
__global__ void add(int* device)
{
    int i = 5 * threadIdx.y + threadIdx.x;

    device[i] = i;
}

// Host driver: copies a zero-filled SIZE-element array to the device,
// prints it, launches add, copies the buffer back and prints it again.
int main()
{
    int A[SIZE] = {0};  // host array, all elements zero-initialised
    int *devPtrA;       // device copy of A
    int memsize = SIZE * sizeof(int);

    cudaMalloc((void**)&devPtrA, memsize);
    cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);

    // Contents before the kernel runs.
    for (int i = 0; i < SIZE; i++)
        printf("A[%d]=%d\n", i, A[i]);

    printf("\n");

    // Execution configuration: 2 blocks of 5 threads each.
    add<<<2, 5>>>(devPtrA);

    // Blocking copy back to the host.
    cudaMemcpy(A, devPtrA, memsize, cudaMemcpyDeviceToHost);

    // Contents after the kernel has run.
    for (int i = 0; i < SIZE; i++)
        printf("A[%d]=%d\n", i, A[i]);

    cudaFree(devPtrA);

    return 0;
}

Command I use to compile:

nvcc -o bi_thread_block bi_thread_block.cu

./bi_thread_block output:

A[0]=0

A[1]=0

A[2]=0

A[3]=0

A[4]=0

A[5]=0

A[6]=0

A[7]=0

A[8]=0

A[9]=0

A[0]=0

A[1]=1

A[2]=2

A[3]=3

A[4]=4

A[5]=0

A[6]=0

A[7]=0

A[8]=0

A[9]=0

The first 5 elements are modified by “add”, while the other 5 are not. I’ve also tried to use only the y coordinate calling add<<<10, 1>>>(devPtrA) and changing “add” to

// Variant that uses only the y coordinate: each thread stores its
// threadIdx.y at position threadIdx.y of the buffer.
//
// device: pointer to an int buffer in device memory.
__global__ void add(int* device)
{
    int i = threadIdx.y;

    device[i] = i;
}

but it doesn’t work either. Does anyone have any idea?

Thanks a lot!

Giacomo

This line is wrong:

int i =5*threadIdx.y + threadIdx.x;

This only gives you the index of the thread in a block

it should be:

int index_x = blockIdx.x * blockDim.x + threadIdx.x;

  int index_y = blockIdx.y * blockDim.y + threadIdx.y;

// map the two 2D indices to a single linear, 1D index

  int grid_width = gridDim.x * blockDim.x;

  int i = index_y * grid_width + index_x;

ripped from here http://code.google.com/p/stanford-cs193g-sp2010/wiki/TutorialMultidimensionalKernelLaunch

Thank you pasoleatis! You’re right!

Actually the problem is not really solved because I still can’t use the y coordinate. In the following example I investigate the behaviour of index_x, index_y and index_y * grid_width + index_x:

bi_thread_block.cu:

// Kernel definition

// Diagnostic kernel: records the global x index, the global y index and
// the linearised 2D index computed by each thread.
//
// device:  receives the linear index at position [index]
// ydevice: receives index_y at position [index_y]
// xdevice: receives index_x at position [index_x]
// All three point to int buffers in device memory.
__global__ void add(int* device, int* ydevice, int* xdevice)
{
    // Global coordinates of this thread within the whole grid.
    int index_x = blockIdx.x * blockDim.x + threadIdx.x;
    int index_y = blockIdx.y * blockDim.y + threadIdx.y;

    // map the two 2D indices to a single linear, 1D index
    int grid_width = gridDim.x * blockDim.x;
    int index = index_y * grid_width + index_x;

    device[index] = index;
    ydevice[index_y] = index_y;
    xdevice[index_x] = index_x;
}

// Host driver: copies three zero-filled SIZE-element arrays to the
// device, launches add on them, copies the results back and prints
// A (linear index), B (y index) and C (x index).
int main()
{
    // Host arrays, all elements zero-initialised.
    int A[SIZE] = {0}, B[SIZE] = {0}, C[SIZE] = {0};
    int *devPtrA, *devPtrB, *devPtrC;  // device copies of A, B, C
    int memsize = SIZE * sizeof(int);

    cudaMalloc((void**)&devPtrA, memsize);
    cudaMalloc((void**)&devPtrB, memsize);
    cudaMalloc((void**)&devPtrC, memsize);

    cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);
    cudaMemcpy(devPtrB, B, memsize, cudaMemcpyHostToDevice);
    cudaMemcpy(devPtrC, C, memsize, cudaMemcpyHostToDevice);

    // Execution configuration: 5 blocks of 2 threads each.
    add<<<5, 2>>>(devPtrA, devPtrB, devPtrC);

    // Blocking copies back to the host.
    cudaMemcpy(A, devPtrA, memsize, cudaMemcpyDeviceToHost);
    cudaMemcpy(B, devPtrB, memsize, cudaMemcpyDeviceToHost);
    cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost);

    for (int i = 0; i < SIZE; i++)
        printf("A[%d]=%d\n", i, A[i]);

    for (int i = 0; i < SIZE; i++)
        printf("B[%d]=%d\n", i, B[i]);

    for (int i = 0; i < SIZE; i++)
        printf("C[%d]=%d\n", i, C[i]);

    cudaFree(devPtrA);
    cudaFree(devPtrB);
    cudaFree(devPtrC);

    return 0;
}

./bi_thread_block output:

A[0]=0

A[1]=1

A[2]=2

A[3]=3

A[4]=4

A[5]=5

A[6]=6

A[7]=7

A[8]=8

A[9]=9

B[0]=0

B[1]=0

B[2]=0

B[3]=0

B[4]=0

B[5]=0

B[6]=0

B[7]=0

B[8]=0

B[9]=0

C[0]=0

C[1]=1

C[2]=2

C[3]=3

C[4]=4

C[5]=5

C[6]=6

C[7]=7

C[8]=8

C[9]=9

threadIdx.y and index_y seem to always be 0.

I think in this case the problem is the way you submit the kernel. The code in the kernel is correct, but you have to submit the kernel in a different way.

You use add<<<5, 2>>>(devPtrA, devPtrB, devPtrC); This means that you are using a 1D grid and 1D blocks. Instead, you can define:

dim3 blocks=dim3(bx,by,bz),threads=dim3(tx,ty,tz);

now use this add<<<blocks,threads>>>(devPtrA, devPtrB, devPtrC);

I think it is very good that you are trying to understand this. It is very important and can save lots of trouble in the future if it is done properly from the beginning.

Ok, I see… I’ll look through the way of submission more carefully.
Thanks again :)

Just to complete: using <<<5,2>>> is equivalent to <<<dim3(5,1,1),dim3(2,1,1)>>>