dimGrid and dimBlock Confusion

I have this program

// Host driver: allocates a 64*4-element device buffer of shorts, launches
// readTexels on it, copies the buffer back and prints every element.
int main()
{
    short* d_out;
    cudaMalloc((void**)&d_out, sizeof(short)*64*4);

    // Launch configuration as posted: 8x4 threads per block, 1x8 blocks.
    dim3 dimBlock(8,4,1);
    dim3 dimGrid(1,8,1);
    readTexels<<<dimGrid,dimBlock>>>(d_out);

    short* h_out=(short*)malloc(sizeof(short)*64*4);
    // Blocking device-to-host copy of the whole buffer.
    cudaMemcpy(h_out, d_out, sizeof(short)*64*4, cudaMemcpyDeviceToHost);

    // Print 4 groups of 64 entries, one "<flat index> <value>" pair per line.
    for (int i=0;i<4;i++)
    {
        for(int j=0;j<64;j++)
        {
            printf("%d %d\n",j+64*i,h_out[j+64*i]);
        }
        printf("\n");
    }
    return 0;
}

and my kernel is:

// Kernel: each thread derives its global (i, j) position from the 2-D
// block/thread indices and stores -27 at d_out[i + j*4].
__global__ void readTexels(short* d_out)
{
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
    int index=i+j*4;
    d_out[index]=-27;
}

What I want is 8 blocks with 8 rows and 4 columns each, so that the total thread count is 256.
But I seem to be getting only 76 threads, since the value -27 is assigned only up to d_out[75].

I think I have done something wrong while declaring my dimGrid and dimBlock dimensions.

help me correct this…

thank you

According to your CPU code:

[codebox]for (int i=0;i<4;i++)   // i: group index, 0..3

{

for(int j=0;j<64;j++)   // j: position inside the group, 0..63

{

	printf("%d %d\n",j+64*i,h_out[j+64*i]);   // flat index j+64*i, then the value stored there

} 

printf("\n");   // blank line between groups

}[/codebox]

I suppose that you want 2-D data with

number of rows = 4 and

number of columns = 64.

In that case you should use the following execution configuration:

[codebox] dim3 dimBlock(8,4,1);   // 8 threads in x, 4 in y -> 32 threads per block

dim3 dimGrid(8,1,1);   // 8 blocks in x, 1 in y -> 64 x 4 = 256 threads in total

readTexels<<<dimGrid,dimBlock>>>(d_out); [/codebox]

and your kernel should be corrected as

[codebox]__global__ void readTexels(short* d_out)

{

// Kernel: each thread writes -27 to its own element of a row-major 2-D array.

// Global x position of this thread -> column.

unsigned int colIndex = blockIdx.x*blockDim.x + threadIdx.x;

// Global y position of this thread -> row.

unsigned int rowIndex = blockIdx.y*blockDim.y + threadIdx.y;

// row-major index; 64 is the row length (number of columns), which must

// match the launch: gridDim.x * blockDim.x = 8 * 8 = 64

int index = rowIndex * 64 + colIndex ;

d_out[index] = -27 ;

}[/codebox]