I have this program
int main()
{
short d_out;
cudaMalloc((void*)&d_out, sizeof(short)644);
dim3 dimBlock(8,4,1);
dim3 dimGrid(1,8,1);
readTexels<<<dimGrid,dimBlock>>>(d_out);
short* h_out=(short*)malloc(sizeof(short)644);
cudaMemcpy(h_out, d_out, sizeof(short)644, cudaMemcpyDeviceToHost);
for (int i=0;i<4;i++)
{
for(int j=0;j<64;j++)
{
printf(“%d %d\n”,j+64i,h_out[j+64i]);
}
printf(“\n”);
}
return 0;
}
and my kernel as:-
global void readTexels(short* d_out)
{
unsigned int i = blockIdx.xblockDim.x + threadIdx.x;
unsigned int j = blockIdx.yblockDim.y + threadIdx.y;
int index=i+j*4;
d_out[index]=-27;
}
i want is a 8 blocks with 8 rows and 4 cols each. so that the total count of thread is 256.
But i am getting only 76 threads as it seems only upto d_out[75], value -27 is assigned.
I think i have done somethign wrong while declaring my dimgrid and dimblock dimension.
help me correct this…
thank you