I am facing a little trouble in accessing elements of my 2D array in GPU.
Situation:
I am referring to following code from the official guide , but with my additions
//Host Code
float *d_src , *d2_src;
size_t pitch;
dim3 dimGrid(2,2); //2D array of blocks.
dim3 dimBlock(4,1); //Every block has 4 threads. In way I have 2 by 2 grid with each element as vector of length 4 float elements
N = 16;
width = blockSize = 4;
height = N/blockSize; //= 4 blocks
float **h2_src = NULL, h_src = NULL;
//Host pointer mem alloc
h_src = (float**)malloc(sizeof(float*) * height);
h2_src = (float**)malloc(sizeof(float*) * height);
for(int i = 0; i < height; i++)
{
h_src[i] = (float)malloc(sizeof(float) * width);
memset( h_src[i] , 0 , width * sizeof(float) );
h2_src[i] = (float)malloc(sizeof(float) * width);
memset( h2_src[i] , 0 , width * sizeof(float) );
}
//Generate data. input is a 2D array of form h_src[4][4];
for(int i = 0; i < height; i++)
{
for(int j = 0; j < width; j++)
{
h_src[i*width + j] = i * width + j;
}
}
//input[][] = { 0 , 1, 2, .... 15 }
//Memory Alloc at device
cudaMallocPitch( (void**) &d_src , &pitch , width * sizeof(float) , height); //At debugging , the value of pitch is 64.
cudaMallocPitch( (void**) &d2_src , &pitch , width * sizeof(float) , height); //At debugging , the value of pitch is 64.
cudaMemcpy2D( d_src, pitch , h_src , pitch , width * sizeof(float) , height , cudaMemcpyHostToDevice );
//GPU call
myKernel<<<dimGrid , dimBlock>>>(input , output, pitch, numBlocks, N , blockSize);
cudaMemcpy2D( h2_src, pitch , d2_src , pitch , width * sizeof(float) , height , cudaMemcpyHostToDevice );
//display all elements copied back from d2_src to h2_src
My kernel function
__global__ void myKernel(float* d_src , float* d2_src , size_t pitch , int N , int blockSize)
{
float *row;
for(int j = 0; j < 2; j++) // Here 2 is my grid height that is 2 by 2.
{
row = (float*)( (char*)d_src + j * pitch );
for(int i = 0; i < 2; i++)
{
d2_src[j*pitch + i] = row[i];
}
}
}
The result I am getting in h2_src is 0,1,2,3, 0…0 [trailing zeros].
I expected to get the original sequence viz <0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>
What I guess after reading the pitch value as 64 is that it is taking 64/4 = 16 elements in a row.
Q1. Does this mean that CUDA is taking my 2 by 2 grid as a 1 by 4 grid ??? i.e elements 0…15 of the d_src is being treated as linear array? ?
Q2. Where else the problem can be? Is it in copying my host allocated source array to device allocated source array??
I am learning CUDA so need little help. Thanks.