Please help me. I am also doing a similar problem.

I have 16 elements arranged in a 2D matrix at host with values <0,1,2,3…15>. Next, my blockSize = 4 so number of blocks = 4.

I have arranged in a 2 by 2 grid with each element of size 4*1 as a vector.

My kernel function:

```
__global__ void Sum(float *d_a, float* d_b, float *result, int blockSize, size_t pitch, int N)
{
int i=0, j=0;
float *row;
//int idx = threadIdx.x + blockIdx.x * blockDim.x;
//int idy = threadIdx.y + blockIdx.y * blockDim.y;
for(i = 0; i < 2; i++)
{
row = (float*)((char*)d_a + i * pitch );
for(j = 0; j < 2; j++)
result[i*pitch + j] = row[j]; //result[j] = row[j];
}
// result[idy * pitch + idx] = d_a[idy * pitch + idx];
}
```

This is my main function

```
int main(void)
{
float** a_h = NULL, **b_h = NULL;
//float* a_h = NULL, *b_h = NULL;
float *a_d, *r_d, *b_d;
int N = 16, i = 0 , j = 0;
int blockSize = 4;
int NumBlocks = (N%blockSize)? (N/blockSize + N%blockSize) : N/blockSize;
int m = (NumBlocks%2)? (NumBlocks + 1) : NumBlocks/2;
dim3 dimGrid(2, m);
dim3 dimBlock(blockSize, 1);
// dim3 dimGrid(1, 1, 1);
// dim3 dimBlock(blockSize, N/blockSize);
int M = NumBlocks * blockSize;
int size = M * sizeof(float);
int shared_mem = blockSize * sizeof(float);
size_t pitch;
cudaError_t Error;
/*a_h = (float*)malloc(size);
memset(a_h , 0 , size);
b_h = (float*)malloc(size);
memset(b_h , 0 , size);*/
//allocate memories and generate data.
a_h = (float**)malloc(sizeof(float*) * NumBlocks);
for(i = 0; i < NumBlocks; i++)
{
a_h[i] = (float*)malloc(blockSize * sizeof(float) );
memset(a_h[i] , 0 , blockSize * sizeof(float));
}
b_h = (float**)malloc(sizeof(float*) * NumBlocks);
for(i = 0; i < NumBlocks; i++)
{
b_h[i] = (float*)malloc(blockSize * sizeof(float) );
memset(b_h[i] , 0 , blockSize * sizeof(float));
}
//produce data
for( i = 0; i < NumBlocks; i++)
{
for( j = 0; j < blockSize; j++)
{
if( (i*blockSize + j) < N)
//a_h[i*NumBlocks + j] = (float)(i*blockSize + j);
a_h[i][j] = (float)(i*blockSize + j);
//printf("a_h[%d][%d] = %f\n",i , j , a_h[i*NumBlocks +j ]);
printf("a_h[%d][%d] = %f\n",i , j , a_h[i][j]);
}
}
Error = cudaMallocPitch( (void**)&r_d , &pitch , blockSize * sizeof(float) , NumBlocks);
Error = cudaMalloc( (void**)&b_d ,blockSize * sizeof(float) );
Error = cudaMallocPitch( (void**)&a_d , &pitch , blockSize * sizeof(float) , NumBlocks);
Error = cudaMemcpy2D(a_d, pitch, a_h , blockSize * sizeof(float) , blockSize * sizeof(float), NumBlocks , cudaMemcpyHostToDevice);
//call cuda function
Sum<<<dimGrid, dimBlock, shared_mem>>>(a_d, b_d, r_d, blockSize, pitch, N);
//copy back
Error = cudaMemcpy2D(b_h, pitch, r_d , pitch, blockSize, NumBlocks , cudaMemcpyDeviceToHost);
printf("\n\n");
for( i = 0; i < NumBlocks; i++)
{
for( j = 0; j < blockSize; j++)
printf("b_h[%d][%d] = %f\n",i , j , b_h[i][j]);
//printf("b_h[%d][%d] = %f\n",i , j , b_h[i*NumBlocks + j]);
}
//free memories
cudaFree(r_d);
cudaFree(a_d);
for(i = 0; i < NumBlocks; i++)
{
free(a_h[i]);
a_h[i] = NULL;
free(b_h[i]);
b_h[i] = NULL;
}
/*free(a_h);
a_h = NULL;
free(b_h);
b_h = NULL;*/
}//end main()
```

Observations:

I am expecting output to be <0,1,2,3,4,5,6,7,…15> but instead getting output as <0,1,2,3,0,0…0> that is after 1st block it is giving trailing zeros.

This implies that with the data was not copied correctly or there is some problem in retrieving data from device array a_d .

In my kernel function, the commented statements that you see was after referring to posts in this thread.

Second observation is that if I take N = 128 or 256 and blockSize as 32, it works but then crashes when I am about free the host array b_h at the end of the program. Any inputs will help me.

I have tried a lot but not succeeded.