I wrote a simple script that places a 2D array of numbers in device and then retrieves them.
I pass an input grid of dimensions H * W.
For H*W elements i.e when H = 3 , W = 4 , the input is
a_h[0][0] = 0;
a_h[0][1] = 1;
.
.
.
a_h[2][3] = 11;
A strange behavior is observed.
-
If H = 4 and W = 3 , there is a crash when I am about to display the retrieved data. The crash occurs at b_h[3][0]. Incomplete data is displayed correctly till b_h[2][2]
-
Where as if H = 3 and W = 4 , the program executes successfully and correct result is displayed.
-
When I allocate device memory, for H by W matrix, I am getting pitch value as 64.
-
For square matrices viz: H = W , the program works fine.
-
for cases like H=4, W =3 ad H = 6 , W = 7 the program prints incomplete results and crashes. I tried to change parameters of cudaMemcpy2D(), where
//Error = cudaMemcpy2D(b_h, W * sizeof(float) , r_d , pitch_r , W * sizeof(float) , H , cudaMemcpyDeviceToHost );
Error = cudaMemcpy2D(b_h, pitch_r , r_d , pitch_r , pitch_r , H , cudaMemcpyDeviceToHost );
if I un-comment the first one and comment the second , I get incomplete but correct result, resulting in crash at b_h[3][0].
If I use second statement instead of first , my program does not crash but gives correct result till b_h[2][2] and then all zeros till b_h[3][2].
Q1. Can you help me where is the flaw?
For simplicity I am using 1D grid of blocks. Earlier I used 2D grid of blocks but was not getting correct results.
Kernel Code
__global__ void Sum(float *d_a, float *d_c, float* result, float* sum, int blockSize, size_t pitch, int N, int H, int W)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int idy = threadIdx.y + blockIdx.y * blockDim.y;
//int byt = pitch / sizeof(float);
if(idx < N)
result[idy * pitch + idx] = d_a[idy * pitch + idx];
}//end of Sum()
Main function
int main(void)
{
float** a_h = NULL, **b_h = NULL;
float *a_d, *r_d, *b_d , *c_d, *sum_d;
int *h_summation = NULL;
//Error
cudaError_t Error;
//counters
int i = 0 , j = 0;
//Let us have a H by W matrix
int H = 3 , W = 2;
int blockSize = 4;
int totalElements = H * W;
int numBlocks = (! (totalElements % blockSize) )? (totalElements / blockSize): 1 + (totalElements / blockSize);
size_t pitch_a, pitch_c , pitch_r , pitch_sum;
dim3 dimGrid(numBlocks, 1);
dim3 dimBlock(blockSize,1);
// int shared_mem = W * sizeof(float);
//allocate memory
a_h = (float**)malloc(H * sizeof(float*) );
b_h = (float**)malloc(H * sizeof(float*) );
for( i = 0; i < H; i++)
{
a_h[i] = (float*)malloc(sizeof(float) * W);
memset(a_h[i] , 0 , W * sizeof(float) );
b_h[i] = (float*)malloc(sizeof(float) * W);
memset(b_h[i] , 0 , W * sizeof(float) );
}
h_summation = (int*)malloc(sizeof(float) * W);
memset(h_summation , 0 , sizeof(float) * W);
Error = cudaMallocPitch( (void**)& a_d , &pitch_a, W * sizeof(float) , H);
Error = cudaMallocPitch( (void**)& r_d , &pitch_r, W * sizeof(float) , H);
Error = cudaMallocPitch( (void**)& c_d , &pitch_c, W * sizeof(float) , H);
Error = cudaMalloc( (void**)& sum_d, W * sizeof(int) );
//generate data
for( i = 0; i < H; i++)
{
for( j = 0; j < W; j++)
a_h[i][j] = a_h[i][j] = (float)(i*W + j);
}
//copy source data to destination at device
Error = cudaMemcpy2D(a_d , pitch_a, a_h, W * sizeof(float) , W * sizeof(float) , H , cudaMemcpyHostToDevice );
//call kernel to retrieve data
Sum<<<dimGrid, dimBlock>>>(a_d, c_d, r_d, sum_d, blockSize, pitch_a, totalElements, H , W);
//retrieve data from device
//Error = cudaMemcpy2D(b_h, W * sizeof(float) , r_d , pitch_r , W * sizeof(float) , H , cudaMemcpyDeviceToHost );
Error = cudaMemcpy2D(b_h, pitch_r , r_d , pitch_r , pitch_r , H , cudaMemcpyDeviceToHost );
//display data -- Original data
for( i = 0; i < H; i++)
{
for( j = 0; j < W; j++)
printf("a_h[%d][%d] = %f\n", i , j , a_h[i][j]);
}
printf("\n\n");
//retrieved data
for( i = 0; i < H; i++)
{
for( j = 0; j < W; j++)
printf("b_h[%d][%d] = %f\n", i , j , b_h[i][j]);
}
printf("\n\n");
//print sum per block
for( j = 0; j < W; j++)
printf("h_summation[%d] = %d\n", j , h_summation[j]);
//free memory
cudaFree(a_d);
cudaFree(r_d);
cudaFree(sum_d);
for( i = 0; i < H; i++)
{
free(a_h[i]);
a_h[i] = NULL;
free(b_h[i]);
b_h[i] = NULL;
}
}//end main()