cudaMemcpy2D help

#include <stdio.h>

#include <cuda.h>

__global__ void test(int *p, size_t pitch){

	*((char *)p + threadIdx.x * pitch + threadIdx.y)=123;

}

main(){

	int *p, p_h[5][5], i, j;

	size_t pitch;

	dim3 block(5,5);

	cudaMallocPitch((void**) &p, &pitch, 5*sizeof(int), 5);

	test<<<1,block>>>(p, pitch);

	cudaMemcpy2D(p_h,5*sizeof(int),p,pitch,5*sizeof(int),5,cudaMemcpyDeviceToHost);

	

	for(i=0;i<5;i++){

  for(j=0;j<5;j++){

  	printf("%d ", p_h[i][j]);

  }

  printf("\n");

	}

	cudaFree(p);

}

I wanted to practice using 2D arrays in CUDA, so I wrote this simple program. Everything seems to work except the cudaMemcpy2D. It’s almost there - some of the values of p_h get set correctly, but others are just garbage.

Also: Why does p need to be cast as a char * when I am doing the pointer arithmetic in the kernel?

Thanks.

Indexing seems to be wrong.

Try this kernel

__global__ void test(int *p, size_t pitch)

{

	int iy = blockDim.y * blockIdx.y + threadIdx.y;

	int ix = blockDim.x * blockIdx.x + threadIdx.x;

	if (iy >= 5 || ix >= 5)

  return;

	int *q = (int*)((char*)p + iy * pitch) + ix;

	*q = 123;

}

Thank you very much. That solved it.

In fact if I just change

*((char *)p + threadIdx.x * pitch + threadIdx.y)=123;

to

*((int *)((char *)p + threadIdx.x * pitch) + threadIdx.y)=123;

it works.

I know that is how the programming guide demonstrates using pitch, but as you discovered it is messy and prone to errors. Pointer casting to different pointer types is best avoided.

I always divide the pitch by the element size (call it el_pitch) and index 2D memory like so:

p[iy * el_pitch + ix]

where ix,iy is the coordinates in the 2D array you are accessing.

Please help me. I am also doing a similar problem.

I have 16 elements arranged in a 2D matrix at host with values <0,1,2,3…15>. Next, my blockSize = 4 so number of blocks = 4.

I have arranged in a 2 by 2 grid with each element of size 4*1 as a vector.

My kernel function:

__global__ void Sum(float *d_a, float* d_b, float *result, int blockSize, size_t pitch, int N)

{

  int i=0, j=0;

  float *row;

  //int idx = threadIdx.x + blockIdx.x * blockDim.x;

  //int idy = threadIdx.y + blockIdx.y * blockDim.y;

for(i = 0; i < 2; i++)

  {

	 row = (float*)((char*)d_a + i * pitch );

	 for(j = 0; j < 2; j++)

		result[i*pitch + j] = row[j]; //result[j] = row[j];

  }

//  result[idy * pitch + idx] = d_a[idy * pitch + idx];

}

This is my main function

int main(void)

{

  float** a_h = NULL, **b_h = NULL;

//float* a_h = NULL, *b_h = NULL;

  float *a_d, *r_d, *b_d;

int N = 16, i = 0 , j = 0;

  int blockSize = 4;

  int NumBlocks = (N%blockSize)? (N/blockSize + N%blockSize) : N/blockSize;

  int m = (NumBlocks%2)? (NumBlocks + 1) : NumBlocks/2;

dim3 dimGrid(2, m);

 dim3 dimBlock(blockSize, 1); 

 // dim3 dimGrid(1, 1, 1);

 // dim3 dimBlock(blockSize, N/blockSize); 

int M = NumBlocks * blockSize;

  int size = M * sizeof(float);

  int shared_mem = blockSize * sizeof(float);

  size_t pitch;

  cudaError_t Error;

/*a_h = (float*)malloc(size);

  memset(a_h , 0 , size);

b_h = (float*)malloc(size);

  memset(b_h , 0 , size);*/

//allocate memories and generate data.

  a_h = (float**)malloc(sizeof(float*) * NumBlocks);

  for(i = 0; i < NumBlocks; i++)

  {

	 a_h[i] = (float*)malloc(blockSize * sizeof(float) );

	 memset(a_h[i] , 0 , blockSize * sizeof(float));

  }

b_h = (float**)malloc(sizeof(float*) * NumBlocks);

  for(i = 0; i < NumBlocks; i++)

  {

	 b_h[i] = (float*)malloc(blockSize * sizeof(float) );

	 memset(b_h[i] , 0 , blockSize * sizeof(float));

  }

//produce data

  for( i = 0; i < NumBlocks; i++)

  {

	for( j = 0; j < blockSize; j++)	

	{

		if( (i*blockSize + j) < N)

			//a_h[i*NumBlocks + j] = (float)(i*blockSize + j);

			a_h[i][j] = (float)(i*blockSize + j);

		//printf("a_h[%d][%d] = %f\n",i , j , a_h[i*NumBlocks +j ]);

		printf("a_h[%d][%d] = %f\n",i , j , a_h[i][j]);

	}

}

Error = cudaMallocPitch( (void**)&r_d , &pitch , blockSize * sizeof(float) , NumBlocks);

  Error = cudaMalloc( (void**)&b_d ,blockSize * sizeof(float) );

  Error = cudaMallocPitch( (void**)&a_d , &pitch , blockSize * sizeof(float) , NumBlocks);

Error = cudaMemcpy2D(a_d, pitch, a_h , blockSize * sizeof(float) , blockSize * sizeof(float), NumBlocks , cudaMemcpyHostToDevice);

//call cuda function

  Sum<<<dimGrid, dimBlock, shared_mem>>>(a_d, b_d, r_d, blockSize, pitch, N);

//copy back

  Error = cudaMemcpy2D(b_h, pitch, r_d , pitch, blockSize, NumBlocks , cudaMemcpyDeviceToHost);

  printf("\n\n");

  for( i = 0; i < NumBlocks; i++)

  {

	for( j = 0; j < blockSize; j++)	

		printf("b_h[%d][%d] = %f\n",i , j , b_h[i][j]);

		//printf("b_h[%d][%d] = %f\n",i , j , b_h[i*NumBlocks + j]);

	

  }

//free memories

  cudaFree(r_d);

  cudaFree(a_d);

for(i = 0; i < NumBlocks; i++)

  {

	free(a_h[i]);

	a_h[i] = NULL;

	free(b_h[i]);

	b_h[i] = NULL;

  }

/*free(a_h);

  a_h = NULL;

free(b_h);

  b_h = NULL;*/

}//end main()

Observations:

I am expecting output to be <0,1,2,3,4,5,6,7,…15> but instead getting output as <0,1,2,3,0,0…0> that is after 1st block it is giving trailing zeros.

This implies that with the data was not copied correctly or there is some problem in retrieving data from device array a_d .

In my kernel function, the commented statements that you see was after referring to posts in this thread.

Second observation is that if I take N = 128 or 256 and blockSize as 32, it works but then crashes when I am about free the host array b_h at the end of the program. Any inputs will help me.

I have tried a lot but not succeeded.