cudaMemcpy2D help

BeachHut · July 9, 2008, 2:36pm

#include <stdio.h>

#include <cuda.h>

// Kernel: each thread stores the value 123 at byte offset
// threadIdx.x * pitch + threadIdx.y from p.
//   p     - base pointer of the buffer to write into.
//   pitch - multiplied by threadIdx.x as a byte stride (p is cast to char*
//           before the arithmetic).
// NOTE(review): the dereferenced expression has type char*, so each thread
// writes a single byte, and threadIdx.y advances by bytes rather than by int
// elements. With the 5x5 block used by the caller, only the first 5 bytes of
// each row are written instead of 5 ints. To address int element (row, col)
// the row offset must be applied in bytes and the column offset in ints:
//   *((int *)((char *)p + row * pitch) + col)
__global__ void test(int *p, size_t pitch){

	// The char* cast makes "+ threadIdx.x * pitch" a byte offset; the store
	// below is therefore also a one-byte (char) store.
	*((char *)p + threadIdx.x * pitch + threadIdx.y)=123;

}

// Reports a failed CUDA runtime call on stderr.
// Returns 0 when err is cudaSuccess, 1 otherwise, so callers can bail out.
static int cudaFailed(cudaError_t err, const char *what){
	if (err == cudaSuccess)
		return 0;
	fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
	return 1;
}

// Allocates a pitched 5x5 int array on the device, runs `test` on a single
// 5x5 thread block, copies the array back into p_h with cudaMemcpy2D and
// prints it row by row.
// Returns 0 on success and 1 if any CUDA call reports an error.
int main(){
	int *p = NULL, p_h[5][5], i, j;
	size_t pitch;
	dim3 block(5,5);

	// pitch receives the device row stride in bytes chosen by the runtime.
	if (cudaFailed(cudaMallocPitch((void**) &p, &pitch, 5*sizeof(int), 5), "cudaMallocPitch"))
		return 1;

	test<<<1,block>>>(p, pitch);
	// A kernel launch returns no status; launch errors are reported by
	// cudaGetLastError().
	if (cudaFailed(cudaGetLastError(), "test kernel launch")){
		cudaFree(p);
		return 1;
	}

	// Host rows are tightly packed (5*sizeof(int) bytes apart); device rows
	// are pitch bytes apart. This blocking copy also reports errors raised
	// while the kernel was executing.
	if (cudaFailed(cudaMemcpy2D(p_h,5*sizeof(int),p,pitch,5*sizeof(int),5,cudaMemcpyDeviceToHost), "cudaMemcpy2D")){
		cudaFree(p);
		return 1;
	}

	for(i=0;i<5;i++){
		for(j=0;j<5;j++){
			printf("%d ", p_h[i][j]);
		}
		printf("\n");
	}

	if (cudaFailed(cudaFree(p), "cudaFree"))
		return 1;

	return 0;
}

I wanted to practice using 2D arrays in CUDA, so I wrote this simple program. Everything seems to work except the cudaMemcpy2D. It’s almost there - some of the values of p_h get set correctly, but others are just garbage.

Also: Why does p need to be cast as a char * when I am doing the pointer arithmetic in the kernel?

Thanks.

Sibi_A · July 11, 2008, 9:52am

Indexing seems to be wrong.

Try this kernel

// Stores 123 into every element of a 5x5 int array held in pitched memory.
//   p     - base pointer of the pitched allocation.
//   pitch - distance between consecutive rows, in bytes.
// Threads whose global (x, y) coordinate lies outside the 5x5 extent write
// nothing.
__global__ void test(int *p, size_t pitch)
{
	const int row = blockDim.y * blockIdx.y + threadIdx.y;
	const int col = blockDim.x * blockIdx.x + threadIdx.x;

	if (row < 5 && col < 5)
	{
		// Advance to the start of the row in bytes, then index ints within it.
		char *rowStart = (char*)p + row * pitch;
		((int*)rowStart)[col] = 123;
	}
}

BeachHut · July 17, 2008, 8:44am

Thank you very much. That solved it.

In fact if I just change

*((char *)p + threadIdx.x * pitch + threadIdx.y)=123;

to

*((int *)((char *)p + threadIdx.x * pitch) + threadIdx.y)=123;

it works.

MisterAnderson42 · July 17, 2008, 12:19pm

I know that is how the programming guide demonstrates using pitch, but as you discovered it is messy and prone to errors. Pointer casting to different pointer types is best avoided.

I always divide the pitch by the element size (call it el_pitch) and index 2D memory like so:

p[iy * el_pitch + ix]

where ix,iy is the coordinates in the 2D array you are accessing.

cirus · July 28, 2009, 5:35pm

Please help me. I am also doing a similar problem.

I have 16 elements arranged in a 2D matrix at host with values <0,1,2,3…15>. Next, my blockSize = 4 so number of blocks = 4.

I have arranged the blocks in a 2-by-2 grid, with each block being a 4×1 vector of threads.

My kernel function:

// Kernel: copies the elements (i, j) with i, j in [0, 2) from d_a into result.
//   d_a       - source array; row i is located at byte offset i * pitch.
//   d_b       - not referenced in the body.
//   result    - destination array.
//   blockSize - not referenced in the body.
//   pitch     - row stride; applied in bytes to d_a (via the char* cast) but
//               as a float element count to result (see note below).
//   N         - not referenced in the body.
// NOTE(review): the thread-index computations are commented out, so every
// launched thread executes the same loops and writes the same locations.
// NOTE(review): the loop bounds are the constant 2, so only a 2x2 corner is
// copied regardless of blockSize or N.
__global__ void Sum(float *d_a, float* d_b, float *result, int blockSize, size_t pitch, int N)

{

  int i=0, j=0;

  float *row;

  //int idx = threadIdx.x + blockIdx.x * blockDim.x;

  //int idy = threadIdx.y + blockIdx.y * blockDim.y;

for(i = 0; i < 2; i++)

  {

	 // Row start of d_a: offset applied in bytes, then viewed as float*.
	 row = (float*)((char*)d_a + i * pitch );

	 for(j = 0; j < 2; j++)

		// NOTE(review): result is a float*, so i*pitch here counts floats,
		// not bytes. The host code posted with this kernel passes the byte
		// pitch returned by cudaMallocPitch, which would place row 1 at
		// pitch*sizeof(float) bytes instead of pitch bytes. The form that
		// matches the d_a access above would be
		//   ((float*)((char*)result + i * pitch))[j] = row[j];
		result[i*pitch + j] = row[j]; //result[j] = row[j];

  }

//  result[idy * pitch + idx] = d_a[idy * pitch + idx];

}

This is my main function

// Host driver: builds a NumBlocks x blockSize host matrix a_h filled with
// 0..N-1, allocates pitched device arrays a_d and r_d, copies a_h to a_d,
// launches Sum, copies r_d back into b_h and prints it.
// NOTE(review): every CUDA call stores its status in Error, but Error is
// never inspected, so failures go unnoticed.
int main(void)

{

  // a_h / b_h are tables of row pointers; each row is a separate malloc
  // (see the allocation loops below), so the rows are not contiguous.
  float** a_h = NULL, **b_h = NULL;

//float* a_h = NULL, *b_h = NULL;

  float *a_d, *r_d, *b_d;

int N = 16, i = 0 , j = 0;

  int blockSize = 4;

  // NOTE(review): when N is not a multiple of blockSize this adds the
  // remainder (N%blockSize) rather than 1; the usual round-up is
  // (N + blockSize - 1) / blockSize. For N = 16, blockSize = 4 it yields 4.
  int NumBlocks = (N%blockSize)? (N/blockSize + N%blockSize) : N/blockSize;

  // NOTE(review): the odd branch gives NumBlocks + 1 while the even branch
  // gives NumBlocks / 2 -- confirm the odd case was meant to be halved too.
  int m = (NumBlocks%2)? (NumBlocks + 1) : NumBlocks/2;

dim3 dimGrid(2, m);

 dim3 dimBlock(blockSize, 1); 

 // dim3 dimGrid(1, 1, 1);

 // dim3 dimBlock(blockSize, N/blockSize); 

int M = NumBlocks * blockSize;

  // size is only used by the commented-out allocation below.
  int size = M * sizeof(float);

  // Passed as the dynamic shared-memory size of the launch.
  // NOTE(review): the Sum kernel posted with this code declares no
  // shared-memory array, so this appears to be unused by the kernel.
  int shared_mem = blockSize * sizeof(float);

  size_t pitch;

  cudaError_t Error;

/*a_h = (float*)malloc(size);

  memset(a_h , 0 , size);

b_h = (float*)malloc(size);

  memset(b_h , 0 , size);*/

//allocate memories and generate data.

  a_h = (float**)malloc(sizeof(float*) * NumBlocks);

  for(i = 0; i < NumBlocks; i++)

  {

	 a_h[i] = (float*)malloc(blockSize * sizeof(float) );

	 memset(a_h[i] , 0 , blockSize * sizeof(float));

  }

b_h = (float**)malloc(sizeof(float*) * NumBlocks);

  for(i = 0; i < NumBlocks; i++)

  {

	 b_h[i] = (float*)malloc(blockSize * sizeof(float) );

	 memset(b_h[i] , 0 , blockSize * sizeof(float));

  }

//produce data

  for( i = 0; i < NumBlocks; i++)

  {

	for( j = 0; j < blockSize; j++)	

	{

		// Only the first N linear positions receive a value; the rest keep
		// the 0 written by memset.
		if( (i*blockSize + j) < N)

			//a_h[i*NumBlocks + j] = (float)(i*blockSize + j);

			a_h[i][j] = (float)(i*blockSize + j);

		//printf("a_h[%d][%d] = %f\n",i , j , a_h[i*NumBlocks +j ]);

		printf("a_h[%d][%d] = %f\n",i , j , a_h[i][j]);

	}

}

// pitch is written here and again by the second cudaMallocPitch below; both
// requests use the same width and height.
// NOTE(review): presumably both calls return the same pitch -- confirm, since
// the single variable is later used for both r_d and a_d.
Error = cudaMallocPitch( (void**)&r_d , &pitch , blockSize * sizeof(float) , NumBlocks);

  // NOTE(review): b_d is passed to the kernel but never copied to, read back
  // or released with cudaFree in this function.
  Error = cudaMalloc( (void**)&b_d ,blockSize * sizeof(float) );

  Error = cudaMallocPitch( (void**)&a_d , &pitch , blockSize * sizeof(float) , NumBlocks);

// NOTE(review): cudaMemcpy2D reads NumBlocks rows of blockSize*sizeof(float)
// bytes, spaced blockSize*sizeof(float) bytes apart, starting at the address
// a_h. a_h is the table of row pointers, not the float data, so the bytes
// copied are pointer values (and whatever follows the table), not
// a_h[i][j]. A single contiguous host buffer of
// NumBlocks*blockSize floats (as in the commented-out malloc above) is what
// this call expects.
Error = cudaMemcpy2D(a_d, pitch, a_h , blockSize * sizeof(float) , blockSize * sizeof(float), NumBlocks , cudaMemcpyHostToDevice);

//call cuda function

  Sum<<<dimGrid, dimBlock, shared_mem>>>(a_d, b_d, r_d, blockSize, pitch, N);

//copy back

  // NOTE(review): three problems in this call:
  //  - the destination b_h is again the row-pointer table, not float storage;
  //  - the destination pitch is the device pitch, so row k is written at
  //    (char*)b_h + k*pitch, while the table itself is only
  //    NumBlocks*sizeof(float*) bytes long;
  //  - the width argument is in bytes, so blockSize here means blockSize
  //    bytes, not blockSize floats (blockSize * sizeof(float)).
  // Writing over and past the pointer table would corrupt the row pointers
  // and the heap, which plausibly explains the reported crash when freeing
  // b_h -- confirm.
  Error = cudaMemcpy2D(b_h, pitch, r_d , pitch, blockSize, NumBlocks , cudaMemcpyDeviceToHost);

  printf("\n\n");

  for( i = 0; i < NumBlocks; i++)

  {

	for( j = 0; j < blockSize; j++)	

		printf("b_h[%d][%d] = %f\n",i , j , b_h[i][j]);

		//printf("b_h[%d][%d] = %f\n",i , j , b_h[i*NumBlocks + j]);

	

  }

//free memories

  cudaFree(r_d);

  cudaFree(a_d);

for(i = 0; i < NumBlocks; i++)

  {

	free(a_h[i]);

	a_h[i] = NULL;

	free(b_h[i]);

	b_h[i] = NULL;

  }

// NOTE(review): with the block below commented out, the two pointer tables
// a_h and b_h are never freed.
/*free(a_h);

  a_h = NULL;

free(b_h);

  b_h = NULL;*/

}//end main()

Observations:

I am expecting the output to be <0,1,2,3,4,5,6,7,…15>, but instead I am getting <0,1,2,3,0,0…0>; that is, after the first block it gives only zeros.

This implies that either the data was not copied correctly or there is some problem in retrieving data from the device array a_d.

In my kernel function, the commented-out statements that you see were added after referring to posts in this thread.

The second observation is that if I take N = 128 or 256 and blockSize = 32, it works, but then it crashes when I am about to free the host array b_h at the end of the program. Any input will help me.

I have tried a lot but not succeeded.

Topic		Replies	Views
Can't get copyDeviceToHost to work with cudaMemcpy2D CUDA Programming and Performance	0	3657	November 13, 2009
test on 'cudaMallocPitch' and 'cudaMemcpy2D' CUDA Programming and Performance	1	617	November 16, 2010
cudaMemcpy2D error CUDA Programming and Performance	1	1164	November 11, 2009
problem with cudaMallocPitch and cudaMemcpy2D CUDA Programming and Performance	5	6421	April 22, 2009
CudaMallocPitch and CudaMemcpy2D CUDA Programming and Performance	7	5805	August 3, 2015
Using cudaMemcpy2D very strange CUDA Programming and Performance	2	1412	March 10, 2009
trouble with cudaMemcpy2D I cant get a matrix to copy into 2D pitched memory CUDA Programming and Performance	1	963	July 13, 2009
need help for cudaMemcpy2D() CUDA Programming and Performance	5	4653	December 8, 2009
2D array & Memory space Mostly about cudaMallocPitch & cudaMemcpy2D CUDA Programming and Performance	1	1533	October 15, 2009
Copying 2D array from host to device CUDA Programming and Performance	7	7381	July 27, 2010

cudaMemcpy2D help

Related topics