Is cudaMemcpy3D for allocating arrays for textures only?

Hi,

I have one doubt.

Is cudaMemcpy3D for allocating arrays for textures only?

Why can't it be used to allocate general-purpose global memory?

Suppose if I want to add two 3D arrays, how do I assign threads?

CUDA C call…

// Element-wise addition of two pitched 3D device buffers: c = a + b.
//
// Expects buffers allocated with cudaMalloc3D using `ext`, where
// ext.width is in BYTES (the cudaMalloc3D convention), so the element
// count per row is ext.width / sizeof(int). Each row must be addressed
// through its buffer's own pitch, because cudaMalloc3D may pad rows.
//
// Launch with any 1D grid/block configuration; the grid-stride loop
// covers all elements regardless of how many threads are launched
// (even a <<<1,1>>> debug launch is correct, just serial).
__global__ void array(cudaPitchedPtr c, cudaPitchedPtr a, cudaPitchedPtr b, cudaExtent ext)
{
    const size_t nx = ext.width / sizeof(int);   // elements per row (width is in bytes)
    const size_t ny = ext.height;                // rows per slice
    const size_t nz = ext.depth;                 // slices
    const size_t total = nx * ny * nz;

    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         idx < total; idx += stride)
    {
        // Decompose the flat index into (x, y, z), x fastest.
        const size_t z = idx / (nx * ny);
        const size_t y = (idx % (nx * ny)) / nx;
        const size_t x = idx % nx;

        // A pitched row starts at base + z*(pitch*height) + y*pitch bytes.
        const int *rowA = (const int *)((const char *)a.ptr + z * a.pitch * ny + y * a.pitch);
        const int *rowB = (const int *)((const char *)b.ptr + z * b.pitch * ny + y * b.pitch);
        int       *rowC = (int *)((char *)c.ptr + z * c.pitch * ny + y * c.pitch);

        rowC[x] = rowA[x] + rowB[x];
    }
}

// Fortran-callable host wrapper (trailing underscore, scalar arguments
// passed by pointer): allocates three pitched 3D device buffers with
// cudaMalloc3D, copies host arrays a and b to the device with
// cudaMemcpy3D, launches the addition kernel, and copies the result
// back into c.
// NOTE(review): the parameters are declared cudaArray* but are used
// below as plain linear host buffers (cast inside make_cudaPitchedPtr);
// presumably they should be int* — confirm against the Fortran caller.
extern "C" void ker_thre_(cudaArray *a,cudaArray *b,cudaArray *c,int *x, int *y, int *z)

{

// Dereference the Fortran pass-by-reference dimension arguments.
int NX,NY,NZ;

//	int i, j ,k;

NX=*x;

NY=*y;

NZ=*z;

// Total element count, used as the cuPrintf buffer size.
// NOTE(review): cudaPrintfInit presumably takes a size in bytes — this
// passes an element count; confirm the intended buffer size.
size_t buf=NX*NY*NZ;

cudaPrintfInit(buf);

cudaError_t status = cudaSuccess;

// Extent for cudaMalloc3D/cudaMemcpy3D on linear (non-cudaArray)
// memory: width is in BYTES, height in rows, depth in slices.
// NOTE(review): the kernel must divide ext.width by sizeof(int) to
// recover the element count — the posted kernel does not.
cudaExtent ext;

  ext.width=NX*sizeof(int);

  ext.height=NY;

  ext.depth=NZ;

// Pitched device buffer for a. cudaMalloc3D may pad each row, so
// d_a.pitch can exceed NX*sizeof(int) and must be honored when indexing.
cudaPitchedPtr d_a;

 status=cudaMalloc3D(&d_a,ext);

 if(status != cudaSuccess)

   {

   fprintf(stderr, "Malloc d_a : %s\n", cudaGetErrorString(status));

   }

// Pitched device buffer for b.
cudaPitchedPtr d_b;

 status=cudaMalloc3D(&d_b,ext);

 if(status != cudaSuccess)

   {

   fprintf(stderr, "Malloc d_b : %s\n", cudaGetErrorString(status));

   }

// Pitched device buffer for the result c.
cudaPitchedPtr d_c;

 status=cudaMalloc3D(&d_c,ext);

 if(status != cudaSuccess)

   {

   fprintf(stderr, "Malloc d_c : %s\n", cudaGetErrorString(status));

   }

//..... Copy Host A to Device .........//

// Zero-initialize the params struct; unset members must be 0/NULL.
cudaMemcpy3DParms aptr ={0};

// Source is the tightly-packed host buffer: pitch = NX*sizeof(int)
// bytes, xsize = NX elements, ysize = NY rows.
aptr.srcPtr= make_cudaPitchedPtr( (void*) a, NX*sizeof(int), NX, NY );

aptr.dstPtr= d_a;

aptr.extent= ext;

status=cudaMemcpy3D(&aptr);

if(status != cudaSuccess)

  {

  fprintf(stderr, "MemcpyHostToDevice : %s\n", cudaGetErrorString(status));

  }

//..... Copy Host B to Device .........//

cudaMemcpy3DParms bptr ={0};

bptr.srcPtr= make_cudaPitchedPtr( (void*) b, NX*sizeof(int), NX, NY );

bptr.dstPtr= d_b;

bptr.extent= ext;

status=cudaMemcpy3D(&bptr);

if(status != cudaSuccess)

  {

  // NOTE(review): "MemcpyHosToDevice" is missing a 't' in this message.
  fprintf(stderr, "MemcpyHosToDevice : %s\n", cudaGetErrorString(status));

  }

// Launch with a single thread in a single block — this runs serially.
// Argument order is (c, a, b) to match the kernel signature.
// NOTE(review): no cudaGetLastError()/cudaDeviceSynchronize() after the
// launch, so launch or kernel failures go unnoticed.
array<<< 1,1 >>>(d_c,d_a,d_b,ext);

//........ Copy Device to Host C.......//

cudaMemcpy3DParms cptr ={0};

cptr.srcPtr= d_c;

cptr.dstPtr= make_cudaPitchedPtr( (void*) c, NX*sizeof(int), NX, NY );

cptr.extent= ext;

// cudaMemcpy3D is synchronous here, so it also waits for the kernel.
status=cudaMemcpy3D(&cptr);

if(status != cudaSuccess)

  {

  fprintf(stderr, "MemcpyDeviceToHost: %s\n", cudaGetErrorString(status));

  }

/*

for(i=0;i<NX;i++) 

   {

   printf("\n"); 

   for(j=0;j<NY;j++) 

      {

      printf("\n");

      for(k=0;k<NZ;k++) 

         printf("%d\t", c[i+j*NX+k*NX*NY]);

      }

   }      

*/

printf("\n\n");

cudaPrintfEnd();

	

// Free the pitched allocations via their base pointers.
cudaFree(d_a.ptr);

cudaFree(d_b.ptr);

cudaFree(d_c.ptr);

// NOTE(review): the closing brace of ker_thre_ appears to be missing
// from this listing.

Any suggestions ??

thanks in advance.

Yes. The resultant CUDA array is an opaque object which cannot be manipulated in user code unless it is bound to a texture and the texture API is used. If you are working with 3D arrays, just use linear memory and index into it in either row- or column-major ordering.

column major ordering maps position(xindex,yindex,zindex) to memoryidx = xindex + (xdim * yindex) + (xdim * ydim *zindex)

row major ordering maps (xindex,yindex,zindex) to memoryidx = zindex + (zdim * yindex) + (zdim * ydim * xindex)

the column major order version should match how a FORTRAN multidimensional array is stored in memory.

Hi avidday,

Thank you for reply,

I agree with you, and I tried it. But performance-wise the CUDA C program takes more time (with a linear array).

// Element-wise addition of two linear 3D arrays stored x-fastest
// (column-major, matching the Fortran caller): c[i] = a[i] + b[i].
//
// The 3D shape is irrelevant to the arithmetic — the data is one big
// vector of x*y*z ints, so each thread handles one element. The
// grid-stride loop keeps the kernel correct for ANY launch
// configuration, including a <<<1,1>>> debug launch (serial but right).
__global__ void array(int *a, int *b, int *c, int x, int y, int z)
{
    const size_t n = (size_t)x * y * z;                     // total element count
    const size_t stride = (size_t)gridDim.x * blockDim.x;   // threads in the grid

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < n; i += stride)
    {
        c[i] = a[i] + b[i];
    }
}

/*
 * Fortran-callable host wrapper: adds two nx*ny*nz int arrays on the
 * GPU (c = a + b, linear memory, x-fastest ordering) and echoes a, b
 * and the result to stdout.
 *
 * Fixes relative to the original:
 *  - launch configuration: dim3 block(nx,ny) could exceed the 1024
 *    threads-per-block limit and dim3 grid(n/nx,n/ny) launched far more
 *    threads than elements; replaced with a 1D ceil-div configuration.
 *  - every CUDA call is now error-checked.
 *  - removed the unused dim3 grid_size local.
 */
extern "C" void ker_thre_(int *a,int *b,int *c,int *x, int *y, int *z)
{
    int *d_a, *d_b, *d_c;
    int i, j, k;

    /* Dereference the Fortran pass-by-reference dimension arguments. */
    const int nx = *x;
    const int ny = *y;
    const int nz = *z;
    const int n  = nx * ny * nz;
    const size_t bytes = sizeof(int) * (size_t)n;

    cudaError_t status;

    status = cudaMalloc((void **)&d_a, bytes);
    if (status != cudaSuccess)
        fprintf(stderr, "Malloc d_a : %s\n", cudaGetErrorString(status));
    status = cudaMalloc((void **)&d_b, bytes);
    if (status != cudaSuccess)
        fprintf(stderr, "Malloc d_b : %s\n", cudaGetErrorString(status));
    status = cudaMalloc((void **)&d_c, bytes);
    if (status != cudaSuccess)
        fprintf(stderr, "Malloc d_c : %s\n", cudaGetErrorString(status));

    /* Echo input A, slice by slice, in x-fastest order. */
    printf("C - A\n\n");
    for (k = 0; k < nz; k++)
    {
        printf("\n");
        for (j = 0; j < ny; j++)
        {
            printf("\n");
            for (i = 0; i < nx; i++)
                printf("\t%d", a[i + j * nx + k * nx * ny]);
        }
    }

    /* Echo input B. */
    printf("\nC - B\n\n");
    for (k = 0; k < nz; k++)
    {
        printf("\n");
        for (j = 0; j < ny; j++)
        {
            printf("\n");
            for (i = 0; i < nx; i++)
                printf("\t%d", b[i + j * nx + k * nx * ny]);
        }
    }

    status = cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
    if (status != cudaSuccess)
        fprintf(stderr, "Memcpy a HtoD : %s\n", cudaGetErrorString(status));
    status = cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice);
    if (status != cudaSuccess)
        fprintf(stderr, "Memcpy b HtoD : %s\n", cudaGetErrorString(status));
    /* Copy c too so d_c is initialized even if the kernel writes
       nothing — matches the original code's behavior. */
    status = cudaMemcpy(d_c, c, bytes, cudaMemcpyHostToDevice);
    if (status != cudaSuccess)
        fprintf(stderr, "Memcpy c HtoD : %s\n", cudaGetErrorString(status));

    /* One thread per element: block size a multiple of the warp size,
       grid rounded up (ceil-div) so every element is covered. */
    const int threads = 256;
    const int blocks  = (n + threads - 1) / threads;
    array<<<blocks, threads>>>(d_a, d_b, d_c, nx, ny, nz);

    status = cudaGetLastError();   /* catches a bad launch configuration */
    if (status != cudaSuccess)
        fprintf(stderr, "Kernel launch : %s\n", cudaGetErrorString(status));

    /* Blocking copy-back also synchronizes with the kernel. */
    status = cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
    if (status != cudaSuccess)
        fprintf(stderr, "Memcpy c DtoH : %s\n", cudaGetErrorString(status));

    /* Echo the result. */
    printf("\n\nC-Result :\n");
    for (k = 0; k < nz; k++)
    {
        printf("\n");
        for (j = 0; j < ny; j++)
        {
            printf("\n");
            for (i = 0; i < nx; i++)
                printf("\t%d", c[i + j * nx + k * nx * ny]);
        }
    }
    printf("\n\n");

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
}

Remember, it is linear memory. The 3D array is stored like a big vector. Adding two such arrays becomes like adding two big vectors together. Vector addition is about the simplest kernel anyone could write. Use one thread per vector entry.