cudaMemcpy3D for allocating arrays for textures only?

Hi,

I have one doubt.

cudaMemcpy3D for allocating arrays for textures only?

Why can’t them to allocate general purpose global memory??

Suppose if I want to add two 3D arrays, how do I assign threads?

CUDA C call…

__global__ void array(cudaPitchedPtr c, cudaPitchedPtr a, cudaPitchedPtr b, cudaExtent ext)

{

//  confused in this section

// whether this allocation is right??  

      int idx = blockDim.x * blockIdx.x + threadIdx.x;

if (idx>=ext.width*ext.height*ext.depth)

                return; //over size

dim z=idx/(ext.width*ext.height);

        dim y=(idx%(ext.width*ext.height))/ext.width;

        dim x=idx%ext.width;

 // How do i add two 3D arrays??

}

extern "C" void ker_thre_(cudaArray *a,cudaArray *b,cudaArray *c,int *x, int *y, int *z)

{

int NX,NY,NZ;

//	int i, j ,k;

NX=*x;

NY=*y;

NZ=*z;

size_t buf=NX*NY*NZ;

cudaPrintfInit(buf);

cudaError_t status = cudaSuccess;

cudaExtent ext;

  ext.width=NX*sizeof(int);

  ext.height=NY;

  ext.depth=NZ;

cudaPitchedPtr d_a;

 status=cudaMalloc3D(&d_a,ext);

 if(status != cudaSuccess)

   {

   fprintf(stderr, "Malloc d_a : %s\n", cudaGetErrorString(status));

   }

cudaPitchedPtr d_b;

 status=cudaMalloc3D(&d_b,ext);

 if(status != cudaSuccess)

   {

   fprintf(stderr, "Malloc d_b : %s\n", cudaGetErrorString(status));

   }

cudaPitchedPtr d_c;

 status=cudaMalloc3D(&d_c,ext);

 if(status != cudaSuccess)

   {

   fprintf(stderr, "Malloc d_c : %s\n", cudaGetErrorString(status));

   }

//..... Copy Host A to Device .........//

cudaMemcpy3DParms aptr ={0};

aptr.srcPtr= make_cudaPitchedPtr( (void*) a, NX*sizeof(int), NX, NY );

aptr.dstPtr= d_a;

aptr.extent= ext;

status=cudaMemcpy3D(&aptr);

if(status != cudaSuccess)

  {

  fprintf(stderr, "MemcpyHostToDevice : %s\n", cudaGetErrorString(status));

  }

//..... Copy Host B to Device .........//

cudaMemcpy3DParms bptr ={0};

bptr.srcPtr= make_cudaPitchedPtr( (void*) b, NX*sizeof(int), NX, NY );

bptr.dstPtr= d_b;

bptr.extent= ext;

status=cudaMemcpy3D(&bptr);

if(status != cudaSuccess)

  {

  fprintf(stderr, "MemcpyHosToDevice : %s\n", cudaGetErrorString(status));

  }

array<<< 1,1 >>>(d_c,d_a,d_b,ext);

//........ Copy Device to Host C.......//

cudaMemcpy3DParms cptr ={0};

cptr.srcPtr= d_c;

cptr.dstPtr= make_cudaPitchedPtr( (void*) c, NX*sizeof(int), NX, NY );

cptr.extent= ext;

status=cudaMemcpy3D(&cptr);

if(status != cudaSuccess)

  {

  fprintf(stderr, "MemcpyDeviceToHost: %s\n", cudaGetErrorString(status));

  }

/*

for(i=0;i<NX;i++) 

   {

   printf("\n"); 

   for(j=0;j<NY;j++) 

      {

      printf("\n");

      for(k=0;k<NZ;k++) 

         printf("%d\t", c[i+j*NX+k*NX*NY]);

      }

   }      

*/

printf("\n\n");

cudaPrintfEnd();

	

cudaFree(d_a.ptr);

cudaFree(d_b.ptr);

cudaFree(d_c.ptr);

Any suggestions ??

thanks in advance.

Yes. The resultant CUDA array is an opaque object which cannot be manipulated in used code unless it is bound to a texture and the texture API is used. If you are working with 3D arrays, just use linear memory and index into it in either row or column major ordering

column major ordering maps position(xindex,yindex,zindex) to memoryidx = xindex + (xdim * yindex) + (xdim * ydim *zindex)

row major ordering maps (xindex,yindex,zindex) to memoryidx = zindex + (zdim * yindex) + (zdim * ydim * xindex)

the column major order version should match how a FORTRAN multidimensional array is stored in memory.

Hi avidday,

Thank you for reply,

I agree with you. I tried. But performance wise CUDA C program takes more time ( linear array).

__global__ void array(int *a, int *b, int *c, int x, int y, int z)

{

int i, j ,k, A;

// Here, how do I divide threads?? 

/* 

for(k=0;k<z;k++)

   for(j=0;j<y;j++)

      for(i=0;i<x;i++)

         {

         A=i+j*x+k*x*y;

         c[A]=  a[A]+b[A];

         }

*/

}

extern "C" void ker_thre_(int *a,int *b,int *c,int *x, int *y, int *z)

{

int *d_a,*d_b, *d_c;

int n, i,j,k;

int nx,ny,nz;

nx=*x;

ny=*y;

nz=*z;

n=nx*ny*nz;

dim3 grid_size(*x,*y);

cudaMalloc((void **) &d_a, sizeof(int)*n);

cudaMalloc((void **) &d_b, sizeof(int)*n);

cudaMalloc((void **) &d_c, sizeof(int)*n);

printf("C - A\n\n");

for(k=0;k<nz;k++)

   {

   printf("\n");

   for(j=0;j<ny;j++)

      {

      printf("\n");

      for(i=0;i<nx;i++)

      printf("\t%d", a[i+j*nx+k*nx*ny]);

      }

   }

printf("\nC - B\n\n");

for(k=0;k<nz;k++)

   {

   printf("\n");

   for(j=0;j<ny;j++)

      {

      printf("\n");

      for(i=0;i<nx;i++)

       printf("\t%d", b[i+j*nx+k*nx*ny]);

      }

   }

cudaMemcpy( d_a, a, sizeof(int)*n, cudaMemcpyHostToDevice );

cudaMemcpy( d_b, b, sizeof(int)*n, cudaMemcpyHostToDevice );

cudaMemcpy( d_c, c, sizeof(int)*n, cudaMemcpyHostToDevice );

dim3 block(nx,ny);

dim3 grid(n/nx,n/ny);

array<<<grid, block>>>(d_a,d_b,d_c,nx,ny,nz);

//array<<<1,1>>>(d_a,d_b,d_c,nx,ny,nz);

cudaMemcpy(c,d_c, sizeof(int)*n, cudaMemcpyDeviceToHost);

printf("\n\nC-Result :\n");

for(k=0;k<nz;k++)

   {

   printf("\n");

   for(j=0;j<ny;j++)

      {

      printf("\n");

      for(i=0;i<nx;i++)

       printf("\t%d",c[i+j*nx+k*nx*ny]);

     }

  }

printf("\n\n");

cudaFree(d_a);

cudaFree(d_b);

cudaFree(d_c);

}

Remember, it is linear memory. The 3D array is stored like a big vector. Adding two such arrays becomes like adding two big vectors together. Vector addition is about the simplest kernel anyone could write. Use one thread per vector entry.