Hi,
I have one doubt.
Is cudaMemcpy3D only for arrays that are used with textures?
Why can't it be used with general-purpose global memory?
Suppose I want to add two 3D arrays element by element: how do I assign threads to the elements?
The CUDA C code is below:
/*
 * Element-wise addition of two pitched 3D int volumes: c = a + b.
 *
 * c, a, b : pitched device allocations (as returned by cudaMalloc3D) that all
 *           cover the same extent.
 * ext     : extent of the volumes. ext.width is a size in BYTES (the caller
 *           sets it to NX*sizeof(int)); ext.height and ext.depth are counts
 *           of rows and slices.
 *
 * Launch layout: 1D grid of 1D blocks. The loop strides by the total number of
 * launched threads, so any grid/block size (including <<<1,1>>>) produces a
 * complete result. No shared memory is used.
 */
__global__ void array(cudaPitchedPtr c, cudaPitchedPtr a, cudaPitchedPtr b, cudaExtent ext)
{
    // Convert the byte width into an element count before doing index math.
    const size_t nx = ext.width / sizeof(int);
    const size_t ny = ext.height;
    const size_t nz = ext.depth;
    const size_t sliceElems = nx * ny;
    const size_t total = sliceElems * nz;

    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         idx < total;
         idx += stride)
    {
        // Decompose the flat index into (x, y, z); x varies fastest so that
        // neighbouring threads touch neighbouring ints of the same row.
        const size_t z = idx / sliceElems;
        const size_t y = (idx % sliceElems) / nx;
        const size_t x = idx % nx;

        // Rows are 'pitch' bytes apart and slices are pitch * height bytes
        // apart, so row (y, z) starts at byte offset (z * ny + y) * pitch.
        const size_t row = z * ny + y;
        const int *rowA = (const int *)((const char *)a.ptr + row * a.pitch);
        const int *rowB = (const int *)((const char *)b.ptr + row * b.pitch);
        int *rowC = (int *)((char *)c.ptr + row * c.pitch);

        rowC[x] = rowA[x] + rowB[x];
    }
}
/*
 * Host wrapper: copies two NX x NY x NZ int volumes to the device, launches
 * the `array` kernel on them and copies the result back into c.
 *
 * a, b : input volumes, read as host memory with a row pitch of NX*sizeof(int).
 * c    : output volume, written as host memory with the same layout.
 * x, y, z : pointers to the three dimensions NX, NY, NZ.
 *
 * NOTE(review): a, b and c are declared as cudaArray* but are cast to void*
 * and handed to cudaMemcpy3D as pitched host pointers; presumably the caller
 * passes plain int buffers (the trailing underscore suggests a Fortran
 * caller) -- confirm, and consider changing the parameter type to int*.
 *
 * On any CUDA error a message is printed to stderr, the remaining steps are
 * skipped and the device buffers allocated so far are released.
 */
extern "C" void ker_thre_(cudaArray *a,cudaArray *b,cudaArray *c,int *x, int *y, int *z)
{
    const int NX = *x;
    const int NY = *y;
    const int NZ = *z;
    if (NX <= 0 || NY <= 0 || NZ <= 0)
    {
        fprintf(stderr, "ker_thre_ : invalid dimensions %d x %d x %d\n", NX, NY, NZ);
        return;
    }

    // Multiply in size_t so that large volumes do not overflow int.
    const size_t total = (size_t)NX * (size_t)NY * (size_t)NZ;
    cudaPrintfInit(total);

    cudaError_t status = cudaSuccess;
    bool ok = true;

    // cudaMalloc3D / cudaMemcpy3D on pitched memory take the width in bytes.
    const size_t rowBytes = (size_t)NX * sizeof(int);
    cudaExtent ext;
    ext.width = rowBytes;
    ext.height = NY;
    ext.depth = NZ;

    // Start with null pointers so the cudaFree calls at the end are safe even
    // when an allocation was skipped or failed.
    cudaPitchedPtr d_a = make_cudaPitchedPtr(NULL, 0, 0, 0);
    cudaPitchedPtr d_b = make_cudaPitchedPtr(NULL, 0, 0, 0);
    cudaPitchedPtr d_c = make_cudaPitchedPtr(NULL, 0, 0, 0);

    status = cudaMalloc3D(&d_a, ext);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "Malloc d_a : %s\n", cudaGetErrorString(status));
        ok = false;
    }

    if (ok)
    {
        status = cudaMalloc3D(&d_b, ext);
        if (status != cudaSuccess)
        {
            fprintf(stderr, "Malloc d_b : %s\n", cudaGetErrorString(status));
            ok = false;
        }
    }

    if (ok)
    {
        status = cudaMalloc3D(&d_c, ext);
        if (status != cudaSuccess)
        {
            fprintf(stderr, "Malloc d_c : %s\n", cudaGetErrorString(status));
            ok = false;
        }
    }

    //..... Copy Host A to Device .........//
    if (ok)
    {
        cudaMemcpy3DParms aptr = {0};
        aptr.srcPtr = make_cudaPitchedPtr((void *)a, rowBytes, NX, NY);
        aptr.dstPtr = d_a;
        aptr.extent = ext;
        // Zero-initialisation leaves kind == cudaMemcpyHostToHost; set the
        // real direction explicitly.
        aptr.kind = cudaMemcpyHostToDevice;
        status = cudaMemcpy3D(&aptr);
        if (status != cudaSuccess)
        {
            fprintf(stderr, "MemcpyHostToDevice : %s\n", cudaGetErrorString(status));
            ok = false;
        }
    }

    //..... Copy Host B to Device .........//
    if (ok)
    {
        cudaMemcpy3DParms bptr = {0};
        bptr.srcPtr = make_cudaPitchedPtr((void *)b, rowBytes, NX, NY);
        bptr.dstPtr = d_b;
        bptr.extent = ext;
        bptr.kind = cudaMemcpyHostToDevice;
        status = cudaMemcpy3D(&bptr);
        if (status != cudaSuccess)
        {
            fprintf(stderr, "MemcpyHostToDevice : %s\n", cudaGetErrorString(status));
            ok = false;
        }
    }

    //..... Launch: one thread per element, rounded up to whole blocks .....//
    if (ok)
    {
        const unsigned int threads = 256;
        const unsigned int blocks = (unsigned int)((total + threads - 1) / threads);
        array<<< blocks, threads >>>(d_c, d_a, d_b, ext);
        // A kernel launch returns nothing; an invalid configuration is only
        // visible through cudaGetLastError().
        status = cudaGetLastError();
        if (status != cudaSuccess)
        {
            fprintf(stderr, "Kernel launch : %s\n", cudaGetErrorString(status));
            ok = false;
        }
    }

    //........ Copy Device to Host C.......//
    if (ok)
    {
        cudaMemcpy3DParms cptr = {0};
        cptr.srcPtr = d_c;
        cptr.dstPtr = make_cudaPitchedPtr((void *)c, rowBytes, NX, NY);
        cptr.extent = ext;
        cptr.kind = cudaMemcpyDeviceToHost;
        // Errors raised while the kernel was running are also reported here.
        status = cudaMemcpy3D(&cptr);
        if (status != cudaSuccess)
        {
            fprintf(stderr, "MemcpyDeviceToHost: %s\n", cudaGetErrorString(status));
            ok = false;
        }
    }

    /*
    Debug dump of the result (disabled):
    for(i=0;i<NX;i++)
    {
    printf("\n");
    for(j=0;j<NY;j++)
    {
    printf("\n");
    for(k=0;k<NZ;k++)
    printf("%d\t", c[i+j*NX+k*NX*NY]);
    }
    }
    */
    printf("\n\n");
    cudaPrintfEnd();

    cudaFree(d_a.ptr);
    cudaFree(d_b.ptr);
    cudaFree(d_c.ptr);
}
Any suggestions?
Thanks in advance.