Hi, I wrote the following two functions for copying memory to/from 3D pitched pointers (modified from older functions I found on this forum):
void copy3DHostToPitchedPtr(float *_src, cudaPitchedPtr _dst, int width, int height, int depth)
{
cudaExtent copy_extent = make_cudaExtent(width*sizeof(float),height,depth);
cudaMemcpy3DParms copyParams = {0};
float *h_source = _src;
copyParams.srcPtr = make_cudaPitchedPtr((void*)h_source, copy_extent.width, copy_extent.width/sizeof(float), copy_extent.height);
copyParams.dstPtr = _dst;
copyParams.kind = cudaMemcpyHostToDevice;
copyParams.extent = copy_extent;
CUDA_SAFE_CALL(cudaMemcpy3D(©Params));
CUT_CHECK_ERROR("Host -> Device Memcpy failed\n");
}
and:
void copy3DPitchedPtrToHost(cudaPitchedPtr _src, float *_dst, int width, int height, int depth)
{
cudaExtent copy_extent = make_cudaExtent(width*sizeof(float),height,depth);
cudaMemcpy3DParms copyParams = {0};
float *h_dest = _dst;
copyParams.srcPtr = _src;
copyParams.dstPtr = make_cudaPitchedPtr((void*)h_dest, width*sizeof(float), width, height);
copyParams.kind = cudaMemcpyDeviceToHost;
copyParams.extent = copy_extent;
CUDA_SAFE_CALL(cudaMemcpy3D(©Params));
CUT_CHECK_ERROR("Device -> Host Memcpy failed\n");
}
I also wrote a small test kernel that takes a 3d pitched pointer and populates it with numbers related to its element indexes:
__global__ void testKernel(cudaPitchedPtr v, int width, int height, int depth) {
//calculate x y z coordinates just for test purposes
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.x*blockDim.y + threadIdx.y;
unsigned int z = blockIdx.x*blockDim.z + threadIdx.z;
char* devPtr = (char *) v.ptr;
size_t pitch = v.pitch;
size_t slicePitch = pitch*height;
char* slice = devPtr + z*slicePitch;
float* row = (float*)(slice + y*pitch);
//overwrite element
row[x] = x + y + z;
}
However, when I execute the main function I get an error while copying back from device to host (‘invalid configuration argument’). I also find quite weird the fact that if I do not call my test kernel, but simply copy the memory host->device and then device->host straight away I get no errors and a correct memory transfer. Here is the main function:
void cudaTest(float *v, int width, int depth, int height) {
cudaPitchedPtr vGPU_;
width_ = width;
height_ = height;
depth_ = depth;
//allocate memory
cudaExtent pExtent = make_cudaExtent(width*sizeof(float),height,depth);
CUDA_SAFE_CALL(cudaMalloc3D(vGPU_,pExtent));
//copy host->device
copy3DHostToPitchedPtr(v,vGPU_,width_,height_,depth_);
//kernel execution
dim3 threadBlock(8,8,2);
dim3 gridBlock(iDivUp(width_,threadBlock.x), iDivUp(height_,threadBlock.y), iDivUp(depth_,threadBlock.z));
testKernel<<<gridBlock,threadBlock>>>(vGPU_,width_,height_,depth_);
//copy device->host
copy3DPitchedPtrToHost(vGPU_,v,width_,height_,depth_);
}
I am not too sure what I am doing wrong with such a simple code. I am using a GeForce GTX 280 (1.3 computing capabilities) and CUDA 4.0.