Hello all!
I’ve been browsing the forums and the internet in general to try to understand the proper usage of cudaMemcpy3D. I am able to successfully copy an array from the host to the device:
void foo(void* mem_host, cudaPitchedPtr mem_device, size_t width, size_t height, size_t depth) {
cudaExtent extent = make_cudaExtent(width * sizeof(float), height, depth);
cudaMalloc3D(&mem_device, extent);
cudaMemcpy3DParms p = {0};
p.srcPtr = make_cudaPitchedPtr(mem_host, depth * sizeof(float), width, height);
p.dstPtr = mem_device;
p.extent = extent;
p.kind = cudaMemcpyHostToDevice;
cudaError_t errorCode = cudaMemcpy3D(&p);
if (cudaSuccess == errorCode) {
std::cout << "Successful Copy To Device!" << std::endl;
} else {
std::cout << "Error Copy To Device: " << cudaGetErrorString(errorCode)<< std::endl;
}
}
However, as soon as I try to do the reverse and copy the data back to the host from the device, I get error code 11 (invalid argument):
void bar(cudaPitchedPtr mem_device, void* mem_host, size_t width, size_t height, size_t depth) {
cudaExtent extent = make_cudaExtent(width * sizeof(float), height, depth);
cudaMemcpy3DParms q = {0};
q.dstPtr = make_cudaPitchedPtr(mem_host, depth * sizeof(float), width, height);
q.srcPtr = mem_device;
q.extent = extent;
q.kind = cudaMemcpyDeviceToHost;
cudaError_t errorCode = cudaMemcpy3D(&q);
if (cudaSuccess == errorCode) {
std::cout << "Successful Copy To Host!" << std::endl;
} else {
std::cout << "Error Copy To Host: " << cudaGetErrorString(errorCode) << std::endl;
}
}
If I comment out the “q.extent = extent” then it works no problem (because nothing gets copied!), but that’s the only argument I’ve edited where anything has changed.
For completion, here is the relevant calling code:
//declarations - width is the only meaningful dimension to me.
int depth = 4;
int height = 25;
int width = 3
size_t d_Pitch = 0;
cudaPitchedPtr d_3D;
float*** h_3D;
//initializations
h_3D = new float** [depth];
for (int i = 0; i < depth; ++i) {
h_3D[i] = new float* [height];
for (int j = 0; j < height; ++j)
h_3D[i][j] = new float [width];
}
int count = 0;
for (int i = 0; i < depth; ++i) {
for (int j = 0; j < height; ++j) {
for (int k = 0; k < width; ++k) {
h_3D[i][j][k] = count;
}
count++;
}
}
//copy to device
foo((void*)h_3D, d_3D, width, height, depth);
//The following is optional, lets me know if something did not copy back correctly
for (int i = 0; i < depth; ++i) {
for (int j = 0; j < height; ++j) {
for (int k = 0; k < width; ++k) {
h_3D[i][j][k] = -4;
}
}
}
//copy back to host
bar(d_3D, (void*)h_3D, width, height, depth);
//print out results to see and verify...
I feel the answer is something simple (maybe when copying back to host the height/width still have to be multiples of 16/64 as it was in version 2.1?), and any help would be greatly appreciated.
Links to other forum posts I have looked at:
http://forums.nvidia.com/index.php?showtopic=174233
http://forums.nvidia.com/index.php?showtopic=165400
http://forums.nvidia.com/index.php?showtopic=92374
(And another post I can’t seem to find at this moment where someone challenges people to find a dimension whose height and width are not multiples of 16 and 64 respectively that works for cudaMemcpy3D for Device-Device transfer).