CUDA 4.0 cudaMemcpy3D invalid argument Error copying from Device to Host

Hello all!

I’ve been browsing the forums and the internet in general to try to understand the proper usage of cudaMemcpy3D. I am able to successfully copy an array from the host to the device:

void foo(void* mem_host, cudaPitchedPtr mem_device, size_t width, size_t height, size_t depth) {

	cudaExtent extent = make_cudaExtent(width * sizeof(float), height, depth);

	cudaMalloc3D(&mem_device, extent);

	cudaMemcpy3DParms p = {0};

	p.srcPtr = make_cudaPitchedPtr(mem_host, depth * sizeof(float), width, height);

	p.dstPtr = mem_device;

	p.extent = extent;

	p.kind = cudaMemcpyHostToDevice;

	cudaError_t errorCode = cudaMemcpy3D(&p);

	if (cudaSuccess == errorCode) {

		std::cout << "Successful Copy To Device!" << std::endl;

	} else {

		std::cout << "Error Copy To Device: " << cudaGetErrorString(errorCode)<< std::endl;

	}

}

However, as soon as I try to do the reverse and copy the data back to the host from the device, I get error code 11 (invalid argument):

void bar(cudaPitchedPtr mem_device, void* mem_host, size_t width, size_t height, size_t depth) {

	cudaExtent extent = make_cudaExtent(width * sizeof(float), height, depth);

	cudaMemcpy3DParms q = {0};

	q.dstPtr = make_cudaPitchedPtr(mem_host, depth * sizeof(float), width, height);

	q.srcPtr = mem_device;

	q.extent = extent;

	q.kind = cudaMemcpyDeviceToHost;

	cudaError_t errorCode = cudaMemcpy3D(&q);

	if (cudaSuccess == errorCode) {

		std::cout << "Successful Copy To Host!" << std::endl;

	} else {

		std::cout << "Error Copy To Host: " << cudaGetErrorString(errorCode) << std::endl;

	} 

}

If I comment out the “q.extent = extent” then it works no problem (because nothing gets copied!), but that’s the only argument I’ve edited where anything has changed.

For completion, here is the relevant calling code:

//declarations - width is the only meaningful dimension to me.

int depth = 4;

int height = 25;

int width = 3

size_t d_Pitch = 0;

cudaPitchedPtr d_3D;

float*** h_3D;

//initializations

h_3D = new float** [depth];

for (int i = 0; i < depth; ++i) {

	h_3D[i] = new float* [height];

	for (int j = 0; j < height; ++j)

		h_3D[i][j] = new float [width];

}

int count = 0;

for (int i = 0; i < depth; ++i) {

	for (int j = 0; j < height; ++j) {

		for (int k = 0; k < width; ++k) {

			h_3D[i][j][k] = count;

		}

		count++;

	}

}

//copy to device

foo((void*)h_3D, d_3D, width, height, depth);

//The following is optional, lets me know if something did not copy back correctly

for (int i = 0; i < depth; ++i) {

	for (int j = 0; j < height; ++j) {

		for (int k = 0; k < width; ++k) {

			h_3D[i][j][k] = -4;

		}

	}

}

//copy back to host

bar(d_3D, (void*)h_3D, width, height, depth);

//print out results to see and verify...

I feel the answer is something simple (maybe when copying back to host the height/width still have to be multiples of 16/64 as it was in version 2.1?), and any help would be greatly appreciated.

Links to other forum posts I have looked at:

http://forums.nvidia.com/index.php?showtopic=174233

http://forums.nvidia.com/index.php?showtopic=165400

http://forums.nvidia.com/index.php?showtopic=92374

(And another post I can’t seem to find at this moment where someone challenges people to find a dimension whose height and width are not multiples of 16 and 64 respectively that works for cudaMemcpy3D for Device-Device transfer).