cudaMemcpy3D memory duplication

Hi everybody,

I’ve got an issue when I want to update the memory content of a cuda 3D array via cudaMemcpy3D.
Initialization: I create a CUDA 3D array, copy a device buffer (“d_out”) into it, and then bind the array to a texture.

What I want is to update the content of my array inside a for loop. Unfortunately, what I get is
a duplication of the content with an offset of width*height elements (which seems strange).
Here is a little piece of code :

for(int a = 0; a <10; ++a)
      /** here some kernels reading the texture and
         writing into d_out ***/

      checkCuda( cudaMemcpy3D(&copyParams),pExec); 
        // update the content of the array with the modified "d_out",
        // but the result seems to be a duplication of d_out with a height*width offset
        // (checked by simply reading the texture)

Thanks a lot for your help!


Cannot comment much without seeing the code.
Try calling cudaThreadSynchronize() (deprecated; cudaDeviceSynchronize() in newer CUDA releases) after your kernel.
It may be needed since you are doing a memcpy immediately after the kernel.

Thank you for your answer. Here are some details:

3d array and texture init :

// Create a 3D CUDA array of single-channel float elements with extent W x H x Z.
	cudaExtent extent = make_cudaExtent(W, H, Z);
	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
	cudaArray* cu_array=0;
	checkCuda( cudaMalloc3DArray(&cu_array, &channelDesc, extent),pExec );

	// Copy the device buffer d_out into the 3D array (device-to-device).
	// All fields not set explicitly below stay zero-initialised.
	cudaMemcpy3DParms copyParams = {0};
	// Describe d_out as pitched memory: row pitch in bytes (width * sizeof(float)),
	// followed by the logical row width and the number of rows per slice.
	// NOTE(review): this assumes d_out is a tightly packed (unpadded) float buffer
	// of W*H*Z elements — confirm against its allocation.
	copyParams.srcPtr   = make_cudaPitchedPtr((void*)d_out,   extent.width*sizeof(float),extent.width , extent.height);
	copyParams.dstArray = cu_array;
	copyParams.extent   = extent;
	copyParams.kind     = cudaMemcpyDeviceToDevice;
	checkCuda( cudaMemcpy3D(&copyParams),pExec);
	// Set texture parameters.
	// NOTE(review): the CUDA Programming Guide documents wrap addressing only for
	// normalized coordinates; with normalized = false confirm the addressing
	// mode actually in effect is the intended one.
	tex.normalized = false;                      // false: texture is accessed with unnormalized coordinates
	tex.filterMode = cudaFilterModeLinear;      // linear interpolation
	tex.addressMode[0] = cudaAddressModeWrap;   // wrap texture coordinates
	tex.addressMode[1] = cudaAddressModeWrap;
	tex.addressMode[2] = cudaAddressModeWrap;

	// Bind the texture reference to the 3D array so kernels can read it through tex.
	checkCuda(cudaBindTextureToArray(tex, cu_array, channelDesc),pExec);

and here is the for loop in the main :

for(int a = 0; a <nbangle; ++a)
		float theta_r = -buffAngTilt[a] * (PI/180);
		checkCuda( cudaMemset(d_out, 0, size_f), pExec); // re init
		// 1st kernel
		transformKernel<<<dimGrid, dimBlock>>>(     d_out, //output  // input data is in tex
												              Z,												theta_r);
		cudaError_t err = cudaGetLastError();
		 if( cudaSuccess != err) {
			fprintf(pExec, " :CheckMsg() CUDA error :  : (%d) %s.\n", (int)err, cudaGetErrorString( err ) );
		// 2nd kernel
		REprojectionKernel<<<dimGrid_REproj,dimBlock_REproj >>>(    d_vol_proj, //in
																	d_out, //out
		 err = cudaGetLastError();
		 if( cudaSuccess != err) {
			fprintf(pExec, " :CheckMsg() CUDA error :  : (%d) %s.\n", (int)err, cudaGetErrorString( err ) );		
		 //update array content
		 checkCuda( cudaMemcpy3D(&copyParams),pExec);