Invalid argument when calling cudaMemcpy3D

When running the following code I get an invalid argument error from cudaMemcpy3D in InitializeAndBindInsert3DTexture.

texture<float, 3, cudaReadModeElementType> insert3DTexture;

cudaArray *insert3DArray;

// Copies a float volume from linear device memory into a freshly allocated
// 3D cudaArray (insert3DArray) and binds that array to the file-level texture
// reference insert3DTexture.
//
// d_Input  - device pointer to the volume; the copy below treats it as tightly
//            packed, i.e. a row pitch of sizeof(float) * dataSize.dataWidth.
// dataSize - volume dimensions (dataWidth, dataHeight, dataDepth) in elements.
//
// Errors are reported through CheckCUDAError() after each runtime call.
// The array allocated here is not released by this function.
void InitializeAndBindInsert3DTexture(float *d_Input, DataSize dataSize) {
    // Single-channel 32-bit float texels
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    CheckCUDAError();

    // Set texture parameters. The texture is 3D, so all three axes get an
    // explicit address mode; leaving addressMode[2] unset would keep whatever
    // the texture reference defaulted to instead of clamping like x and y.
    insert3DTexture.addressMode[0] = cudaAddressModeClamp;
    insert3DTexture.addressMode[1] = cudaAddressModeClamp;
    insert3DTexture.addressMode[2] = cudaAddressModeClamp;
    insert3DTexture.normalized = false;
    insert3DTexture.filterMode = cudaFilterModeLinear;

    // Allocate 3D array (extent given in elements)
    cudaExtent volumeSize = make_cudaExtent(dataSize.dataWidth, dataSize.dataHeight, dataSize.dataDepth);
    cudaMalloc3DArray(&insert3DArray, &channelDesc, volumeSize);
    CheckCUDAError();

    // Copy the volume data to the 3D array.
    // NOTE(review): the source is described as tightly packed (pitch equals
    // sizeof(float) * dataWidth), so its row pitch carries no particular
    // alignment. Presumably some toolkit versions require an aligned pitch for
    // device-to-array copies, which would explain "invalid argument" for
    // dimensions that are not multiples of 16 - confirm against the
    // cudaMemcpy3D documentation and consider a cudaMalloc3D-backed source.
    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr   = make_cudaPitchedPtr((void*)d_Input, sizeof(float)*dataSize.dataWidth, dataSize.dataWidth, dataSize.dataHeight);
    copyParams.dstArray = insert3DArray;
    copyParams.extent   = volumeSize;
    copyParams.kind     = cudaMemcpyDeviceToDevice;
    cudaMemcpy3D(&copyParams);
    CheckCUDAError();

    // Bind the array to the 3D texture
    cudaBindTextureToArray(insert3DTexture, insert3DArray, channelDesc);
    CheckCUDAError();
}

// Host-side launcher for the Insert3D kernel.
//
// Binds d_InputData to the 3D texture, launches the kernel over a grid sized
// from the input volume, waits for it to finish, then unbinds the texture and
// frees the backing array.
//
// d_OutputData / outputDataSize - device output volume and its dimensions.
// d_InputData  / inputDataSize  - device input volume and its dimensions.
// startPosWidth/Height/Depth    - forwarded unchanged to the kernel
//                                 (presumably the insertion offset inside the
//                                 output volume - confirm against the kernel).
void Insert3D(float *d_OutputData, DataSize outputDataSize, float *d_InputData, DataSize inputDataSize, int startPosWidth, int startPosHeight, int startPosDepth) {

    InitializeAndBindInsert3DTexture(d_InputData, inputDataSize);

    // Set dimensions of the inserting kernel: 8 x 8 x 8 threads per block
    int threadsInX = 8;
    int threadsInY = 8;
    int threadsInZ = 8;

    // Round up so that volumes whose size is not a multiple of 8 are covered
    int blocksInX = (inputDataSize.dataWidth+threadsInX-1)/threadsInX;
    int blocksInY = (inputDataSize.dataHeight+threadsInY-1)/threadsInY;
    int blocksInZ = (inputDataSize.dataDepth+threadsInZ-1)/threadsInZ;

    dim3 dimBlock = dim3(threadsInX, threadsInY, threadsInZ);
    // The grid is 2D: the Z blocks are folded into the grid's Y dimension.
    // blocksInY and its reciprocal are passed to the kernel, presumably so it
    // can split blockIdx.y back into Y and Z block indices.
    dim3 dimGrid = dim3(blocksInX, blocksInY*blocksInZ);

    Insert3D<<<dimGrid, dimBlock>>>(d_OutputData, outputDataSize, startPosWidth, startPosHeight, startPosDepth, inputDataSize, blocksInY, 1.0f/(float)blocksInY);
    // Check right after the launch for launch-time problems
    CheckCUDAError();

    // Kernel launches are asynchronous; faults raised while the kernel runs
    // only become visible once the device has been synchronized, so check
    // again after the wait instead of leaving them to a later, unrelated call.
    cudaThreadSynchronize();
    CheckCUDAError();

    UnbindInsert3DTexture();

    FreeInsert3DArray();

}

// Host entry point: runs the 3D insert on the GPU using host buffers.
//
// h_Output - host output volume (outputWidth x outputHeight x outputDepth
//            floats). Its current contents are uploaded before the insert and
//            it is overwritten with the device result afterwards.
// h_Input  - host input volume (inputWidth x inputHeight x inputDepth floats).
// startPosWidth/Height/Depth - forwarded unchanged to Insert3D.
//
// Each CUDA runtime call is followed by CheckCUDAError().
void CUDAInsert3D(float *h_Output, int outputWidth, int outputHeight, int outputDepth, float *h_Input, int inputWidth, int inputHeight, int inputDepth, int startPosWidth, int startPosHeight, int startPosDepth) {

    // Describe the input volume; dataSize is later used as the byte count
    // for the device allocation and the copies
    DataSize inSize;
    inSize.dataWidth = inputWidth;
    inSize.dataHeight = inputHeight;
    inSize.dataDepth = inputDepth;
    inSize.dataSize = EstimateDataSize(inSize, sizeof(float));

    // Describe the output volume the same way
    DataSize outSize;
    outSize.dataWidth = outputWidth;
    outSize.dataHeight = outputHeight;
    outSize.dataDepth = outputDepth;
    outSize.dataSize = EstimateDataSize(outSize, sizeof(float));

    // Device-side buffers for both volumes
    float *devIn;
    float *devOut;

    cudaMalloc((void**)&devIn, inSize.dataSize);
    CheckCUDAError();
    cudaMalloc((void**)&devOut, outSize.dataSize);
    CheckCUDAError();

    // Upload both volumes; the output goes up too, so the device starts from
    // the host's existing output contents
    cudaMemcpy(devIn, h_Input, inSize.dataSize, cudaMemcpyHostToDevice);
    CheckCUDAError();
    cudaMemcpy(devOut, h_Output, outSize.dataSize, cudaMemcpyHostToDevice);
    CheckCUDAError();

    // Perform the insert on the device
    Insert3D(devOut, outSize, devIn, inSize, startPosWidth, startPosHeight, startPosDepth);

    // Bring the result back to the host
    cudaMemcpy(h_Output, devOut, outSize.dataSize, cudaMemcpyDeviceToHost);
    CheckCUDAError();

    // Release the device buffers
    cudaFree(devIn);
    CheckCUDAError();
    cudaFree(devOut);
    CheckCUDAError();

}

Does anyone have a suggestion to why I receive this error?

/D

Any suggestions?

Any suggestions?

Hmm, it seems like no one knows what the problem is. After testing a bit more, I realised that the copy fails only for certain sizes of the data.

The code:

// Allocate 3D array
cudaExtent volumeSize = make_cudaExtent(dataSize.dataWidth, dataSize.dataHeight, dataSize.dataDepth);
cudaMalloc3DArray(&insert3DArray, &channelDesc, volumeSize);
CheckCUDAError();

// Copy modified volume data to the 3D array
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr((void*)d_Input, sizeof(float)*dataSize.dataWidth, dataSize.dataWidth, dataSize.dataHeight);
copyParams.dstArray = insert3DArray;
copyParams.extent = volumeSize;
copyParams.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&copyParams);
CheckCUDAError();

It seems that if dataSize.dataWidth/Height/Depth is not a multiple of 16, then cudaMemcpy3D fails. Why?

I know that the Programming Guide suggests using cudaMalloc3D and so on for 2D and 3D data, but as I understand it, that is only for efficient access. Or am I wrong?

To be noted is that d_Input is allocated with cudaMalloc.

/D

Hmm, it seems like no one knows what the problem is. After testing a bit more, I realised that the copy fails only for certain sizes of the data.

The code:

// Allocate 3D array
cudaExtent volumeSize = make_cudaExtent(dataSize.dataWidth, dataSize.dataHeight, dataSize.dataDepth);
cudaMalloc3DArray(&insert3DArray, &channelDesc, volumeSize);
CheckCUDAError();

// Copy modified volume data to the 3D array
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr((void*)d_Input, sizeof(float)*dataSize.dataWidth, dataSize.dataWidth, dataSize.dataHeight);
copyParams.dstArray = insert3DArray;
copyParams.extent = volumeSize;
copyParams.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&copyParams);
CheckCUDAError();

It seems that if dataSize.dataWidth/Height/Depth is not a multiple of 16, then cudaMemcpy3D fails. Why?

I know that the Programming Guide suggests using cudaMalloc3D and so on for 2D and 3D data, but as I understand it, that is only for efficient access. Or am I wrong?

To be noted is that d_Input is allocated with cudaMalloc.

/D

I haven’t read your post very thoroughly, but take a look at http://forums.nvidia.com/index.php?showtopic=181937

I haven’t read your post very thoroughly, but take a look at http://forums.nvidia.com/index.php?showtopic=181937