cudaMalloc3D/cudaMemcpy3D & bad values in kernel Always reading 0 (device & emulation modes)

I’m copying from an array of unsigned char*'s, each pointing to a “2D” image stored in a linear array. I copy the slices one at a time to a single unsigned char* region in device memory using cudaMemcpy3D. When I read the data from within the kernel, the values are all 0, rather than the correct image values. I’ve tried it in emulation & normal device mode. I’m out of ideas!

Host code:

[codebox]

hostFunction()

{

cudaPitchedPtr cudaImgPadded;

size_t imgPaddedPitch;

cudaExtent paddedSize = make_cudaExtent(paddedRows, paddedCols, slicesPerIter);

cudaError_t e = cudaMalloc3D(&cudaImgPadded, paddedSize);

if(e != cudaSuccess) { printf(“imgPadded: %s\n”, cudaGetErrorString(e)); }

cudaMemcpy3DParms paddedCpyParams = {0};

cudaPos srcPos = {0, 0, 0};

paddedCpyParams.srcPos = srcPos;

paddedCpyParams.dstPtr = cudaImgPadded;

paddedCpyParams.extent = paddedCpySize;

paddedCpyParams.kind = cudaMemcpyHostToDevice;

for(long s = 0; s < slicesPerIter; s++)

{

  cudaPos dst = {0, 0, s};

  paddedCpyParams.dstPos = dst;

  paddedCpyParams.srcPtr = make_cudaPitchedPtr( (void*)(imgPadded.slicePtrs[s]), paddedCols * sizeof(char), paddedCols, paddedRows);

	

  CUDA_SAFE_CALL

  ( 

     //Since Image3D uses a std::vector of Images, we must copy each slice individually

     cudaMemcpy3D(paddedCpyParams)

  );

}

cudaPitchedPtr cudaOutput;

cudaExtent outputSize = make_cudaExtent(paddedRows, paddedCols, slicesPerIter);

CUDA_SAFE_CALL

(

  cudaMalloc3D(&cudaOutput, outputSize);

);

dim3 dimBlock(BLOCK_EDGE, BLOCK_EDGE, 1);

float gridCols = (float)paddedCols / (float)dimBlock.x;

float gridRows = (float)paddedRows / (float)dimBlock.y;

dim3 dimGrid(ceil(gridCols), ceil(gridRows), 1);

kernel<<<dimGrid, dimBlock>>> (cudaOutput, cudaImgPadded);

cudaMemcpy3DParms outputParams = {0};

outputParams.srcPos = srcPos;

outputParams.srcPtr = cudaOutput;

outputParams.extent = outputSize;

outputParams.dstPos = srcPos;

outputParams.dstPtr = make_cudaPitchedPtr((void*)(output.data), cols * sizeof(char), cols, rows);

outputParams.kind = cudaMemcpyDeviceToHost;

CUDA_SAFE_CALL

(

  cudaMemcpy3D(&outputParams)

);

// Then (for debugging) I write the output to a file

}

[/codebox]

Kernel:

[codebox]

global void kernel(cuda_pitchedPtr output, int outputCols, int outputRows, cuda_pitchedPtr imgPadded, int paddedCols, int paddedRows, int slices)

{

int row = threadIdx.x + (blockDim.x * blockIdx.x);

int col = threadIdx.y + (blockDim.y * blockIdx.y);

unsigned char* pixel = 0;

for(int s = 0; s < slices; s++)

{

  if((col < outputCols) && (row < outputRows))

  {

     int sliceOffset = s * imgPadded.pitch * paddedRows;

     int rowOffset = row * imgPadded.pitch;

pixel = (unsigned char*)((char*)(imgPadded.ptr) + col + rowOffset + sliceOffset);

     *((char*)(output.ptr) + col + rowOffset + sliceOffset) = *pixel;

  }

}

}

[/codebox]

The code I pasted in the box on the forum hasn’t been tested to see if it compiles. My actual code compiles and runs, but I simplified it and changed some variable names for the purpose of this post.

Also, the kernel is going to do calculations, but I’m not doing that now since I can’t read the input data.

If I replace the line where I read the input data with this:

int max = imgPadded.pitch * slices * paddedCols;

int scale = ((col + rowOffset + sliceOffset) * 255)/max;

*((char*)(output.ptr) + col + rowOffset + sliceOffset) = max;

It works like I’d expect it to, with the output image being a gradient as the rows/columns/slices are increasing.

Any ideas?

Thanks!