I’m copying from an array of unsigned char*'s, each pointing to a “2D” image stored in a linear array. I copy the slices one at a time to a single unsigned char* region in device memory using cudaMemcpy3D. When I read the data from within the kernel, the values are all 0, rather than the correct image values. I’ve tried it in emulation & normal device mode. I’m out of ideas!
Host code:
[codebox]
hostFunction()
{
cudaPitchedPtr cudaImgPadded;
size_t imgPaddedPitch;
cudaExtent paddedSize = make_cudaExtent(paddedRows, paddedCols, slicesPerIter);
cudaError_t e = cudaMalloc3D(&cudaImgPadded, paddedSize);
if(e != cudaSuccess) { printf(“imgPadded: %s\n”, cudaGetErrorString(e)); }
cudaMemcpy3DParms paddedCpyParams = {0};
cudaPos srcPos = {0, 0, 0};
paddedCpyParams.srcPos = srcPos;
paddedCpyParams.dstPtr = cudaImgPadded;
paddedCpyParams.extent = paddedCpySize;
paddedCpyParams.kind = cudaMemcpyHostToDevice;
for(long s = 0; s < slicesPerIter; s++)
{
cudaPos dst = {0, 0, s};
paddedCpyParams.dstPos = dst;
paddedCpyParams.srcPtr = make_cudaPitchedPtr( (void*)(imgPadded.slicePtrs[s]), paddedCols * sizeof(char), paddedCols, paddedRows);
CUDA_SAFE_CALL
(
//Since Image3D uses a std::vector of Images, we must copy each slice individually
cudaMemcpy3D(paddedCpyParams)
);
}
cudaPitchedPtr cudaOutput;
cudaExtent outputSize = make_cudaExtent(paddedRows, paddedCols, slicesPerIter);
CUDA_SAFE_CALL
(
cudaMalloc3D(&cudaOutput, outputSize);
);
dim3 dimBlock(BLOCK_EDGE, BLOCK_EDGE, 1);
float gridCols = (float)paddedCols / (float)dimBlock.x;
float gridRows = (float)paddedRows / (float)dimBlock.y;
dim3 dimGrid(ceil(gridCols), ceil(gridRows), 1);
kernel<<<dimGrid, dimBlock>>> (cudaOutput, cudaImgPadded);
cudaMemcpy3DParms outputParams = {0};
outputParams.srcPos = srcPos;
outputParams.srcPtr = cudaOutput;
outputParams.extent = outputSize;
outputParams.dstPos = srcPos;
outputParams.dstPtr = make_cudaPitchedPtr((void*)(output.data), cols * sizeof(char), cols, rows);
outputParams.kind = cudaMemcpyDeviceToHost;
CUDA_SAFE_CALL
(
cudaMemcpy3D(&outputParams)
);
// Then (for debugging) I write the output to a file
}
[/codebox]
Kernel:
[codebox]
global void kernel(cuda_pitchedPtr output, int outputCols, int outputRows, cuda_pitchedPtr imgPadded, int paddedCols, int paddedRows, int slices)
{
int row = threadIdx.x + (blockDim.x * blockIdx.x);
int col = threadIdx.y + (blockDim.y * blockIdx.y);
unsigned char* pixel = 0;
for(int s = 0; s < slices; s++)
{
if((col < outputCols) && (row < outputRows))
{
int sliceOffset = s * imgPadded.pitch * paddedRows;
int rowOffset = row * imgPadded.pitch;
pixel = (unsigned char*)((char*)(imgPadded.ptr) + col + rowOffset + sliceOffset);
*((char*)(output.ptr) + col + rowOffset + sliceOffset) = *pixel;
}
}
}
[/codebox]
The code I pasted in the box on the forum hasn’t been tested to see if it compiles. My actual code compiles and runs, but I simplified it and changed some variable names for the purpose of this post.
Also, the kernel is going to do calculations, but I’m not doing that now since I can’t read the input data.
If I replace the line where I read the input data with this:
int max = imgPadded.pitch * slices * paddedCols;
int scale = ((col + rowOffset + sliceOffset) * 255)/max;
*((char*)(output.ptr) + col + rowOffset + sliceOffset) = max;
It works like I’d expect it to, with the output image being a gradient as the rows/columns/slices are increasing.
Any ideas?
Thanks!