Hi. I’m new in CUDA and now i’m trying to write kernel which copies image from one memory to another
for educational purposes.
So, that’s how i do it:
inline bool cudaAllocMapped( void** cpuPtr, void** gpuPtr, size_t size )
{
if( !cpuPtr || !gpuPtr || size == 0 )
return false;
if( CUDA_FAILED(cudaHostAlloc(cpuPtr, size, cudaHostAllocMapped)) )
return false;
if( CUDA_FAILED(cudaHostGetDevicePointer(gpuPtr, *cpuPtr, 0)) )
return false;
memset(*cpuPtr, 0, size);
return true;
}
__global__ void gpuCopy(uchar1* input, uchar1* output, size_t outputWidth, size_t outputHeight)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < 0 || x > 1920
|| y < 0 || y > 1080)
{
return;
}
const int inputPosition = y * 1920 + x;
output[inputPosition * 3].x = input[inputPosition * 3].x;
output[inputPosition * 3 + 1].x = input[inputPosition * 3 + 1].x;
output[inputPosition * 3 + 2].x = input[inputPosition * 3 + 2].x;
}
cudaError_t cudaCopy(uchar1* input, uchar1* output, size_t outputWidth, size_t outputHeight, cudaStream_t stream)
{
if (!input)
{
return cudaErrorInvalidDevicePointer;
}
if (outputWidth == 0 || outputHeight == 0)
{
return cudaErrorInvalidValue;
}
const dim3 blockDim(8, 8);
const dim3 gridDim(iDivUp(outputWidth, blockDim.x), iDivUp(outputHeight, blockDim.y));
gpuCopy<<<gridDim, blockDim, 0, stream>>>(input, output, outputWidth, outputHeight);
return cudaSuccess;
}
void* cpuPtr, gpuPtr;
cudaAllocMapped(&cpuPtr, &gpuPtr, 1920 * 1080 * 3);
Mat img = cv::imread(“image.tif”);
cuda::GpuMat imgCuda;
imgCuda.upload(img);
cudaCopy((uchar1)imgCuda.data, (uchar1*)gpuPtr, 1920, 1080, 0);
cudaDeviceSynchronize();
Mat out(1080, 1920, img.type(), cpuPtr);
imwrite(“image_copy.tif”, out);
But copy is note same as original image.
Original:
After copy:
What is wrong here?