I’m not the original poster, but I’m having the same problem. Here’s some of my code.
printf("Step sizes: %d, %d\n", paddedImg.pitch(), oDeviceDst.pitch());
printf("ROI: %d, %d\n", imgSz.width, imgSz.height);
printf("Kernel Size: %d, %d\n", kernelSize.width, kernelSize.height);
printf("Kernel Anchor: %d, %d\n", kernelAnchor.x, kernelAnchor.y);
printf("Divisor: %d\n", intKernel->divisor);
// Copy kernel to GPU
Npp32s *d_kernel;
int step;
d_kernel = nppiMalloc_32s_C1(kernelSize.width, kernelSize.height, &step);
printf("Kernel step: %d\n", step);
cudaMemcpy2D(d_kernel, step, intKernel->val, kernelSize.width * sizeof(Npp32s), kernelSize.width * sizeof(Npp32s), kernelSize.height, cudaMemcpyHostToDevice);
printf("cudaMemcpy2d error status: %s\n", cudaGetErrorString( cudaGetLastError() ) );
Npp32s *cpuKernel = (Npp32s*)malloc(kernelSize.width*kernelSize.width*sizeof(Npp32s));
cudaMemcpy2D(cpuKernel, kernelSize.width * sizeof(Npp32s), d_kernel, step, kernelSize.width * sizeof(Npp32s), kernelSize.height, cudaMemcpyDeviceToHost);
printf("cudaMemcpy2d error status: %s\n", cudaGetErrorString( cudaGetLastError() ) );
for (int y = 0; y < kernelSize.height; y++) {
for (int x = 0; x < kernelSize.width; x++) {
printf("%3d ", cpuKernel[y*kernelSize.width+x]);
}
printf("\n");
}
eStatusNPP = nppiFilter_8u_C1R(paddedImg.data(widthOffset, heightOffset), paddedImg.pitch(),
oDeviceDst.data(), oDeviceDst.pitch(),
imgSz, d_kernel, kernelSize, kernelAnchor, intKernel->divisor);
printf("nppiFilter error status: %d\n", eStatusNPP);
This is the output from the printf statements in the code running on the 512x512 Lena image that comes with NPP. I’ll attach the output image I get as well.
Step sizes: 768, 768
ROI: 512, 512
Kernel Size: 3, 3
Kernel Anchor: 1, 1
Divisor: 9
Kernel step: 256
cudaMemcpy2d error status: no error
cudaMemcpy2d error status: no error
1 1 1
1 1 1
1 1 1
nppiFilter error status: 0
Saved image: ../../data/Lena_unsharpMask.pgm