I was playing around with NPP and came across nppiSum_8u_C1R(const Npp8u * pSrc, int nSrcStep, NppiSize oSizeROI, Npp8u * pDeviceBuffer, Npp64f * pSum). First question: am I right in assuming that pSum receives the sum of all pixel values?
And if that is true, what is wrong with this code snippet?
I always get denormalized results in pSum (6.216239955045e-315#DEN) in this example. On the other hand, using nppiMean_StdDev_8u_C1R(…) in the same way returns correct values.
Is pSum a host pointer (in contrast to nppiMean_StdDev_8u_C1R) or a device pointer? Either way, it doesn’t return the expected value.
NppStatus is always NPP_NO_ERROR. Thanks in advance!
Now I’m completely lost: the NPP documentation says that nppiMean_StdDev_8u_C1R() uses host pointers for pMean and pStdDev, but using host pointers in the sample code produces garbage values. Only device pointers work. Is the documentation wrong, or have I missed something?
For completeness: I’m using NPP 4 with CUDA 4 on a GeForce GTX 285 under Windows 7 (64-bit).
// Example: upload a random 256x256 8-bit image, run nppiMean_StdDev_8u_C1R and
// nppiSum_8u_C1R on it, and copy the three scalar results plus the image back
// to the host.
//
// Every CUDA runtime / NPP call is checked and the snippet stops at the first
// failure, so a bad result cannot be caused by an earlier call that silently
// failed.

// Fail-fast helpers, local to this snippet (undefined again at the end).
// They terminate without a message; swap in the project's logging if available.
#define NPP_SNIPPET_CUDA_OK(call) \
    do { if ((call) != cudaSuccess) exit(EXIT_FAILURE); } while (0)
#define NPP_SNIPPET_NPP_OK(call) \
    do { status = (call); if (status != NPP_NO_ERROR) exit(EXIT_FAILURE); } while (0)

int pitch = 0;                  // row stride of d_image in bytes, set by nppiMalloc_8u_C1
NppiSize size;                  // image data size
size.height = 256;
size.width  = 256;

// Byte count of the tightly packed host image, computed in size_t.
const size_t hostBytes = (size_t)size.width * (size_t)size.height;

// alloc image on device (rows are padded to `pitch` bytes)
Npp8u* d_image = nppiMalloc_8u_C1(size.width, size.height, &pitch);
if (d_image == NULL)
    exit(EXIT_FAILURE);

// alloc image on host
unsigned char* h_img = (unsigned char*)malloc(hostBytes);
if (h_img == NULL)
    exit(EXIT_FAILURE);

// fill host image with random data; `% 255` yields values in 0..254
for (int i = 0; i < size.width; i++)
{
    for (int j = 0; j < size.height; j++)
    {
        h_img[i + j * size.width] = rand() % 255;
    }
}

// copy host image (stride = width) to device image (stride = pitch)
NPP_SNIPPET_CUDA_OK(cudaMemcpy2D(d_image, pitch, h_img, size.width,
                                 size.width, size.height, cudaMemcpyHostToDevice));

NppStatus status = NPP_NO_ERROR;

// scratch buffer size reported by nppiReductionGetBufferHostSize_8u_C1R
int bufferSize = 0;

// device slots for the sum, mean and standard-deviation results
Npp64f* d_sum  = NULL;
Npp64f* d_mean = NULL;
Npp64f* d_dev  = NULL;
NPP_SNIPPET_CUDA_OK(cudaMalloc((void **)&d_sum,  sizeof(Npp64f)));
NPP_SNIPPET_CUDA_OK(cudaMalloc((void **)&d_mean, sizeof(Npp64f)));
NPP_SNIPPET_CUDA_OK(cudaMalloc((void **)&d_dev,  sizeof(Npp64f)));

// Zero the result slots so that a call which writes fewer than 8 bytes (or
// nothing at all) is distinguishable from stale device memory.
NPP_SNIPPET_CUDA_OK(cudaMemset(d_sum,  0, sizeof(Npp64f)));
NPP_SNIPPET_CUDA_OK(cudaMemset(d_mean, 0, sizeof(Npp64f)));
NPP_SNIPPET_CUDA_OK(cudaMemset(d_dev,  0, sizeof(Npp64f)));

// Get scratch buffer size
NPP_SNIPPET_NPP_OK(nppiReductionGetBufferHostSize_8u_C1R(size, &bufferSize));

// alloc scratch buffer on device
Npp8u* buffer = NULL;
NPP_SNIPPET_CUDA_OK(cudaMalloc((void **)&buffer, bufferSize));

// run nppi stddev/mean function
NPP_SNIPPET_NPP_OK(nppiMean_StdDev_8u_C1R(d_image, pitch, size, d_mean, d_dev));

// run nppi sum function
NPP_SNIPPET_NPP_OK(nppiSum_8u_C1R(d_image, pitch, size, buffer, d_sum));

// Copy values to host (blocking copies, so the results are complete afterwards)
double h_sum, h_mean, h_dev;
NPP_SNIPPET_CUDA_OK(cudaMemcpy(&h_sum , d_sum , sizeof(Npp64f), cudaMemcpyDeviceToHost));
NPP_SNIPPET_CUDA_OK(cudaMemcpy(&h_mean, d_mean, sizeof(Npp64f), cudaMemcpyDeviceToHost));
NPP_SNIPPET_CUDA_OK(cudaMemcpy(&h_dev , d_dev , sizeof(Npp64f), cudaMemcpyDeviceToHost));

// NOTE(review): the reported h_sum of 6.216239955045e-315 is a denormal whose
// low 32 bits are roughly 0x4AFE5xxx. Read as an IEEE-754 single-precision
// float that is about 8.33e6, which matches the expected pixel sum
// (256 * 256 pixels * mean of ~127). This suggests the installed NPP build
// stores a 32-bit float into *pSum instead of an Npp64f -- TODO confirm against
// the NPP release notes before relying on it.

// Copy image back to host
unsigned char* host = (unsigned char*)malloc(hostBytes);
if (host == NULL)
    exit(EXIT_FAILURE);
NPP_SNIPPET_CUDA_OK(cudaMemcpy2D(host, size.width, d_image, pitch,
                                 size.width, size.height, cudaMemcpyDeviceToHost));

// NOTE: nothing is released here because later code may still use the buffers.
// Once the results are consumed, release with nppiFree(d_image),
// cudaFree(d_sum / d_mean / d_dev / buffer) and free(h_img / host).

#undef NPP_SNIPPET_NPP_OK
#undef NPP_SNIPPET_CUDA_OK