CUDA NPP image dot product fails with cudaErrorUnknown

Calling the function nppiDotProd_8u64f_C1R causes a cudaErrorUnknown. I’m able to compile and run the boxFilterNPP and histEqualizationNPP samples properly, so I assume my system is healthy. I’m running a GTX 470 (compute capability 2.0) with CUDA 5.5 and VS2012 x64 on Windows 7. I’ve also run many variations of this code on two systems and I get the same problem on both. Here is the code:

NppGpuComputeCapability capability = nppGetGpuComputeCapability();

NppiSize sizeROI;
sizeROI.width = 640;
sizeROI.height = 480;

int nBufferSize = 0;
NppStatus status = nppiDotProdGetBufferHostSize_8u64f_C1R(sizeROI,&nBufferSize);
if(status != NPP_SUCCESS) return status;

unsigned char *pDeviceBuffer;
cudaError_t err = cudaMalloc((void**)&pDeviceBuffer,nBufferSize);
if(err != cudaSuccess) return err;

int stepByte1 = 0;
Npp8u * buf1 = nppiMalloc_8u_C1(sizeROI.width, sizeROI.height, &stepByte1);
status = nppiSet_8u_C1R(1,buf1,stepByte1,sizeROI);
if(status != NPP_SUCCESS) return status;

int stepByte2 = 0;
Npp8u * buf2 = nppiMalloc_8u_C1(sizeROI.width, sizeROI.height, &stepByte2);
status = nppiSet_8u_C1R(1,buf2,stepByte2,sizeROI);
if(status != NPP_SUCCESS) return status;

err = cudaDeviceSynchronize();
if(err != cudaSuccess) return err;

double dp = 0;
status = nppiDotProd_8u64f_C1R(buf1,stepByte1,buf2,stepByte2,sizeROI,&dp,pDeviceBuffer);
if(status != NPP_SUCCESS) return status;

err = cudaDeviceSynchronize(); // return cudaErrorUnknown
                // CUDA memchecker gives me "OutOfRangeStore" exception
if(err != cudaSuccess) return err;

printf("result: %f\n", dp);

nppiFree(buf1);
nppiFree(buf2);
cudaFree(pDeviceBuffer);

Any idea about my problem?

Thank you very much!!

Solved. The result argument in the nppiDotProd call must be a device pointer, not a host pointer. It can be fixed by allocating memory for dp on the device and copying the result back to the host afterwards. Thanks to talonmies. (The documentation wasn’t very explicit about this detail.)