Assume I am trying to filter a 5x5 image (padded to 7x7) with a 3x3 kernel.
The following setup code for nppiFilter_8u_C1R fails with a KERNEL_EXECUTION error:
[codebox]
//inputDims={7,7}
//imDims ={5,5}
//kernelDims= {3,3}
Npp8u* pSrc = nppiMalloc_8u_C1(inputDims[0],inputDims[1], (int*)inputDims);
Npp8u* pDst = nppiMalloc_8u_C1(imDims[0],imDims[1], (int*)imDims);
mxAssert((pSrc!= 0),"Could not allocate source memory on GPU");
mxAssert((pDst!= 0),"Could not allocate destination memory on GPU");
// 7x7, padded buffer
cudaMemcpy(pSrc,mxGetData(prhs[0]),inputDims[0]*inputDims[1]
,cudaMemcpyHostToDevice);
//Coeffcients are expected to be stored in reverse order.
//Does this need to be copied to constant memory? How?
const Npp32s* pKernel = static_cast<Npp32s*>(mxGetData(prhs[1]));
//Advance source pointer beyond the padding to the actual start of image
pSrc= pSrc +
padSize*(imDims[0]+2*padSize) //top rows
+ padSize; //left side pad
//the number of rows is the distance
//from one raster line to the next
Npp32s nSrcStep = static_cast<Npp32s> (imDims[0]) + 2*padSize;
//Destination does not have padding
Npp32s nDstStep = static_cast<Npp32s> (imDims[0]);
NppiSize oSizeROI;
oSizeROI.width = imDims[0]; //int
oSizeROI.height = imDims[1];
NppiSize oKernelSize;
oKernelSize.width = kernelDims[0];
oKernelSize.height = kernelDims[1];
//must be centered, i.e. 0.5 * (width - 1) in present implementation
NppiPoint oAnchor;
oAnchor.x=static_cast<int>(0.5*(kernelDims[0]-1)); //TTD: is this the 'correct' way?
oAnchor.y=static_cast<int>(0.5*(kernelDims[1]-1));
//The factor by which the convolved summation from the Filter operation should be divided.
//If equal to the sum of coefficients, this will keep the maximum result value within full scale.
Npp32s nDivisor =1;
[/codebox]