Finally got it working.
Code is posted below.
Thanks Robert.
inline int findCudaDevice()
{
cudaDeviceProp deviceProp;
int devID = 0;
// Otherwise pick the device with highest Gflops/s
devID = gpuGetMaxGflopsDeviceId();
checkCudaErrors(cudaSetDevice(devID));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
return devID;
}
// Initializes CUDA for this process and returns the ID of the selected device.
//
// Steps:
//   1. Query the number of CUDA devices; if there are none, print a message
//      to std::cerr and terminate the process with EXIT_FAILURE.
//   2. Let findCudaDevice() choose a device and make it current.
//   3. Log the chosen device name to std::cerr and set the device again.
//
// Returns: the device ID chosen by findCudaDevice(). The value is a device
// index, not a status code -- a non-zero return is not an error.
inline int cudaDeviceInit()
{
    int deviceCount = 0;
    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
    if (deviceCount == 0)
    {
        std::cerr << "CUDA error: no devices supporting CUDA." << std::endl;
        exit(EXIT_FAILURE);
    }

    int dev = findCudaDevice();

    // Check the property query like every other runtime call here: if it
    // failed unchecked, deviceProp.name would be uninitialized memory that
    // is then streamed to std::cerr.
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
    std::cerr << "cudaSetDevice GPU" << dev << " = " << deviceProp.name << std::endl;

    // findCudaDevice() already selected this device; the call is kept so this
    // function guarantees the current device on its own.
    checkCudaErrors(cudaSetDevice(dev));
    return dev;
}
int main(int argc, char *argv[])
{
// Load the input images
cv::Mat cvImageLeft = cv::imread(strLeftFileName, cv::IMREAD_GRAYSCALE);
if (cvImageLeft.empty())
{
throw std::runtime_error("Can't open '" + strLeftFileName + "'");
}
// initalize cuda device
int devID = cudaDeviceInit();
if ( devID != 0)
throw std::runtime_error("cudaDeviceInit fail ");
cudaError_t cudaRet ;
int nSrcStep;
// need to alloc cuda memory for source
Npp8u * pSrc = nppiMalloc_8u_C1(cvImageLeft.cols, cvImageLeft.rows, &nSrcStep);
printf("nSrcStep %d \n", nSrcStep);
// Need to copy image from Host to GPU Pay attention GPU memory is in power of 2 thus stride copy is required
for(int i=0; i< cvImageLeft.rows ; i++)
cudaRet = cudaMemcpy(pSrc + i*nSrcStep, cvImageLeft.data + i*cvImageLeft.cols , cvImageLeft.cols,cudaMemcpyHostToDevice);
if (cudaRet != cudaSuccess)
throw std::runtime_error("cudaMemcpyHostToDevice fail ");
// Need to define input {width height}
NppiSize oSrcSize = {cvImageLeft.cols, cvImageLeft.rows};
// Need to define input ROI {upper left x, upper left y, ROI width, ROI height}
NppiRect oSrcRectROI = {0, 0, cvImageLeft.cols, cvImageLeft.rows};
// output file is scaled in 1/2 in x and y axis
cv::Mat cvOut(cvImageLeft.rows/2,cvImageLeft.cols/2,cvImageLeft.type());
int nDstStep;
// need to alloc cuda memory for destenation
Npp8u * pDst = nppiMalloc_8u_C1(cvImageLeft.cols/2, cvImageLeft.rows/2, &nDstStep);
printf("nDstStep %d \n", nDstStep);
// Need to define output {width height}
NppiSize oDstSize = {cvOut.cols, cvOut.rows};
// Need to define output ROI {upper left x, upper left y, ROI width, ROI height}
NppiRect oDstRectROI = {0, 0, cvOut.cols, cvOut.rows};
int eInterpolation = NPPI_INTER_LINEAR;
NppStatus status;
status = nppiResize_8u_C1R(pSrc, nSrcStep, oSrcSize, oSrcRectROI,
pDst, nDstStep, oDstSize, oDstRectROI,
eInterpolation);
if(status == NPP_SUCCESS)
{
// Need to copy image from GPU to HOST Pay attention GPU memory is in power of 2 thus stride copy is required
for(int i=0; i< cvOut.rows ; i++)
cudaRet = cudaMemcpy(cvOut.data + i*cvOut.cols ,pDst + i*nDstStep,cvOut.cols,cudaMemcpyDeviceToHost);
if (cudaRet != cudaSuccess)
throw std::runtime_error("cudaMemcpyDeviceToHost fail ");
nppiFree(pDst);
nppiFree(pSrc);
cv::imwrite("resize.png", cvOut);
}
else
throw std::runtime_error("NPP NOT SUCCESS");
return 0;
}