I have written a small piece of code using NPP and OpenCV 3.1 on a GTX 745 platform. It works with an image size of 800x600 but does not work with other image sizes.
Is there anything I missed?
I do not understand why it behaves like this.
Does NPP have any restriction on image size?
Below is the code I have written.
#include <iostream>
#include <cuda_runtime.h>
#include <npp.h>
#include <helper_string.h>
#include <helper_cuda.h>
#include "opencv2/core/version.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"
//#include "ImagesCPU.h"
//#include <ImagesNPP.h>
#include <chrono>
// Wraps a CUDA runtime call and reports any failure together with the call
// site. The do { } while (0) wrapper makes the macro behave as a single
// statement, so it is safe inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
 *
 * @param code  status returned by the CUDA runtime call
 * @param file  source file of the call site (normally __FILE__)
 * @param line  source line of the call site (normally __LINE__)
 * @param abort when true (the default) the process exits with `code` as its
 *              exit status after the message has been printed
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
int main(int argc, const char *argv[])
{
cv::Mat srcImg, dstImg, grayImg;
double maxVal=0, minVal=0;
cv::Mat blur;
//load image using opencv
srcImg = cv::imread(argv[1]);
if (srcImg.channels() == 3) {
cvtColor(srcImg, grayImg, CV_RGB2GRAY);
}
else {
grayImg = srcImg;
}
cv::Mat_<float> grayImg32 = cv::Mat::zeros(cvSize(grayImg.cols, grayImg.rows), CV_32FC1);
grayImg.convertTo(grayImg32, CV_32F);
NppStatus eStatusNPP;
int pitch = grayImg32.step;
std::cout << "step width height : " <<grayImg32.step <<" "<<grayImg32.cols <<" "<<grayImg32.rows <<std::endl;
std::cout << "step width height : " <<grayImg.step <<" "<<grayImg.cols <<" "<< grayImg.rows <<std::endl;
NppiSize sizeRoi = {grayImg.cols-5+1, grayImg.rows-5+1};
NppiSize size = {grayImg.cols, grayImg.rows};
float *dstCuda, *srcCuda;
auto start = std::chrono::high_resolution_clock::now();
gpuErrchk(cudaMalloc((void**)&srcCuda, grayImg.cols*grayImg.rows*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&dstCuda, grayImg.cols*grayImg.rows*sizeof(float)));
auto end = std::chrono::high_resolution_clock::now();
std::cout<< " cuda malloc : "<< std::chrono::duration_cast<std::chrono::microseconds>(end-start).count() <<std::endl;
start = std::chrono::high_resolution_clock::now();
//copy data to device
gpuErrchk(cudaMemcpy(srcCuda, (float*)grayImg32.data, grayImg.cols*grayImg.rows*sizeof(float), cudaMemcpyHostToDevice));
end = std::chrono::high_resolution_clock::now();
std::cout << " memcpy H2D : "<< std::chrono::duration_cast<std::chrono::microseconds>(end-start).count() <<std::endl;
start = std::chrono::high_resolution_clock::now();
eStatusNPP = nppiFilterGauss_32f_C1R(srcCuda, pitch, dstCuda, pitch, sizeRoi, NPP_MASK_SIZE_5_X_5);
if(eStatusNPP != NPP_SUCCESS)
std::cout << " err: nppiFilterGauss_32f_C1R : " << eStatusNPP << std::endl;
//gpuErrchk(cudaMemcpy(dstCuda, srcCuda, grayImg.cols*grayImg.rows*sizeof(float), cudaMemcpyDeviceToDevice));
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaPeekAtLastError());
end = std::chrono::high_resolution_clock::now();
std::cout<< " gauss : "<< std::chrono::duration_cast<std::chrono::microseconds>(end-start).count() <<std::endl;
//save dst image
cv::Mat_<float> dstImg32 = cv::Mat::zeros(cvSize(grayImg.cols, grayImg.rows), CV_32FC1);
//copy data back to host
start = std::chrono::high_resolution_clock::now();
gpuErrchk(cudaMemcpy((float *)dstImg32.data, dstCuda, grayImg.cols*grayImg.rows*sizeof(float), cudaMemcpyDeviceToHost));
end = std::chrono::high_resolution_clock::now();
std::cout<< " memcpy D2H : "<< std::chrono::duration_cast<std::chrono::microseconds>(end-start).count() <<std::endl;
//display
minMaxLoc(dstImg32, &minVal, &maxVal);
std::cout <<"min max val: " << minVal <<" "<< maxVal << std::endl;
dstImg32.convertTo(blur, CV_8U, 255.0/(maxVal - minVal), -minVal * 255.0/(maxVal - minVal));
cv::imshow("opencv_npp", blur);
cv::waitKey(0);
return 0;
}