I’m doing some tests with the new median-filter functions in CUDA Toolkit 6.0, and the performance of these functions really disappoints me.
Here is my code:
// note: "demo.jpg" is a grayscale JPG file in 2592x1944 pixels.
// Bug fix: backslashes in a C++ string literal must be escaped — the original
// "\intel\demo.jpg" contains the invalid escape sequences \i and \d, so the
// path handed to imread never named the intended file.
cv::Mat srcMat = cv::imread("\\intel\\demo.jpg", CV_LOAD_IMAGE_GRAYSCALE);
if(srcMat.empty())
{
    // imread returns an empty Mat on failure; bail out before touching the GPU.
    cout << "Stop: could not load image" << endl;
    return;
}

// Upload the source to the device and allocate a same-sized destination.
gpu::GpuMat srcMatG(srcMat), tempMatG(srcMat.size(), srcMat.type());

// Region of interest: the whole image.
NppiSize sz;
sz.width = srcMatG.cols;
sz.height = srcMatG.rows;

// 5x5 median mask anchored at its center (2,2).
NppiSize mask;
mask.width = 5;
mask.height = 5;
NppiPoint anchor;
anchor.x = 2;
anchor.y = 2;

// Ask NPP how much device scratch memory this image/mask combination needs.
NppStatus status;
Npp32u nBufferSize;
status = nppiFilterMedianGetBufferSize_8u_C1R(sz, mask, &nBufferSize);
if(status != NPP_NO_ERROR)
{
    cout << "Stop: Error Calling nppiFilterMedianGetBufferSize_8u_C1R" << endl;
    return;
}
cout << "Required buffer size: " << nBufferSize << endl;

// Scratch buffer on the device; createContinuous guarantees one contiguous
// allocation, which the NPP buffer pointer requires. Cast the Npp32u size to
// the int that createContinuous expects.
gpu::GpuMat bufMatG = gpu::createContinuous(1, static_cast<int>(nBufferSize), CV_8UC1);

// Warm-up call: the first CUDA call absorbs context/initialization cost and
// validates the parameters before the timed loop.
status = nppiFilterMedian_8u_C1R(srcMatG.ptr<Npp8u>(), static_cast<int>(srcMatG.step),
                                 tempMatG.ptr<Npp8u>(), static_cast<int>(tempMatG.step),
                                 sz, mask, anchor, bufMatG.ptr<Npp8u>());
if(status != NPP_NO_ERROR)
{
    cout << "Stop: Error Calling nppiFilterMedian_8u_C1R" << endl;
    return;
}

cout << "Repeating 100 times with timing..." << endl;
// Use steady_clock for interval timing: system_clock is wall time and may
// jump (NTP adjustments), corrupting the measured duration.
// NOTE(review): NPP calls are issued on a CUDA stream and may return to the
// host before the kernels finish — call cudaDeviceSynchronize() before
// reading the clock for exact timings; TODO confirm against the NPP docs.
chrono::steady_clock::time_point tp1 = chrono::steady_clock::now();
for(int i = 0; i < 100; i++)
{
    nppiFilterMedian_8u_C1R(srcMatG.ptr<Npp8u>(), static_cast<int>(srcMatG.step),
                            tempMatG.ptr<Npp8u>(), static_cast<int>(tempMatG.step),
                            sz, mask, anchor, bufMatG.ptr<Npp8u>());
}
chrono::steady_clock::time_point tp2 = chrono::steady_clock::now();
cout << "CUDA Calculation time: " << chrono::duration_cast<chrono::milliseconds>(tp2 - tp1).count() << " ms." << endl;

cout << "Redo using CPU..." << endl;
cv::Mat tempMat;
// Warm-up so the CPU loop is timed under the same conditions as the GPU loop
// (output buffer already allocated on subsequent iterations).
cv::medianBlur(srcMat, tempMat, 5);
chrono::steady_clock::time_point tp3 = chrono::steady_clock::now();
for(int i = 0; i < 100; i++)
{
    cv::medianBlur(srcMat, tempMat, 5);
}
chrono::steady_clock::time_point tp4 = chrono::steady_clock::now();
cout << "CPU Calculation time: " << chrono::duration_cast<chrono::milliseconds>(tp4 - tp3).count() << " ms." << endl;
And the results (i5-4200M, GTX 850M, 8 GB RAM, Win 8.1, Visual Studio 2012, driver version 337.88):
Required buffer size: 53166080
Repeating 100 times with timing…
CUDA Calculation time: 15029 ms.
Redo using CPU…
CPU Calculation time: 2490 ms.
Questions:
1. Is there anything wrong with my code that hurts the performance?
2. Why do the median-filter functions require a large scratch buffer that is 10x the size of the source image data?