Performance issue of new filter median functions in NPP

I'm running some tests on the new median filter functions in CUDA Toolkit 6.0, and the performance of these functions is really disappointing.

Here is my code:

// Benchmark of a 5x5 median filter: NPP nppiFilterMedian_8u_C1R on the GPU
// versus cv::medianBlur on the CPU. Each variant is called once untimed and
// then 100 times inside a timed loop; elapsed milliseconds are printed.
// note: "demo.jpg" is a grayscale JPG file of 2592x1944 pixels.
	// Backslashes must be escaped inside a string literal; "\i" and "\d" are
	// not valid escape sequences and would not yield the intended path.
	cv::Mat srcMat = cv::imread("\\intel\\demo.jpg",  CV_LOAD_IMAGE_GRAYSCALE);
	if(srcMat.empty())
	{
		// imread signals failure by returning an empty matrix.
		cout << "Stop: Error loading demo.jpg" << endl;
		return;
	}
	gpu::GpuMat srcMatG(srcMat);
	gpu::GpuMat tempMatG(srcMat.size(), srcMat.type());

	// Region of interest: the whole image.
	NppiSize sz;
	sz.width = srcMatG.cols;
	sz.height = srcMatG.rows;
	// 5x5 mask with the anchor at its center.
	NppiSize mask;
	mask.width = 5;
	mask.height = 5;
	NppiPoint anchor;
	anchor.x = 2;
	anchor.y = 2;

	// Ask NPP how large a scratch buffer the filter needs for this ROI/mask.
	NppStatus status;
	Npp32u nBufferSize;
	status = nppiFilterMedianGetBufferSize_8u_C1R(sz, mask, &nBufferSize);
	if(status != NPP_NO_ERROR)
	{
		cout << "Stop: Error Calling nppiFilterMedianGetBufferSize_8u_C1R" << endl;
		return;
	}
	cout << "Required buffer size: " << nBufferSize << endl;
	// createContinuous takes int dimensions; make the narrowing explicit.
	gpu::GpuMat bufMatG = gpu::createContinuous(1, static_cast<int>(nBufferSize), CV_8UC1);

	// One untimed call first, with its status checked.
	status = nppiFilterMedian_8u_C1R(srcMatG.ptr<Npp8u>(), static_cast<int>(srcMatG.step),
	                                 tempMatG.ptr<Npp8u>(), static_cast<int>(tempMatG.step),
	                                 sz, mask, anchor, bufMatG.ptr<Npp8u>());
	if(status != NPP_NO_ERROR)
	{
		cout << "Stop: Error Calling nppiFilterMedian_8u_C1R" << endl;
		return;
	}

	cout << "Repeating 100 times with timing..." << endl;
	// steady_clock is monotonic, so the measured interval cannot be distorted
	// by wall-clock adjustments the way system_clock can.
	chrono::steady_clock::time_point tp1 = chrono::steady_clock::now();
	for(int i = 0; i < 100; i++)
	{
		status = nppiFilterMedian_8u_C1R(srcMatG.ptr<Npp8u>(), static_cast<int>(srcMatG.step),
		                                 tempMatG.ptr<Npp8u>(), static_cast<int>(tempMatG.step),
		                                 sz, mask, anchor, bufMatG.ptr<Npp8u>());
		// Do not report a timing for a run in which the filter failed.
		if(status != NPP_NO_ERROR)
		{
			cout << "Stop: Error Calling nppiFilterMedian_8u_C1R" << endl;
			return;
		}
	}
	// NOTE(review): if the NPP call returns before the GPU work has finished,
	// a device synchronization (e.g. cudaDeviceSynchronize()) is needed here
	// before taking the end timestamp - confirm against the NPP documentation.
	chrono::steady_clock::time_point tp2 = chrono::steady_clock::now();
	cout << "CUDA Calculation time: " << chrono::duration_cast<chrono::milliseconds>(tp2 - tp1).count() << " ms." << endl;

	cout << "Redo using CPU..." << endl;
	cv::Mat tempMat;
	// One untimed call first, mirroring the GPU measurement above.
	cv::medianBlur(srcMat, tempMat, 5);
	chrono::steady_clock::time_point tp3 = chrono::steady_clock::now();
	for(int i = 0; i < 100; i++)
	{
		cv::medianBlur(srcMat, tempMat, 5);
	}
	chrono::steady_clock::time_point tp4 = chrono::steady_clock::now();
	cout << "CPU Calculation time: " << chrono::duration_cast<chrono::milliseconds>(tp4 - tp3).count() << " ms." << endl;

And the results (i5-4200M, GTX 850M, 8 GB RAM, Windows 8.1, Visual Studio 2012, driver version 337.88):
Required buffer size: 53166080
Repeating 100 times with timing…
CUDA Calculation time: 15029 ms.
Redo using CPU…
CPU Calculation time: 2490 ms.

Questions:
1. Is there anything wrong with my code that hurts the performance?
2. Why do the median filter functions require such a large scratch buffer, 10x the size of the source image data?