matchTemplate use OpenCV matchTemplate with GPU

I am comparing the matchTemplate function (OpenCV 2.2) running on a regular CPU and on a GPU (a small NVIDIA card with 8 cores).
My test is as follows:


// Load the test image. imread is the newer cvLoadImage alternative with a
// MATLAB-style name; flag 0 requests a single-channel (grayscale) image.
// The backslashes of the Windows path are escaped so that sequences such as
// "\1" are not interpreted as escape codes, and plain ASCII quotes are used.
Mat m_Img = imread("E:\\Projs\\Tests\\T3\\1.bmp", 0);

int H = m_Img.rows;  // image height in pixels
int W = m_Img.cols;  // image width in pixels
int Tmpl = 25;       // side length of the square template, in pixels

// Centre of the image. Rect takes (x, y, width, height), so the x coordinate
// must come from the column count and the y coordinate from the row count.
int cx = W / 2;
int cy = H / 2;

// Device copy of the whole image (constructing a GpuMat from a Mat uploads it).
gpu::GpuMat d_src(m_Img);

// Make a Tmpl x Tmpl rectangle whose top-left corner is the image centre
Rect roi1(cx, cy, Tmpl, Tmpl);
// Point a cv::Mat header at it (no allocation is done)
Mat m_Tmplt = m_Img(roi1);

Mat m_result;  // receives the CPU matching scores
Mat m_RowImg;  // header for the horizontal image strip being searched

// Half the template height; used to centre each strip on its target row.
int Offset = Tmpl / 2;

// Device copy of the template, plus device-side result and strip buffers.
gpu::GpuMat d_templ(m_Tmplt), d_result, d_srcROI;

// Benchmark body: matches the template against horizontal image strips, first
// with the CPU matchTemplate and then with gpu::matchTemplate, and finally
// dumps some result values.
// NOTE(review): none of the loops below has a visible closing brace, so the
// exact nesting (what runs inside the 5x repetition loop, and where each inner
// loop ends) cannot be determined from this snippet - confirm against the
// original code.
for (int j=0; j<5; j++) {

// CPU pass over strips centred on rows 200..209.
// NOTE(review): `i` is not declared anywhere in the visible code.
for(i=200; i<210; i++) {
// Single-row result, W - 25 + 1 columns wide (the literal 25 equals Tmpl).
m_result.create(1, W - 25 + 1, CV_32F);

   // Full-width strip, Tmpl rows high, starting at row i-Offset.
   Rect roi1(0, i-Offset, W, Tmpl);
   // Header onto the strip inside m_Img.
   m_RowImg = m_Img(roi1);

   // CPU template matching (CV_TM_CCORR_NORMED method).
   matchTemplate( m_RowImg, m_Tmplt,
                  m_result, CV_TM_CCORR_NORMED );


// GPU pass over the same row range.
for(i=200; i<210; i++) {
d_result.create(1, W - 25 + 1, CV_32F);
// NOTE(review): this roi1 is never used, because the d_src(roi1) alternative
// on the next line is commented out.
Rect roi1(0, i-Offset, W, Tmpl);
// Builds a new GpuMat from the host strip m_RowImg on every iteration.
// NOTE(review): m_RowImg is not reassigned in this loop, so every iteration
// presumably processes the last strip left by the CPU loop; constructing a
// GpuMat from a Mat is presumably a host-to-device upload that is then
// included in the GPU timing - confirm against the OpenCV gpu documentation.
d_srcROI = gpu::GpuMat(m_RowImg);//d_src(roi1);
gpu::matchTemplate(d_srcROI, d_templ, d_result, CV_TM_CCORR_NORMED);


 // Host-side Mat obtained from the device result (presumably a download).
 Mat res = d_result;
 // Dump entries 1000..1399 for comparison.
 // NOTE(review): the two `(double)[k]` operands are garbled - the indexed
 // objects are missing (presumably the CPU and GPU result values), and
 // SUBTEST is not defined in the visible code.
 for(int k=1000; k<1400; k++) 
   SUBTEST << k << " " << (double)[k] << " " << (double)[k]<<'\n';


Surprisingly, the GPU loop runs 10 times slower than the CPU one.
When I run the test on the full image, without the ROI, the GPU is 3 times faster, as expected.
Does anybody know anything about such a problem?
Is it a problem with the ROI definition?