kernel sample

It kernel executing 286msec on GPU and 15msec on CPU. Why?

__global__ void TrackingMultiKernel

        	( 

        	CUDA_Image* image1,  // Массив изображений

        	CUDA_Image* image2,  // Массив изображений

        	unsigned char* DataForImage1, 

        	unsigned char* DataForImage2, 

            int* offs_x1, 

            int* offs_y1,

            int* offs_x2, 

               int* offs_y2,

            CUDA_Grid* grid,

        	char* DataForGrid,

        	int MasSize,

        	int iterations_number

               ) 

  {

int i = blockIdx.x + threadIdx.x;

    int j = blockIdx.y + threadIdx.y;

	int SizeDataForImage = image1[0].width * image1[0].height * sizeof(unsigned char); 

	int SizeDataForGrid   = (grid[0].grid_width * grid[0].grid_height) * sizeof(CUDA_Block); 

	for( int k = 0; k < MasSize; k++ )

  {

  float disp_factor = 2.0;

  float spring_force = 0.0;

  float disp_threshold = 0.2;

  float min_quality = 0.5;

 unsigned char* Img1 = (DataForImage1+k*SizeDataForImage);

  unsigned char* Img2 = (DataForImage2+k*SizeDataForImage);

  CUDA_Block* Gr1 =	(CUDA_Block*)(DataForGrid+k*SizeDataForGrid);

  

  int g_width     = grid[k].grid_width;

  int g_height    = grid[k].grid_height;

  int g_stepx     = grid[k].grid_stepx;

  int g_stepy     = grid[k].grid_stepy;

 int b_width     = grid[k].block_width;

  int b_height    = grid[k].block_height;

 int lPitch1 = image1[k].width;

  int lPitch2 = image2[k].height;

 typedef const unsigned char *_image_ptr;

 _image_ptr plane1 = Img1;

  _image_ptr plane2 = Img2;

 int i1_width    = image1[k].width;

  int i1_height   = image1[k].height;

  int i2_width    = image2[k].width;

  int i2_height   = image2[k].height;

 CUDA_Block *block = &(Gr1[i + j*g_width]);

  

  block->deltaX = block->deltaY = 0;

  block->valid = true;

 int xs1 = offs_x1[k] + i * g_stepx, ys1 = offs_y1[k] + j * g_stepy;

  float xs2 = offs_x2[k] + i * g_stepx + block->offsX;

  float ys2 = offs_y2[k] + j * g_stepy + block->offsY;

  float x = xs2, y = ys2;

 Gr1[0].Test = 1;

 // Если у нас блок выходит на пределы хотябы одной картинки то пропускаем блок

  if

  	( 

  	x < 0 || y < 0 ||

  	xs1 + b_width  > i1_width  || 

  	ys1 + b_height > i1_height ||

  	x   + b_width  > i2_width  || 

  	y   + b_height > i2_height 

  	)

  	{

  	block->valid = false; //continue;

  	return;

  	}

 float diff_sum = 0;

  int iter;

 for( iter = 0; iter < 20; ++iter ) //iterations_number

  	{

  	float gxx, gyy, gxy, ex, ey;

  	gxx = gyy = gxy = ex = ey = 0;

 	int xmax = b_width;

  	int ymax = b_height;

                

                

  	if( x < 0 || x + xmax > i2_width || y < 0 || y + ymax > i2_height ) 

    { 

    block->valid = false; 

    break; 

                }

 	// берем только остаток по координатам

  	float d_rx = x - (int)x;

  	float d_ry = y - (int)y;

 	// операция x_ = 1.0 - x

  	float d_rx_ = 1.0 - d_rx;

  	float d_ry_ = 1.0 - d_ry;

 	diff_sum = 0;

 	for( int iy = 0; iy < ymax; ++iy ) 

    {

    _image_ptr op = &plane2[((int)y  + iy)*lPitch1 + (int)x]; 

    _image_ptr np = &plane1[(ys1 + iy)*lPitch2 + xs1];

   for( short ix = 0; ix < xmax; ++ix ) 

    	{

    	float a00 = (*op * d_rx_ * d_ry_ + 

           *(op + 1) * d_rx * d_ry_ + 

        *(op + lPitch1) * d_rx_ * d_ry + 

        *(op + lPitch1 + 1) * d_rx * d_ry);

    	op += lPitch1;

    	float a10 = (*op * d_rx_ * d_ry_ + 

           *(op + 1) * d_rx * d_ry_ + 

        *(op + lPitch1) * d_rx_ * d_ry + 

        *(op + lPitch1 + 1) * d_rx * d_ry);

        op += 1 - lPitch1;

    	float a01 = (*op * d_rx_ * d_ry_ + 

           *(op + 1) * d_rx * d_ry_ + 

        *(op + lPitch1) * d_rx_ * d_ry + 

        *(op + lPitch1 + 1) * d_rx * d_ry);

    	float gx = a01 - a00 + (*(np + 1) - *np);

    	float gy = a10 - a00 + (*(np + lPitch2) - *np);

    	float diff = *np - a00;

    	gxx += gx*gx;

    	gxy += gx*gy;

    	gyy += gy*gy;

    	ex += diff*gx;

    	ey += diff*gy;

    	diff_sum += fabs(diff);

    	++np;

    	} // end for ix

    } // end for iy

       

  	// solve equation

  	float  det = gxx*gyy - gxy*gxy;

 	float ev = sqrt((gxx + gyy)/2.0 - (sqrt((gxx - gyy)*(gxx - gyy) + 4*gxy*gxy)/2.0));

 	block->quality = ev;// / float(xmax*ymax);

  	if( block->quality < min_quality ) break;

 	float dx = (disp_factor*(gyy*ex - gxy*ey)) / det;

  	float dy = (disp_factor*(gxx*ey - gxy*ex)) / det;

  	float spring_x = (xs2 - x)*spring_force;

  	float spring_y = (ys2 - y)*spring_force;

 	x += (dx + spring_x);

  	y += (dy + spring_y);

                   

  	if( fabs(dx) + fabs(dy) < disp_threshold ) break;

  	} // end for iter

 if( block->valid ) 

  	{

  	block->deltaX  = x - xs2 + block->offsX;

  	block->deltaY  = y - ys2 + block->offsY;

  	block->diff = pow( diff_sum/(min_quality+block->quality), (float)0.3);

  	block->iters = iter;

  	}

  } // end for k

}