It kernel executing 286msec on GPU and 15msec on CPU. Why?
__global__ void TrackingMultiKernel
(
CUDA_Image* image1, // Массив изображений
CUDA_Image* image2, // Массив изображений
unsigned char* DataForImage1,
unsigned char* DataForImage2,
int* offs_x1,
int* offs_y1,
int* offs_x2,
int* offs_y2,
CUDA_Grid* grid,
char* DataForGrid,
int MasSize,
int iterations_number
)
{
int i = blockIdx.x + threadIdx.x;
int j = blockIdx.y + threadIdx.y;
int SizeDataForImage = image1[0].width * image1[0].height * sizeof(unsigned char);
int SizeDataForGrid = (grid[0].grid_width * grid[0].grid_height) * sizeof(CUDA_Block);
for( int k = 0; k < MasSize; k++ )
{
float disp_factor = 2.0;
float spring_force = 0.0;
float disp_threshold = 0.2;
float min_quality = 0.5;
unsigned char* Img1 = (DataForImage1+k*SizeDataForImage);
unsigned char* Img2 = (DataForImage2+k*SizeDataForImage);
CUDA_Block* Gr1 = (CUDA_Block*)(DataForGrid+k*SizeDataForGrid);
int g_width = grid[k].grid_width;
int g_height = grid[k].grid_height;
int g_stepx = grid[k].grid_stepx;
int g_stepy = grid[k].grid_stepy;
int b_width = grid[k].block_width;
int b_height = grid[k].block_height;
int lPitch1 = image1[k].width;
int lPitch2 = image2[k].height;
typedef const unsigned char *_image_ptr;
_image_ptr plane1 = Img1;
_image_ptr plane2 = Img2;
int i1_width = image1[k].width;
int i1_height = image1[k].height;
int i2_width = image2[k].width;
int i2_height = image2[k].height;
CUDA_Block *block = &(Gr1[i + j*g_width]);
block->deltaX = block->deltaY = 0;
block->valid = true;
int xs1 = offs_x1[k] + i * g_stepx, ys1 = offs_y1[k] + j * g_stepy;
float xs2 = offs_x2[k] + i * g_stepx + block->offsX;
float ys2 = offs_y2[k] + j * g_stepy + block->offsY;
float x = xs2, y = ys2;
Gr1[0].Test = 1;
// Если у нас блок выходит на пределы хотябы одной картинки то пропускаем блок
if
(
x < 0 || y < 0 ||
xs1 + b_width > i1_width ||
ys1 + b_height > i1_height ||
x + b_width > i2_width ||
y + b_height > i2_height
)
{
block->valid = false; //continue;
return;
}
float diff_sum = 0;
int iter;
for( iter = 0; iter < 20; ++iter ) //iterations_number
{
float gxx, gyy, gxy, ex, ey;
gxx = gyy = gxy = ex = ey = 0;
int xmax = b_width;
int ymax = b_height;
if( x < 0 || x + xmax > i2_width || y < 0 || y + ymax > i2_height )
{
block->valid = false;
break;
}
// берем только остаток по координатам
float d_rx = x - (int)x;
float d_ry = y - (int)y;
// операция x_ = 1.0 - x
float d_rx_ = 1.0 - d_rx;
float d_ry_ = 1.0 - d_ry;
diff_sum = 0;
for( int iy = 0; iy < ymax; ++iy )
{
_image_ptr op = &plane2[((int)y + iy)*lPitch1 + (int)x];
_image_ptr np = &plane1[(ys1 + iy)*lPitch2 + xs1];
for( short ix = 0; ix < xmax; ++ix )
{
float a00 = (*op * d_rx_ * d_ry_ +
*(op + 1) * d_rx * d_ry_ +
*(op + lPitch1) * d_rx_ * d_ry +
*(op + lPitch1 + 1) * d_rx * d_ry);
op += lPitch1;
float a10 = (*op * d_rx_ * d_ry_ +
*(op + 1) * d_rx * d_ry_ +
*(op + lPitch1) * d_rx_ * d_ry +
*(op + lPitch1 + 1) * d_rx * d_ry);
op += 1 - lPitch1;
float a01 = (*op * d_rx_ * d_ry_ +
*(op + 1) * d_rx * d_ry_ +
*(op + lPitch1) * d_rx_ * d_ry +
*(op + lPitch1 + 1) * d_rx * d_ry);
float gx = a01 - a00 + (*(np + 1) - *np);
float gy = a10 - a00 + (*(np + lPitch2) - *np);
float diff = *np - a00;
gxx += gx*gx;
gxy += gx*gy;
gyy += gy*gy;
ex += diff*gx;
ey += diff*gy;
diff_sum += fabs(diff);
++np;
} // end for ix
} // end for iy
// solve equation
float det = gxx*gyy - gxy*gxy;
float ev = sqrt((gxx + gyy)/2.0 - (sqrt((gxx - gyy)*(gxx - gyy) + 4*gxy*gxy)/2.0));
block->quality = ev;// / float(xmax*ymax);
if( block->quality < min_quality ) break;
float dx = (disp_factor*(gyy*ex - gxy*ey)) / det;
float dy = (disp_factor*(gxx*ey - gxy*ex)) / det;
float spring_x = (xs2 - x)*spring_force;
float spring_y = (ys2 - y)*spring_force;
x += (dx + spring_x);
y += (dy + spring_y);
if( fabs(dx) + fabs(dy) < disp_threshold ) break;
} // end for iter
if( block->valid )
{
block->deltaX = x - xs2 + block->offsX;
block->deltaY = y - ys2 + block->offsY;
block->diff = pow( diff_sum/(min_quality+block->quality), (float)0.3);
block->iters = iter;
}
} // end for k
}