there is the code written by opencl

__kernel void compute_gradients_8UC1_kernel(

const int height, const int width,

const int img_step, const int grad_quadstep, const int qangle_step,

__global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle,

const float angle_scale, const char correct_gamma, const int cnbins)

{

const int x = get_global_id(0);

const int tid = get_local_id(0);

const int gSizeX = get_local_size(0);

const int gidY = get_group_id(1);

```
__global const uchar* row = img + gidY * img_step;
//__local float sh_row[NTHREADS + 2];
__local float sh_row[128 + 2];
if (x < width)
sh_row[tid + 1] = row[x];
else
sh_row[tid + 1] = row[width - 2];
if (tid == 0)
sh_row[0] = row[(int)max(x - 1, 1)];
if (tid == gSizeX - 1)
sh_row[gSizeX + 1] = row[(int)min(x + 1, width - 2)];
barrier(CLK_LOCAL_MEM_FENCE);
if (x < width)
{
float dx;
if (correct_gamma == 1)
dx = sqrt(sh_row[tid + 2]) - sqrt(sh_row[tid]);
else
dx = sh_row[tid + 2] - sh_row[tid];
float dy = 0.f;
if (gidY > 0 && gidY < height - 1)
{
float a = (float)img[(gidY + 1) * img_step + x];
float b = (float)img[(gidY - 1) * img_step + x];
if (correct_gamma == 1)
dy = sqrt(a) - sqrt(b);
else
dy = a - b;
}
float mag = sqrt(dx * dx + dy * dy);
float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
int hidx = (int)floor(ang);
ang -= hidx;
hidx = (hidx + cnbins) % cnbins;
qangle[(gidY * qangle_step + x) << 1] = hidx;
qangle[((gidY * qangle_step + x) << 1) + 1] = (hidx + 1) % cnbins;
grad[(gidY * grad_quadstep + x) << 1] = mag * (1.f - ang);
grad[((gidY * grad_quadstep + x) << 1) + 1] = mag * ang;
}
```

}

and that is written by cuda

template <int nthreads, int correct_gamma>

**global** void compute_gradients_8UC1_kernel(int height, int width, const PtrStepb img,

float angle_scale, PtrStepf grad, PtrStepb qangle)

{

const int x = blockIdx.x * blockDim.x + threadIdx.x;

int local_x = threadIdx.x;

```
const unsigned char* row = (const unsigned char*)img.ptr(blockIdx.y);
__shared__ float sh_row[128 + 2];
if (x < width)
sh_row[local_x + 1] = row[x];
else
sh_row[local_x + 1] = row[width - 2];
if (local_x == 0)
sh_row[0] = row[::max(x - 1, 1)];
if (local_x == blockDim.x - 1)
sh_row[blockDim.x + 1] = row[::min(x + 1, width - 2)];
__syncthreads();
if (x < width)
{
float dx;
if (correct_gamma)
dx = __fsqrt_rz(sh_row[local_x + 2]) - __fsqrt_rz(sh_row[local_x]);
else
dx = sh_row[local_x + 2] - sh_row[local_x];
float dy = 0.f;
if (blockIdx.y > 0 && blockIdx.y < height - 1)
{
float a = ((const unsigned char*)img.ptr(blockIdx.y + 1))[x];
float b = ((const unsigned char*)img.ptr(blockIdx.y - 1))[x];
if (correct_gamma)
dy = __fsqrt_rz(a) - __fsqrt_rz(b);
else
dy = a - b;
}
float mag = __fsqrt_rz(dx * dx + dy * dy);
float ang = (atan2f(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
int hidx = (int)floorf(ang);
ang -= hidx;
hidx = (hidx + cnbins) % cnbins;
((uchar2*)qangle.ptr(blockIdx.y))[x] = make_uchar2(hidx, (hidx + 1) % cnbins);
((float2*) grad.ptr(blockIdx.y))[x] = make_float2(mag * (1.f - ang), mag * ang);
}
}
```

: use a picture (768 * 576) test them

the opencl used only 2.82ms, but the cuda used about 8 ms, why?