Hi guys,
This function ran faster on CUDA 0.8 than it does on CUDA 1.0:
/**
 * Distance-weighted blur over a one-byte-per-pixel image.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per output pixel, where
 *   x = blockIdx.x * blockDim.x + threadIdx.x
 *   y = blockIdx.y * blockDim.y + threadIdx.y
 * The grid may be larger than the image; threads outside it exit at once.
 * No shared memory is used.
 *
 * For every pixel the kernel walks the (2*blur_len + 1)^2 window centred on
 * it (clipped to the image), weights each neighbour by
 *   force = 1 - pow(radius / 2, blur_force)
 * where radius is the Euclidean distance divided by blur_len, and stores the
 * weighted sum divided by the sum of the weights.
 *
 * blur_len (4) and blur_force (0.2) are fixed inside the kernel; they are
 * not launch parameters.
 *
 * @param input_data   Source pixels, indexed as input_data[row * _width + col].
 * @param output_data  Destination pixels, same indexing. Should not alias
 *                     input_data: other threads read neighbouring input
 *                     pixels while this thread writes its result.
 * @param _width       Image width in pixels (also the row stride in bytes).
 * @param _height      Image height in pixels.
 */
__global__ void BlurKernel
(
    unsigned char* input_data,
    unsigned char* output_data,
    int _width, int _height
)
{
    const int blur_len = 4;
    const double blur_force = 0.2;

    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Grids rarely divide the image evenly: without this guard the surplus
    // threads would write outside the image (and outside the buffer).
    if (x >= _width || y >= _height)
        return;

    // Clip the window to the image once instead of testing every sample.
    const int row_first = (y - blur_len > 0) ? (y - blur_len) : 0;
    const int row_last  = (y + blur_len < _height - 1) ? (y + blur_len) : (_height - 1);
    const int col_first = (x - blur_len > 0) ? (x - blur_len) : 0;
    const int col_last  = (x + blur_len < _width - 1) ? (x + blur_len) : (_width - 1);

    int res = 0;          // sum of weighted samples, each truncated to int
    double count = 0.0;   // sum of the weights

    for (int j1 = row_first; j1 <= row_last; ++j1)
    {
        // size_t keeps the row offset from overflowing int on large images.
        const unsigned char* src = input_data + (size_t)j1 * (size_t)_width;
        const int dy_sq = (j1 - y) * (j1 - y);

        for (int i1 = col_first; i1 <= col_last; ++i1)
        {
            const int dx = i1 - x;

            // 0 at the pixel itself, 1.0 at a distance of blur_len.
            const double radius = sqrt(double(dy_sq + dx * dx)) / (double)blur_len;
            const double force = 1 - pow(radius / 2, blur_force);

            res += int(src[i1] * force);
            count += force;
        }
    }

    const size_t center = (size_t)y * (size_t)_width + (size_t)x;

    // Fall back to the unmodified source pixel if no weight was accumulated.
    output_data[center] = count ? int(res / count) : input_data[center];
}