Slow working

Hi Guys,

This function ran faster on CUDA 0.8 than it does on CUDA 1.0:

// Distance-weighted blur of an 8-bit, single-channel image.
//
// Each thread computes one output pixel (x, y) from a 2D launch: x comes from
// the grid/block x dimension and y from the y dimension. The grid may overhang
// the image; threads that fall outside it exit without touching memory.
//
// For every in-image neighbour within +/- blur_len pixels in each direction,
// the neighbour is weighted by
//     force = 1 - pow(radius / 2, blur_force)
// where radius is the Euclidean distance to (x, y) divided by blur_len
// (0 at the pixel itself, 1.0 at a distance of blur_len). The output is the
// sum of the truncated weighted samples divided by the sum of the weights.
//
// Parameters:
//   input_data   source pixels, indexed as input_data[row * _width + col]
//   output_data  destination pixels, same indexing as input_data
//   _width       number of pixels per row
//   _height      number of rows
//
// blur_len and blur_force are fixed inside the kernel; the commented-out
// parameters in the signature show where they were meant to be passed in.
__global__ void BlurKernel
                          (
        unsigned char* input_data,
        unsigned char* output_data,
        int _width, int _height/*,
        int blur_len = 32,
        double blur_force = 0.2*/
        )
{
    const int blur_len = 4;
    const double blur_force = 0.2;

    const int x = blockIdx.x*blockDim.x + threadIdx.x;
    const int y = blockIdx.y*blockDim.y + threadIdx.y;

    // Threads outside the image must not write: for x >= _width the store
    // below would land in another row, for y >= _height past the last row.
    if (x >= _width || y >= _height)
        return;

    // Clamp the window to the image once instead of range-testing every
    // neighbour inside the loops; the set of visited pixels is the same.
    const int row_first = (y - blur_len < 0) ? 0 : y - blur_len;
    const int row_last  = (y + blur_len > _height - 1) ? _height - 1 : y + blur_len;
    const int col_first = (x - blur_len < 0) ? 0 : x - blur_len;
    const int col_last  = (x + blur_len > _width - 1) ? _width - 1 : x + blur_len;

    int res = 0;          // sum of truncated weighted samples
    double count = 0.0;   // sum of weights

    for (int j1 = row_first; j1 <= row_last; ++j1)
    {
        const unsigned char* src = input_data + j1*_width;
        const int dy_sq = (j1 - y)*(j1 - y);   // constant for the whole row

        for (int i1 = col_first; i1 <= col_last; ++i1)
        {
            const int dx = i1 - x;

            // 0 at the current pixel, 1.0 at a distance of blur_len
            const double radius = sqrt(double(dy_sq + dx*dx)) / (double)blur_len;
            const double force = 1 - pow(radius/2, blur_force);

            // Each sample is truncated to int before accumulation.
            res += int(src[i1] * force);
            count += force;
        }
    }

    // The centre pixel always contributes force == 1, so count is non-zero
    // here; the fallback to the unmodified input pixel is kept as a safeguard.
    output_data[y*_width + x] = count ? int(res/count) : input_data[y*_width + x];
}

How long does the kernel take in v1.0 and how long did it take in v0.8?

Paulius

0.8 - 47ms
1.0 - 63ms