PERFORMANCE ISSUES!

Hello everyone. I have implemented the anisotropic diffusion algorithm, which is an image-processing algorithm, in OpenCL. This is my first major venture into developing OpenCL code.

I am having some major performance issues. The same code in CUDA, without the use of texture memory or any other optimizations, gave at least a 10x speedup over the software version. I am not able to figure out where the serialization is occurring. Can anyone please help me? The kernel code is given in the snippet below.

//kernel code for second OpenCL

//#include<stdlib.h>

//#include<math.h>

 void find_min(float * a, float * b,int i,int j){

float temp = * a;

if( * a > * b ){

                * a = * b ;

                * b = temp ;

        }

}

void find_min3(float *a, float *b, float *c, int i, int j){

find_min(a,b,i,j);

        find_min(a,c,i,j);

}

void find_minx3(float *a, float *b, float *c, int i, int j){

find_min(b,c,i,j);

        find_min(a,c,i,j);

}

void find_min6(float *a , float *b, float *c, float *d, float *e,float *f,int i, int j){

find_min(a,d,i,j);

        find_min(b,e,i,j);

        find_min(c,f,i,j);

        find_min3(a,b,c,i,j);

        find_minx3(d,e,f,i,j);

}

void find_min5(float *a, float *b, float *c, float *d, float *e,int i, int j){

find_min(a,b,i,j);

        find_min(c,d,i,j);

        find_min3(a,c,e,i,j);

        find_minx3(b,d,e,i,j);

}

void find_min4(float *a, float *b, float *c, float *d, int i, int j)

{

find_min(a,b,i,j);

        find_min(c,d,i,j);

        find_min(a,c,i,j);

        find_min(b,d,i,j);

}

void find_min_final(float *a, float *b, float *c,int i, int j){

find_minx3(a,b,c,i,j);

        find_min(a,b,i,j);

}

__kernel void MedianFilter_Sgn_Kernel (__global const float *img_proc_PDE, __global float *img_sgn, __const int threshold,__const int width,__const int height )

{ 

int thresh = threshold; 

int i = get_global_id(0);

 // int j = get_global_id(1);

int j;

  float v[9]= {0};

  float temp_center_pixel;

  if(i < ( height - 1 ) && i>0)

	//if(j<(width - 1) && j>0)

	for(j = 1; j<width-1; j++ )

{

temp_center_pixel = img_proc_PDE[ i * width + j ];

  v[0]= img_proc_PDE[ ( i - 1 ) * width + ( j - 1 ) ];

  v[1]= img_proc_PDE[ ( i - 1 ) * width + j ];

  v[2]= img_proc_PDE[ ( i - 1 ) * width + ( j + 1 ) ];

  v[3]= img_proc_PDE[ i * width + ( j - 1 ) ];

  v[4]= img_proc_PDE[ i * width + j ];

  v[5]= img_proc_PDE[ i * width + ( j + 1 ) ];

find_min6(&v[0],&v[1],&v[2],&v[3],&v[4],&v[5],i,j);

v[5]= img_proc_PDE[ ( i + 1 ) * width + ( j - 1 ) ];

find_min5(&v[1],&v[2],&v[3],&v[4],&v[5],i,j);

v[5]= img_proc_PDE[ ( i + 1 ) * width + j ];

find_min4(&v[2],&v[3],&v[4],&v[5],i,j);

v[5]= img_proc_PDE[ ( i + 1 ) * width + ( j + 1 ) ];

find_min_final(&v[3],&v[4],&v[5],i,j);

img_sgn[ i * width + j ] = (fabs( v[4]-temp_center_pixel ) > thresh) * 1+0; //If estimated noise, set it 1

}

	//barrier(CLK_LOCAL_MEM_FENCE);

}//end of medianfilter_sgn

__kernel void FilterPDE_Sgn_Kernel(__global float *img_proc_PDE,__global float *img_sgn, __global float *img_temp, int width,int height)

{

int i= get_global_id(0);

 // int j= get_global_id(1);

int j;

  float step= 0.25;

  float grad_N= 0, grad_S= 0, grad_E= 0, grad_W= 0;

  float Coeff_N, Coeff_S, Coeff_E, Coeff_W;

  int k=100.0;

// if(i < height - 1 && j < width - 1) {

  if(i < height-1 && i > 0)

	for(j=1; j < width; j++)

{

img_temp[ i * width + j ] = img_proc_PDE[ i * width + j ];

if((i > 0 ) & (img_sgn[( i - 1 ) * width + j ] == 0))

	grad_N = img_proc_PDE[ ( i - 1 ) * width + j ] - img_proc_PDE[ i * width + j ];

  else

	grad_N = 0;

if(j < (width - 1) & img_sgn[ i * width + ( j + 1 )] == 0)

	grad_E = img_proc_PDE[ i * width + ( j + 1 )] - img_proc_PDE[ i * width + j ];

  else

	grad_E = 0;

if((i < (height - 1)) & img_sgn[( i + 1) *  width + j ] == 0)

	grad_S = img_proc_PDE[( i + 1 ) * width + j ] - img_proc_PDE[ i * width + j ];

  else

	grad_S = 0;

if((j > 0) & img_sgn[ i * width + ( j - 1 )] == 0)

	grad_W = img_proc_PDE[i * width + ( j - 1 )] - img_proc_PDE[i * width + j];

  else

	grad_W = 0;

	

	Coeff_N = ( k * k ) / ( k * k + grad_N * grad_N );

	Coeff_S = ( k * k ) / ( k * k + grad_S * grad_S );

	Coeff_E = ( k * k ) / ( k * k + grad_E * grad_E );

	Coeff_W = ( k * k ) / ( k * k + grad_W * grad_W );

	img_temp[i * width + j] = img_temp[i * width + j] + img_sgn[i * width + j] * step * (Coeff_N * grad_N+Coeff_S * grad_S+ Coeff_E * grad_E+ Coeff_W * grad_W);

	img_proc_PDE[i * width + j]= img_temp[i * width + j]; 

}

}//end filter PDE

In the main C code, the kernels are invoked in a loop 30*50 times.

The memory transfer overhead is minimal. Can someone also please give me some ideas as to how to improve the performance? One is the use of image/texture memory, which is my next step. But what else is there other than that?