Hello everyone, I have developed code for an anisotropic diffusion algorithm (an image-processing algorithm) in OpenCL. This is my first major venture into developing OpenCL code.
I am having some major performance issues. The same code in CUDA, without the use of texture memory or any other optimizations, gave at least a 10x speedup over the software version. I am not able to figure out where the serialization is occurring. Can anyone please help me? The kernel code is given in the snippet below.
//kernel code for second OpenCL
//#include<stdlib.h>
//#include<math.h>
/*
 * Compare-exchange of two floats: on return *a holds the smaller of the two
 * values and *b the larger. Values that are already ordered (or equal) are
 * left untouched.
 *
 * a, b - the two values to order in place.
 * i, j - unused by this function; kept so existing call sites stay valid.
 */
void find_min(float *a, float *b, int i, int j)
{
    const float first = *a;
    const float second = *b;

    /* Swap only when the pair is strictly out of order. */
    if (first > second) {
        *a = second;
        *b = first;
    }
}
/*
 * Moves the smallest of *a, *b and *c into *a by compare-exchanging *a
 * against each of the other two in turn. *b and *c end up holding the two
 * remaining values in no guaranteed order.
 *
 * i, j are only forwarded to find_min.
 */
void find_min3(float *a, float *b, float *c, int i, int j)
{
    find_min(a, b, i, j);   /* *a <= *b */
    find_min(a, c, i, j);   /* *a is now the minimum of all three */
}
/*
 * Moves the largest of *a, *b and *c into *c. *a and *b end up holding the
 * two remaining values in no guaranteed order.
 *
 * i, j are only forwarded to find_min.
 */
void find_minx3(float *a, float *b, float *c, int i, int j)
{
    find_min(b, c, i, j);   /* *c >= *b */
    find_min(a, c, i, j);   /* *c is now the maximum of all three */
}
/*
 * Partial sort of six values: on return *a holds the minimum of the six and
 * *f the maximum. The four values in between are not fully ordered.
 *
 * i, j are only forwarded to the helpers.
 */
void find_min6(float *a, float *b, float *c, float *d, float *e, float *f,
               int i, int j)
{
    /* Order the three column pairs so that (a,b,c) hold the smaller halves
     * and (d,e,f) the larger halves. */
    find_min(a, d, i, j);
    find_min(b, e, i, j);
    find_min(c, f, i, j);

    /* The overall minimum is among the smaller halves ... */
    find_min3(a, b, c, i, j);
    /* ... and the overall maximum among the larger halves. */
    find_minx3(d, e, f, i, j);
}
/*
 * Partial sort of five values: on return *a holds the minimum of the five
 * and *e the maximum. The three values in between are not fully ordered.
 *
 * i, j are only forwarded to the helpers.
 */
void find_min5(float *a, float *b, float *c, float *d, float *e, int i, int j)
{
    /* Order the two pairs: *a <= *b and *c <= *d. */
    find_min(a, b, i, j);
    find_min(c, d, i, j);

    /* Minimum of the pair-minima and the odd element goes to *a. */
    find_min3(a, c, e, i, j);
    /* Maximum of the pair-maxima and whatever is left in *e goes to *e. */
    find_minx3(b, d, e, i, j);
}
/*
 * Partial sort of four values: on return *a holds the minimum of the four
 * and *d the maximum. *b and *c hold the two middle values in no guaranteed
 * order.
 *
 * i, j are only forwarded to find_min.
 */
void find_min4(float *a, float *b, float *c, float *d, int i, int j)
{
    /* Order the two pairs: *a <= *b and *c <= *d. */
    find_min(a, b, i, j);
    find_min(c, d, i, j);

    find_min(a, c, i, j);   /* smaller of the two pair-minima -> *a */
    find_min(b, d, i, j);   /* larger of the two pair-maxima  -> *d */
}
/*
 * Fully sorts three values in place: on return *a <= *b <= *c, so *b is the
 * median of the three.
 *
 * i, j are only forwarded to the helpers.
 */
void find_min_final(float *a, float *b, float *c, int i, int j)
{
    find_minx3(a, b, c, i, j);  /* maximum -> *c */
    find_min(a, b, i, j);       /* minimum -> *a, leaving the median in *b */
}
/*
 * Noise-detection pass: for every interior pixel of the row-major
 * width x height image img_proc_PDE, computes the median of the pixel's 3x3
 * neighbourhood and stores 1 in img_sgn when |median - pixel| > threshold,
 * 0 otherwise. Border rows and columns of img_sgn are not written.
 *
 * Work decomposition:
 *   - get_global_id(0) selects the image row.
 *   - Columns are visited with a stride loop over NDRange dimension 1.
 *     With a one-dimensional launch get_global_id(1) is 0 and
 *     get_global_size(1) is 1, so each work-item walks its entire row
 *     exactly as before. Launching with a second dimension splits the
 *     columns of a row across work-items instead of serialising them.
 *
 * img_proc_PDE - input image, read only.
 * img_sgn      - output mask (1 = estimated noise, 0 = clean).
 * threshold    - absolute difference above which a pixel counts as noise.
 * width/height - image dimensions in pixels.
 */
__kernel void MedianFilter_Sgn_Kernel (__global const float *img_proc_PDE, __global float *img_sgn, __const int threshold,__const int width,__const int height )
{
    const int i = get_global_id(0);

    /* The first and last rows have no complete 3x3 neighbourhood. */
    if (i <= 0 || i >= height - 1) {
        return;
    }

    const float thresh = (float)threshold;
    const int j_first = 1 + (int)get_global_id(1);
    const int j_step = (int)get_global_size(1);

    for (int j = j_first; j < width - 1; j += j_step) {
        const int up = (i - 1) * width + j;
        const int mid = i * width + j;
        const int down = (i + 1) * width + j;

        /* Load the centre once; it is both a median candidate and the
         * value the median is compared against. */
        const float center = img_proc_PDE[mid];

        /* Six working slots are enough: after each partial sort the
         * extremes are discarded and the next pixel is loaded into v[5]. */
        float v[6];
        v[0] = img_proc_PDE[up - 1];
        v[1] = img_proc_PDE[up];
        v[2] = img_proc_PDE[up + 1];
        v[3] = img_proc_PDE[mid - 1];
        v[4] = center;
        v[5] = img_proc_PDE[mid + 1];
        find_min6(&v[0], &v[1], &v[2], &v[3], &v[4], &v[5], i, j);

        v[5] = img_proc_PDE[down - 1];
        find_min5(&v[1], &v[2], &v[3], &v[4], &v[5], i, j);

        v[5] = img_proc_PDE[down];
        find_min4(&v[2], &v[3], &v[4], &v[5], i, j);

        v[5] = img_proc_PDE[down + 1];
        find_min_final(&v[3], &v[4], &v[5], i, j);

        /* v[4] now holds the median of the nine pixels. */
        img_sgn[mid] = (fabs(v[4] - center) > thresh) ? 1.0f : 0.0f;
    }
}//end of medianfilter_sgn
/*
 * One diffusion step restricted to pixels flagged in img_sgn.
 *
 * Each work-item handles one image row (get_global_id(0)), for rows
 * 1 .. height-2, and walks columns 1 .. width-1. For every pixel it:
 *   - takes the difference to each of the four neighbours, but only for
 *     neighbours whose img_sgn entry is 0; other directions contribute 0;
 *   - weights each difference with k^2 / (k^2 + grad^2), k = 100;
 *   - adds img_sgn[pixel] * step * (weighted sum) to the pixel, step = 0.25;
 *   - stores the result in both img_temp and img_proc_PDE.
 *
 * img_proc_PDE - image, updated in place.
 * img_sgn      - mask read to gate gradients and scale the update.
 * img_temp     - receives the same updated value as img_proc_PDE.
 * width/height - image dimensions in pixels.
 *
 * NOTE(review): img_proc_PDE is written in place while other work-items
 * read the rows above and below without synchronisation. Neighbour values
 * are only used when their img_sgn entry is 0, and such pixels are written
 * back as center + 0 * (...), so this looks benign for finite data -
 * confirm before changing the gating.
 */
__kernel void FilterPDE_Sgn_Kernel(__global float *img_proc_PDE,__global float *img_sgn, __global float *img_temp, int width,int height)
{
    const int i = get_global_id(0);

    /* Only interior rows are processed, so the rows above and below
     * always exist. */
    if (i <= 0 || i >= height - 1) {
        return;
    }

    const float step = 0.25f;
    const float k_sq = 100.0f * 100.0f;   /* k = 100 */

    for (int j = 1; j < width; j++) {
        const int idx = i * width + j;
        const float center = img_proc_PDE[idx];

        float grad_N = 0.0f;
        float grad_S = 0.0f;
        float grad_E = 0.0f;
        float grad_W = 0.0f;

        if (img_sgn[idx - width] == 0.0f) {
            grad_N = img_proc_PDE[idx - width] - center;
        }
        /* The loop includes the last column, which has no east neighbour;
         * && keeps the guarded load from being evaluated there. */
        if (j < width - 1 && img_sgn[idx + 1] == 0.0f) {
            grad_E = img_proc_PDE[idx + 1] - center;
        }
        if (img_sgn[idx + width] == 0.0f) {
            grad_S = img_proc_PDE[idx + width] - center;
        }
        /* j starts at 1, so the west neighbour always exists. */
        if (img_sgn[idx - 1] == 0.0f) {
            grad_W = img_proc_PDE[idx - 1] - center;
        }

        const float Coeff_N = k_sq / (k_sq + grad_N * grad_N);
        const float Coeff_S = k_sq / (k_sq + grad_S * grad_S);
        const float Coeff_E = k_sq / (k_sq + grad_E * grad_E);
        const float Coeff_W = k_sq / (k_sq + grad_W * grad_W);

        /* img_sgn scales the update, so pixels with img_sgn == 0 keep
         * their value. */
        const float updated = center + img_sgn[idx] * step * (Coeff_N * grad_N + Coeff_S * grad_S + Coeff_E * grad_E + Coeff_W * grad_W);

        img_temp[idx] = updated;
        img_proc_PDE[idx] = updated;
    }
}//end filter PDE
In the main C code, the kernels are invoked in a loop 30*50 times.
The memory transfer overhead is minimal. Can someone also please give me some ideas on how to improve the performance? One option is the use of image/texture memory, which is my next step, but what else could I do besides that?