THREADS NOT ACCESSING.

GAGS · September 24, 2010, 3:55pm

Hello everyone,

                    I am implementing Anisotropic Diffusion algorithm in OpenCL. I am porting the code from CUDA to Opencl Being completly new to OpenCL i am having trouble access the pixels in my image.

My .cpp code snippet is given below.

[codebox]while(iter != 0) {

//printf(“iteration number %d \n”,iter);

//load images in opencl buffer

ErrNum= clEnqueueWriteBuffer(commandQueue,d_img_noised, CL_TRUE,0, sizeof(float)widthheight,img_noised,0,NULL,NULL);

shrCheckError(ErrNum, CL_SUCCESS);

ErrNum= clEnqueueWriteBuffer(commandQueue,d_img_proc_med,CL_TRUE,0,s

izeof(float)widthheight,img_proc_med,0,NULL,NULL);

shrCheckError(ErrNum, CL_SUCCESS);

ErrNum= clEnqueueWriteBuffer(commandQueue,d_img_proc_PDE,CL_TRUE,0,s

izeof(float)widthheight,img_proc_PDE,0,NULL,NULL);

shrCheckError(ErrNum, CL_SUCCESS);

ErrNum= clEnqueueWriteBuffer(commandQueue,d_img_sgn,CL_TRUE,0,sizeof

(float)widthheight,img_sgn,0,NULL,NULL);

shrCheckError(ErrNum, CL_SUCCESS);

ErrNum= clEnqueueWriteBuffer(commandQueue,d_img_temp,CL_TRUE,0,sizeo

f(float)widthheight,img_temp,0,NULL,NULL);

shrCheckError(ErrNum, CL_SUCCESS);

int counter =30; //iteration time

//const int threshold=25;

while(counter != 0){

//MedianFilter_Sgn_Kernel

//set kernel arguments

//printf(“the count is counter %d”, counter);

cl_uint i=0;

ErrNum= clSetKernelArg(Kernel1,i++,sizeof(cl_mem),(void*)&d_img_proc_PDE);

shrCheckError(ErrNum, CL_SUCCESS);

ErrNum= clSetKernelArg(Kernel1,i++,sizeof(cl_mem),(void*)&d_img_sgn);

shrCheckError(ErrNum, CL_SUCCESS);

ErrNum= clSetKernelArg(Kernel1,i++,sizeof(size_t),&threshold);

shrCheckError(ErrNum, CL_SUCCESS);

ErrNum= clSetKernelArg(Kernel1,i++,sizeof(size_t),&width);

shrCheckError(ErrNum, CL_SUCCESS);

ErrNum= clSetKernelArg(Kernel1,i++,sizeof(size_t),&height);

shrCheckError(ErrNum, CL_SUCCESS);

//printf(“the width is %d height is %d threshold is %d”,width,height,threshold);

//cl_kernel Kernel[2];

//size_t global_work_size= BLOCKSIZE

ErrNum= clEnqueueNDRangeKernel(commandQueue,Kernel1,2,NULL,&global_work_size,0,0,0,0);

/*if(ErrNum == CL_SUCCESS){

printf(“kernel 1 is executed!!\n”);}

*/shrCheckError(ErrNum, CL_SUCCESS);

//FilterPDE_Sgn_Kernel

cl_uint j=0;

ErrNum= clSetKernelArg(Kernel2,j++,sizeof(cl_mem),(void*)&d_img_proc_PDE);

shrCheckError(ErrNum, CL_SUCCESS);

ErrNum= clSetKernelArg(Kernel2,j++,sizeof(cl_mem),(void*)&d_img_sgn);

shrCheckError(ErrNum, CL_SUCCESS);

ErrNum= clSetKernelArg(Kernel2,j++,sizeof(cl_mem),(void*)&d_img_temp);

shrCheckError(ErrNum, CL_SUCCESS);

ErrNum= clSetKernelArg(Kernel2,j++,sizeof(size_t),&width);

shrCheckError(ErrNum, CL_SUCCESS);

ErrNum= clSetKernelArg(Kernel2,j++,sizeof(size_t),&height);

shrCheckError(ErrNum, CL_SUCCESS);

ErrNum=clEnqueueNDRangeKernel(commandQueue,Kernel2,2,NULL,&global_work_size,0,0,0,0);

/*if(ErrNum == CL_SUCCESS){

printf(“kernel 2 is executed!!\n”);}

*/shrCheckError(ErrNum, CL_SUCCESS);

//put a barrier

clFinish(commandQueue);

//PSNR

PSNR(img_original,img_proc_PDE,&psnr,width,height);

//printf(“the PSNRfor iteration number %d is %f\n”,counter, psnr);

//ErrNum = clEnqueueReadBuffer(commandQueue,d_img_proc_PDE,CL_TRUE,0,si

zeof(float)widthheight,(void*)img_proc_PDE,0,NULL,NULL);

counter–;

}

ErrNum = clEnqueueReadBuffer(commandQueue,d_img_proc_PDE,CL_TRUE,0,si

zeof(float)widthheight,img_proc_PDE,0,NULL,NULL);

iter–;

}

[/codebox]

Given below is the kernel code snippet where the threads are getting accessed. or not getting accessed :(

[codebox]/*if(ErrNum == CL_SUCCESS){

printf(“kernel 2 is executed!!\n”);}

*/shrCheckError(ErrNum, CL_SUCCESS);

//put a barrier

clFinish(commandQueue);

//PSNR

PSNR(img_original,img_proc_PDE,&psnr,width,height);

//printf(“the PSNRfor iteration number %d is %f\n”,counter, psnr);

//ErrNum = clEnqueueReadBuffer(commandQueue,d_img_proc_PDE,CL_TRUE,0,si

zeof(float)widthheight,(void*)img_proc_PDE,0,NULL,NULL);

counter–;

}

ErrNum = clEnqueueReadBuffer(commandQueue,d_img_proc_PDE,CL_TRUE,0,si

zeof(float)widthheight,img_proc_PDE,0,NULL,NULL);

iter–;

}

.

Rest of the code for computation.[/codebox]

I am accessing the noised image multiple times but for some reason the threads are not accessing the pixels . As a result my output image is same as the noised input image. Can you please tell me as to why this is happening. And I am pretty sure there is nothing wrong with the logic of the implementation as it is working just fine for the CUDA implementation.

jcpalmer · September 24, 2010, 4:52pm

A couple of things come to mind, although bear in mind I do not know nor care to know what a Anisotropic Diffusion actually is:

The term threads does not have direct meaning in OpenCL, I assume you mean work items. Many here do not know CUDA, including me. Threads discussed in OpenCL are more likely to be about host threads & multi-GPU topics.
It does not look like your kernelArgs change. For those that is true, putting them outside your counter loop will give you better performance / readability.
Your second snippet does not look like kernel code, just different host code. I have no clue as to what that is.

Perhaps you might want to isolate if you have a host code “plumbing” problem. You could set counter = 1, and in your kernel just set all the elements of the first texel of the output image to like 37 & return. Do you get a 37 back? If not, do whatever you need to do. After that, any problem is probably some mistake made in the algorithm translation.

jcpalmer · September 24, 2010, 4:52pm

A couple of things come to mind, although bear in mind I do not know nor care to know what a Anisotropic Diffusion actually is:

The term threads does not have direct meaning in OpenCL, I assume you mean work items. Many here do not know CUDA, including me. Threads discussed in OpenCL are more likely to be about host threads & multi-GPU topics.
It does not look like your kernelArgs change. For those that is true, putting them outside your counter loop will give you better performance / readability.
Your second snippet does not look like kernel code, just different host code. I have no clue as to what that is.

Perhaps you might want to isolate if you have a host code “plumbing” problem. You could set counter = 1, and in your kernel just set all the elements of the first texel of the output image to like 37 & return. Do you get a 37 back? If not, do whatever you need to do. After that, any problem is probably some mistake made in the algorithm translation.

GAGS · September 24, 2010, 6:19pm

I am very sorry. the code snippet for the kernel code is given below:

[codebox]__kernel void MedianFilter_Sgn_Kernel (__global float *img_proc_PDE, __global float *img_sgn, const int threshold, int width, int height )

{

int thresh = threshold;

int i = get_global_id(0);

int j = get_global_id(1);

float v[9]= {0};

float temp_center_pixel;

if(i < ( height - 1 ) && j < ( width - 1 ) )

{

.

the required computations[/codebox]

GAGS · September 24, 2010, 6:19pm

I am very sorry. the code snippet for the kernel code is given below:

[codebox]__kernel void MedianFilter_Sgn_Kernel (__global float *img_proc_PDE, __global float *img_sgn, const int threshold, int width, int height )

{

int thresh = threshold;

int i = get_global_id(0);

int j = get_global_id(1);

float v[9]= {0};

float temp_center_pixel;

if(i < ( height - 1 ) && j < ( width - 1 ) )

{

.

the required computations[/codebox]

karbous · September 26, 2010, 7:21pm

I can’t see any problem with your code snippet (is it long to post the whole code or can you make a simpler example of your problem?), but I would strongly recommend you to use OpenCL type image2d_t instead of float* for your picture (because of the performance and nature of your problem).

karbous · September 26, 2010, 7:21pm

I can’t see any problem with your code snippet (is it long to post the whole code or can you make a simpler example of your problem?), but I would strongly recommend you to use OpenCL type image2d_t instead of float* for your picture (because of the performance and nature of your problem).

karbous · September 26, 2010, 7:21pm

I can’t see any problem with your code snippet (is it long to post the whole code or can you make a simpler example of your problem?), but I would strongly recommend you to use OpenCL type image2d_t instead of float* for your picture (because of the performance and nature of your problem).

GAGS · September 26, 2010, 9:06pm

Thanks for replying, i have posted the whole of the kernel code below.

[codebox]__kernel void MedianFilter_Sgn_Kernel (__global float *img_proc_PDE, __global float *img_sgn, const int threshold, int width, int height )

{

int thresh = threshold;

int i = get_global_id(0);

int j = get_global_id(1);

float v[9]= {0};

float temp_center_pixel;

if(i < ( height - 1 ) && j < ( width - 1 ) )

{

temp_center_pixel = img_proc_PDE[ i * width + j ];

v[0]= img_proc_PDE[ ( i - 1 ) * width + ( j - 1 ) ];

v[1]= img_proc_PDE[ ( i - 1 ) * width + j ];

v[2]= img_proc_PDE[ ( i - 1 ) * width + ( j + 1 ) ];

v[3]= img_proc_PDE[ i * width + ( j - 1 ) ];

v[4]= img_proc_PDE[ i * width + j ];

v[5]= img_proc_PDE[ i * width + ( j + 1 ) ];

find_min6(&v[0],&v[1],&v[2],&v[3],&v[4],&v[5],i,j);

v[5]= img_proc_PDE[ ( i + 1 ) * width + ( j - 1 ) ];

find_min5(&v[1],&v[2],&v[3],&v[4],&v[5],i,j);

v[5]= img_proc_PDE[ ( i + 1 ) * width + j ];

find_min4(&v[2],&v[3],&v[4],&v[5],i,j);

v[5]= img_proc_PDE[ ( i + 1 ) * width + ( j + 1 ) ];

find_min_final(&v[3],&v[4],&v[5],i,j);

img_sgn[ i * width + j ] = (( v[4]-temp_center_pixel ) > thresh) * 255+0; //If estimated noise, set it 1

}

    barrier(CLK_LOCAL_MEM_FENCE);

}//end of medianfilter_sgn

__kernel void FilterPDE_Sgn_Kernel(__global float *img_proc_PDE,__global float *img_sgn, __global float *img_temp, int width,int height)

{

int i= get_global_id(0);

int j= get_global_id(1);

float step= 0.25;

float grad_N= 0, grad_S= 0, grad_E= 0, grad_W= 0;

float Coeff_N, Coeff_S, Coeff_E, Coeff_W;

int k=100.0;

if(i < height - 1 && j < width - 1) {

img_temp[ i * width + j ] = img_proc_PDE[ i * width + j ];

if((i > 0 ) & (img_sgn[( i - 1 ) * width + j ] == 0))

    grad_N = img_proc_PDE[ ( i - 1 ) * width + j ] - img_proc_PDE[ i * width + j ];

else

    grad_N = 0;

if(j < (width - 1) & img_sgn[ i * width + ( j + 1 )] == 0)

    grad_E = img_proc_PDE[ i * width + ( j + 1 )] - img_proc_PDE[ i * width + j ];

else

    grad_E = 0;

if((i < (height - 1)) & img_sgn[( i + 1) * width + j ] == 0)

    grad_S = img_proc_PDE[( i + 1 ) * width + j ] - img_proc_PDE[ i * width + j ];

else

    grad_S = 0;

if((j > 0) & img_sgn[ i * width + ( j - 1 )] == 0)

    grad_W = img_proc_PDE[i * width + ( j - 1 )] - img_proc_PDE[i * width + j];

else

    grad_W = 0;

Coeff_N = ( k * k ) / ( k * k + grad_N * grad_N );

    Coeff_S = ( k * k ) / ( k * k + grad_S * grad_S );

    Coeff_E = ( k * k ) / ( k * k + grad_E * grad_E );

    Coeff_W = ( k * k ) / ( k * k + grad_W * grad_W );

img_temp[i * width + j] = img_temp[i * width + j] + img_sgn[i * width + j] * step * (Coeff_N * grad_N+Coeff_S * grad_S+ Coeff_E * grad_E+ Coeff_W * grad_W);

    img_proc_PDE[i * width + j]= img_temp[i * width + j];

}

}//end filter PDE

                                  [/codebox]

GAGS · September 26, 2010, 9:06pm

Thanks for replying, i have posted the whole of the kernel code below.

[codebox]__kernel void MedianFilter_Sgn_Kernel (__global float *img_proc_PDE, __global float *img_sgn, const int threshold, int width, int height )

{

int thresh = threshold;

int i = get_global_id(0);

int j = get_global_id(1);

float v[9]= {0};

float temp_center_pixel;

if(i < ( height - 1 ) && j < ( width - 1 ) )

{

temp_center_pixel = img_proc_PDE[ i * width + j ];

v[0]= img_proc_PDE[ ( i - 1 ) * width + ( j - 1 ) ];

v[1]= img_proc_PDE[ ( i - 1 ) * width + j ];

v[2]= img_proc_PDE[ ( i - 1 ) * width + ( j + 1 ) ];

v[3]= img_proc_PDE[ i * width + ( j - 1 ) ];

v[4]= img_proc_PDE[ i * width + j ];

v[5]= img_proc_PDE[ i * width + ( j + 1 ) ];

find_min6(&v[0],&v[1],&v[2],&v[3],&v[4],&v[5],i,j);

v[5]= img_proc_PDE[ ( i + 1 ) * width + ( j - 1 ) ];

find_min5(&v[1],&v[2],&v[3],&v[4],&v[5],i,j);

v[5]= img_proc_PDE[ ( i + 1 ) * width + j ];

find_min4(&v[2],&v[3],&v[4],&v[5],i,j);

v[5]= img_proc_PDE[ ( i + 1 ) * width + ( j + 1 ) ];

find_min_final(&v[3],&v[4],&v[5],i,j);

img_sgn[ i * width + j ] = (( v[4]-temp_center_pixel ) > thresh) * 255+0; //If estimated noise, set it 1

}

    barrier(CLK_LOCAL_MEM_FENCE);

}//end of medianfilter_sgn

__kernel void FilterPDE_Sgn_Kernel(__global float *img_proc_PDE,__global float *img_sgn, __global float *img_temp, int width,int height)

{

int i= get_global_id(0);

int j= get_global_id(1);

float step= 0.25;

float grad_N= 0, grad_S= 0, grad_E= 0, grad_W= 0;

float Coeff_N, Coeff_S, Coeff_E, Coeff_W;

int k=100.0;

if(i < height - 1 && j < width - 1) {

img_temp[ i * width + j ] = img_proc_PDE[ i * width + j ];

if((i > 0 ) & (img_sgn[( i - 1 ) * width + j ] == 0))

    grad_N = img_proc_PDE[ ( i - 1 ) * width + j ] - img_proc_PDE[ i * width + j ];

else

    grad_N = 0;

if(j < (width - 1) & img_sgn[ i * width + ( j + 1 )] == 0)

    grad_E = img_proc_PDE[ i * width + ( j + 1 )] - img_proc_PDE[ i * width + j ];

else

    grad_E = 0;

if((i < (height - 1)) & img_sgn[( i + 1) * width + j ] == 0)

    grad_S = img_proc_PDE[( i + 1 ) * width + j ] - img_proc_PDE[ i * width + j ];

else

    grad_S = 0;

if((j > 0) & img_sgn[ i * width + ( j - 1 )] == 0)

    grad_W = img_proc_PDE[i * width + ( j - 1 )] - img_proc_PDE[i * width + j];

else

    grad_W = 0;

Coeff_N = ( k * k ) / ( k * k + grad_N * grad_N );

    Coeff_S = ( k * k ) / ( k * k + grad_S * grad_S );

    Coeff_E = ( k * k ) / ( k * k + grad_E * grad_E );

    Coeff_W = ( k * k ) / ( k * k + grad_W * grad_W );

img_temp[i * width + j] = img_temp[i * width + j] + img_sgn[i * width + j] * step * (Coeff_N * grad_N+Coeff_S * grad_S+ Coeff_E * grad_E+ Coeff_W * grad_W);

    img_proc_PDE[i * width + j]= img_temp[i * width + j];

}

}//end filter PDE

                                  [/codebox]

GAGS · September 26, 2010, 9:06pm

Thanks for replying, i have posted the whole of the kernel code below.

[codebox]__kernel void MedianFilter_Sgn_Kernel (__global float *img_proc_PDE, __global float *img_sgn, const int threshold, int width, int height )

{

int thresh = threshold;

int i = get_global_id(0);

int j = get_global_id(1);

float v[9]= {0};

float temp_center_pixel;

if(i < ( height - 1 ) && j < ( width - 1 ) )

{

temp_center_pixel = img_proc_PDE[ i * width + j ];

v[0]= img_proc_PDE[ ( i - 1 ) * width + ( j - 1 ) ];

v[1]= img_proc_PDE[ ( i - 1 ) * width + j ];

v[2]= img_proc_PDE[ ( i - 1 ) * width + ( j + 1 ) ];

v[3]= img_proc_PDE[ i * width + ( j - 1 ) ];

v[4]= img_proc_PDE[ i * width + j ];

v[5]= img_proc_PDE[ i * width + ( j + 1 ) ];

find_min6(&v[0],&v[1],&v[2],&v[3],&v[4],&v[5],i,j);

v[5]= img_proc_PDE[ ( i + 1 ) * width + ( j - 1 ) ];

find_min5(&v[1],&v[2],&v[3],&v[4],&v[5],i,j);

v[5]= img_proc_PDE[ ( i + 1 ) * width + j ];

find_min4(&v[2],&v[3],&v[4],&v[5],i,j);

v[5]= img_proc_PDE[ ( i + 1 ) * width + ( j + 1 ) ];

find_min_final(&v[3],&v[4],&v[5],i,j);

img_sgn[ i * width + j ] = (( v[4]-temp_center_pixel ) > thresh) * 255+0; //If estimated noise, set it 1

}

    barrier(CLK_LOCAL_MEM_FENCE);

}//end of medianfilter_sgn

__kernel void FilterPDE_Sgn_Kernel(__global float *img_proc_PDE,__global float *img_sgn, __global float *img_temp, int width,int height)

{

int i= get_global_id(0);

int j= get_global_id(1);

float step= 0.25;

float grad_N= 0, grad_S= 0, grad_E= 0, grad_W= 0;

float Coeff_N, Coeff_S, Coeff_E, Coeff_W;

int k=100.0;

if(i < height - 1 && j < width - 1) {

img_temp[ i * width + j ] = img_proc_PDE[ i * width + j ];

if((i > 0 ) & (img_sgn[( i - 1 ) * width + j ] == 0))

    grad_N = img_proc_PDE[ ( i - 1 ) * width + j ] - img_proc_PDE[ i * width + j ];

else

    grad_N = 0;

if(j < (width - 1) & img_sgn[ i * width + ( j + 1 )] == 0)

    grad_E = img_proc_PDE[ i * width + ( j + 1 )] - img_proc_PDE[ i * width + j ];

else

    grad_E = 0;

if((i < (height - 1)) & img_sgn[( i + 1) * width + j ] == 0)

    grad_S = img_proc_PDE[( i + 1 ) * width + j ] - img_proc_PDE[ i * width + j ];

else

    grad_S = 0;

if((j > 0) & img_sgn[ i * width + ( j - 1 )] == 0)

    grad_W = img_proc_PDE[i * width + ( j - 1 )] - img_proc_PDE[i * width + j];

else

    grad_W = 0;

Coeff_N = ( k * k ) / ( k * k + grad_N * grad_N );

    Coeff_S = ( k * k ) / ( k * k + grad_S * grad_S );

    Coeff_E = ( k * k ) / ( k * k + grad_E * grad_E );

    Coeff_W = ( k * k ) / ( k * k + grad_W * grad_W );

img_temp[i * width + j] = img_temp[i * width + j] + img_sgn[i * width + j] * step * (Coeff_N * grad_N+Coeff_S * grad_S+ Coeff_E * grad_E+ Coeff_W * grad_W);

    img_proc_PDE[i * width + j]= img_temp[i * width + j];

}

}//end filter PDE

                                  [/codebox]

karbous · September 27, 2010, 9:03am

Where are find_min* functions defined? In the code you posted there are no obvious assignment to v[4]. So, v[4] and temp_center_pixel looks always as they have the same values and img_sgn is therefore the same for the whole picture, isn’t it? (Perhaps try out, to verify my theory, assign v[4] another pixels, eg: v[4]= img_proc_PDE[ ( i - 1 ) * width + ( j - 1 ) ]; comment out the find_min functions and see what happens to the picture.)

Btw. barrier (barrier(CLK_LOCAL_MEM_FENCE);) is not needed as your threads have no common local memory variables. The condition if(i < ( height - 1 ) && j < ( width - 1 ) ) seems not to fulfill all the needs, imagine i=0,j=0 and you access img_proc_PDE[ ( i - 1 ) * width + ( j - 1 ) ];

just a tip: to identify code bugs it is handy to try out ati-stream-sdk and run your OpenCL code on CPU, use linux GDB and single step through your kernels

karbous · September 27, 2010, 9:03am

Where are find_min* functions defined? In the code you posted there are no obvious assignment to v[4]. So, v[4] and temp_center_pixel looks always as they have the same values and img_sgn is therefore the same for the whole picture, isn’t it? (Perhaps try out, to verify my theory, assign v[4] another pixels, eg: v[4]= img_proc_PDE[ ( i - 1 ) * width + ( j - 1 ) ]; comment out the find_min functions and see what happens to the picture.)

Btw. barrier (barrier(CLK_LOCAL_MEM_FENCE);) is not needed as your threads have no common local memory variables. The condition if(i < ( height - 1 ) && j < ( width - 1 ) ) seems not to fulfill all the needs, imagine i=0,j=0 and you access img_proc_PDE[ ( i - 1 ) * width + ( j - 1 ) ];

just a tip: to identify code bugs it is handy to try out ati-stream-sdk and run your OpenCL code on CPU, use linux GDB and single step through your kernels

karbous · September 27, 2010, 9:03am

Where are find_min* functions defined? In the code you posted there are no obvious assignment to v[4]. So, v[4] and temp_center_pixel looks always as they have the same values and img_sgn is therefore the same for the whole picture, isn’t it? (Perhaps try out, to verify my theory, assign v[4] another pixels, eg: v[4]= img_proc_PDE[ ( i - 1 ) * width + ( j - 1 ) ]; comment out the find_min functions and see what happens to the picture.)

Btw. barrier (barrier(CLK_LOCAL_MEM_FENCE);) is not needed as your threads have no common local memory variables. The condition if(i < ( height - 1 ) && j < ( width - 1 ) ) seems not to fulfill all the needs, imagine i=0,j=0 and you access img_proc_PDE[ ( i - 1 ) * width + ( j - 1 ) ];

just a tip: to identify code bugs it is handy to try out ati-stream-sdk and run your OpenCL code on CPU, use linux GDB and single step through your kernels