Hi all!
I’m using this bilateral filter in my project, but this function is not optimized at all.
Is there someone who can help me to optimize it ?
Using shared memory or other, I don’t understand how to use it.
Thanks!
********************************************************/
/* Name kernelBF.cu */
/* Last date of modification : 20/04/09 */
#ifndef _KERNEL_H_
#define _KERNEL_H_
#define GDATA(base,iDx,iDy,pitch) *((float*)((char*)(base) + (iDy) * pitch) + (iDx)) // used to cooperate with the pitch created with CudaMallocPitch
__global__ void
KernelBF(float* d_idata, size_t pitch_in,
U16 * d_RGBmap, // U16 == (unsigned short int)
float* d_odata, size_t pitch_out,
unsigned int width, unsigned int height,
float * d_skernel)
/* Kernel Parameters */
/* d_idata : disparity map before filtering, pitch_idata: pitch created with CudaMallocPitch
/* d_odata : disparity map after filtering, pitch_pdata: pitch created with Cudamallocpitch
/* d_RGBmap : Color map (R,G,B) size 2*width*height, pitch_color: pitch created with CudaMallocPitch
/* width : largeur, height : hauteur (nbcol, nbrow)
/* d_skernel : kernel computed on the host side containing the coefficients for spatial filtering
{
/********* Variables used in BF computation ********/
float sig_color = 200.0f; // value distance sigma
float sig_disp = 200.0f; // disparity distance sigma
// float sig_spatial = 200.0f;
float d_disp=0.0f,d_color=0.0f;
float w_spatial=1.0f, w_color=1.0f,w_disp=1.0f,w_final=1.0f;
// the spatial distance sigma is defined in BF.cu
// Thread Index
const int tix=threadIdx.x;
const int tiy=threadIdx.y;
// Global Index
int idx=blockIdx.x*blockDim.x+tix;
int idy=blockIdx.y*blockDim.y+tiy;
if ((idx <width)&&(idy<height)){
float sum=0.0f;
float w_t=0.0f;
for (int i=-KERNEL_RADIUS;i<=KERNEL_RADIUS;i++){
for (int j=-KERNEL_RADIUS;j<=KERNEL_RADIUS;j++){
if ( ((idx+i)>=0)&&((idy+j)>=0)&&((idx+i)<width)&&((idy+j)<height)){
w_spatial=d_skernel[KERNEL_RADIUS-i+(KERNEL_W)*(KERNEL_RADIUS-j)];
d_color=(float) ( abs(d_RGBmap[(idx+width*idy)*3]-d_RGBmap[(idx+i+width*(idy+j))*3])+
abs(d_RGBmap[(idx+width*idy)*3+1]-d_RGBmap[(idx+i+width*(idy+j))*3+1])+
abs(d_RGBmap[(idx+width*idy)*3+2]-d_RGBmap[(idx+i+width*(idy+j))*3+2]));
d_color*=d_color;
w_color=exp(-d_color/sig_color);
d_disp=(GDATA(d_idata,idx+i,idy+j,pitch_in)-GDATA(d_idata,idx,idy,pitch_in));
d_disp*=d_disp;
w_disp=exp(-d_disp/sig_disp);
w_final=w_spatial*w_color*w_disp;
w_t+=w_final;
sum+=w_final*GDATA(d_idata,idx+i,idy+j,pitch_in);
}
}// for j
}// for i
if (w_t !=0 ){
GDATA(d_odata,idx,idy,pitch_out)= (float)(sum/w_t);
}else{ GDATA(d_odata,idx,idy,pitch_out)=0.0f;}
}// if*/
}
#endif // #ifndef _TEMPLATE_KERNEL_H_