GPU bilateral filter need help

I am implementing bilateral filter on GPU using brute force method. I am doing the following things in mu code:

(i) copying input image to texture memory
(ii) i have calculated spatial gaussian kernel in host code itself and passing it as input to kernel code.
(iii) passing output image as input to kernel code
(iv) calculating output pixel value by multiplying with gaussian kernel and range filter values. Output image size has been assigned the same size as input image.

But i am getting segmentation error while assigning the output value image. Also i m getting 0 as output image value, when i am copying using cudaMemcpy function. Please help me out as I have my course presentation next week, need to show the working code [:(]… I am pasting my kernel code here:

//Kernel

texture<float, 2, cudaReadModeElementType> tex;

global void BilateralFilter(float* outputImage, float* d_kernel, int kernelRad, float sigmar, int width, int height) {

int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
float in = (float) i/width;
float jn = (float) j/height;
float ival0 = tex2D(tex, in, jn);
float ival = 0.0;
   float rangFilt = 0.0;
float weight = 0.0;
float out = 0.0;
float temp = 0.0;

for (int ti = -kernelRad; ti < kernelRad; ti++) {

	for (int tj = -kernelRad; tj < kernelRad; tj++) {
		in = (float) (i + ti)/width;
		in = fmax(0.0, in);
		in = fmin(in, 1.0);
		jn = (float) (j + tj)/height;
		jn = fmax(0.0, jn);
		jn = fmin(1.0, jn);  //for restricting the values outside prescribed range
		ival = tex2D(tex, in, jn);
		rangFilt = expf(-(ival - ival0)*(ival - ival0)/(sigmar*sigmar));
		temp = rangFilt*d_kernel[(ti + kernelRad) + (2*kernelRad + 1)*(tj + kernelRad)];
		out += temp*ival;
		weight += temp;
	}
}
outputImage[i + j*width] = (float) out/weight;

}

This version (non optimized at all) works…

I’ve been breaking my back on a faster version (using the TILE division explained in the SDK), but so far, I don’t end up with the expected image.

Here is the code corresponding to the brute force version of the implementation of the Bilateral Filter.

********************************************************/

/*  Name kernelBF.cu									*/

/*  Last date of modification : 20/04/09				*/

#ifndef _KERNEL_H_

#define _KERNEL_H_

#define GDATA(base,iDx,iDy,pitch) *((float*)((char*)(base) + (iDy) * pitch) + (iDx))  // used to cooperate with the pitch created with CudaMallocPitch

__global__ void

KernelBF(float* d_idata, size_t pitch_in, 

		 U16 * d_RGBmap,  // U16 == (unsigned short int)

		 float* d_odata, size_t pitch_out,

		 unsigned int width, unsigned int height,

		 float * d_skernel) 

/* Kernel Parameters */

/* d_idata : disparity map before filtering, pitch_idata: pitch created with CudaMallocPitch

/* d_odata : disparity map after filtering, pitch_pdata: pitch created with Cudamallocpitch

/* d_RGBmap : Color map (R,G,B) size  2*width*height, pitch_color: pitch created with CudaMallocPitch

/* width : largeur, height : hauteur (nbcol, nbrow) 

/* d_skernel : kernel computed on the host side containing the coefficients for spatial filtering

{

	/********* Variables used in BF computation ********/	

	

	float sig_color = 200.0f; // value distance sigma

	float sig_disp = 200.0f; // disparity distance sigma

	//	float sig_spatial = 200.0f; 

	

	

	float d_disp=0.0f,d_color=0.0f;

	float w_spatial=1.0f, w_color=1.0f,w_disp=1.0f,w_final=1.0f;

	// the spatial distance sigma is defined in BF.cu

	

	// Thread Index

	const int tix=threadIdx.x;

	const int tiy=threadIdx.y;

	// Global Index

	int idx=blockIdx.x*blockDim.x+tix;

	int idy=blockIdx.y*blockDim.y+tiy;

	if ((idx <width)&&(idy<height)){

		float sum=0.0f;

		float w_t=0.0f;

		for (int i=-KERNEL_RADIUS;i<=KERNEL_RADIUS;i++){

			for (int j=-KERNEL_RADIUS;j<=KERNEL_RADIUS;j++){

				if ( ((idx+i)>=0)&&((idy+j)>=0)&&((idx+i)<width)&&((idy+j)<height)){

					

					w_spatial=d_skernel[KERNEL_RADIUS-i+(KERNEL_W)*(KERNEL_RADIUS-j)];

					d_color=(float) (	abs(d_RGBmap[(idx+width*idy)*3]-d_RGBmap[(idx+i+width*(idy+j))*3])+

										abs(d_RGBmap[(idx+width*idy)*3+1]-d_RGBmap[(idx+i+width*(idy+j))*3+1])+

										abs(d_RGBmap[(idx+width*idy)*3+2]-d_RGBmap[(idx+i+width*(idy+j))*3+2]));

					d_color*=d_color;

					w_color=exp(-d_color/sig_color);

					d_disp=(GDATA(d_idata,idx+i,idy+j,pitch_in)-GDATA(d_idata,idx,idy,pitch_in));

					d_disp*=d_disp;

					w_disp=exp(-d_disp/sig_disp);	

					w_final=w_spatial*w_color*w_disp;

					w_t+=w_final;

					sum+=w_final*GDATA(d_idata,idx+i,idy+j,pitch_in);

				}

				

			}// for j

		}// for i

		if (w_t !=0 ){ 

			GDATA(d_odata,idx,idy,pitch_out)= (float)(sum/w_t);

		}else{ GDATA(d_odata,idx,idy,pitch_out)=0.0f;}

			

		}// if*/

										

	

	

}

#endif // #ifndef _TEMPLATE_KERNEL_H_

Hope it helps someone !