Bilateral Filter on GPU Can somebody please help

I am an undergraduate student at IIT Bombay and working on implementing image processing algorithms in GPU. I am writing a code to implement bilateral filter on GPU using the brute force method. But the program is giving segmentation error. Can someone please point out the error… i am new to CUDA and there are not many ppl here to help me out.

I am pasting my kernel and host codes here. I have my presentation next week and I need to show a working code. please help me out.

Code:

[codebox] #include <stdlib.h>

#include <stdio.h>

#include <cutil.h>

//Kernel

texture<float, 2, cudaReadModeElementType> tex;

global void BilateralFilter(float* outputImage, float* d_kernel, int kernelRad, float sigmar, int width, int height) {

int i = blockIdx.x*blockDim.x + threadIdx.x;

int j = blockIdx.y*blockDim.y + threadIdx.y;

float in = (float) i/width; float jn = (float) j/height;

float ival0 = tex2D(tex, in, jn);

float ival = 0.0;

float rangFilt = 0.0;

float weight = 0.0;

float out = 0.0;

float temp = 0.0;



for (int ti = -kernelRad; ti <= kernelRad; ti++) {

	for (int tj = -kernelRad; tj <= kernelRad; tj++) {

		in = (float) (i + ti)/width;

		in = fmax(0.0, in);

		in = fmin(in, 1.0);

		jn = (float) (j + tj)/height;

		jn = fmax(0.0, jn);

		jn = fmin(1.0, jn);

		ival = tex2D(tex, in, jn);

		rangFilt = expf(-(ival - ival0)*(ival - ival0)/(sigmar*sigmar));

		temp = rangFilt*d_kernel[(ti + kernelRad) + (2*kernelRad + 1)*(tj + kernelRad)];

		out += temp*ival;

		weight += temp;

	}

}



outputImage[i + j*width] = out/weight;

}

//Main Program

int main() {

//Start the timer

unsigned int timer = 0;

CUT_SAFE_CALL(cutCreateTimer(&timer));

CUT_SAFE_CALL(cutStartTimer(timer));

//Loading input image

char* image_file = “test.pgm”;

float* image = NULL;

unsigned int width = 0, height = 0;

int sizeI = widthheightsizeof(float);

CUT_SAFE_CALL(cutLoadPGMf(image_file, &image, &width, &height));

//Binding Image to Texture Memory

cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

cudaArray* cu_array;

CUDA_SAFE_CALL(cudaMallocArray( &cu_array, &channelDesc, width, height));

CUDA_SAFE_CALL(cudaMemcpyToArray(cu_array, 0, 0, image, sizeI, cudaMemcpyHostToDevice));

// set texture parameters

tex.addressMode[0] = cudaAddressModeClamp; //out-of -range texture coordinates will be clamped to the valid range

tex.addressMode[1] = cudaAddressModeClamp;

tex.filterMode = cudaFilterModeLinear;

tex.normalized = true;

// Bind the array to the texture

CUDA_SAFE_CALL(cudaBindTextureToArray(tex, cu_array, channelDesc));

//Filter Characteristics

int sigmad = 1;

float sigmar = 0.1;

int kernel_width = 4*sigmad + 1;

int KERNEL_SIZE = kernel_width*kernel_width*sizeof(float);

float* h_kernel = (float*) malloc(KERNEL_SIZE);



for (int i = 0; i < kernel_width; i++) {

	for (int j = 0; j < kernel_width;  j++) {

		h_kernel[i + j*kernel_width] = expf((-(i - 2*sigmad)*(i - 2*sigmad) - (j - 2*sigmad)*(j - 2*sigmad))/(sigmad*sigmad));

	}

}



float* d_kernel;

cudaMalloc((void**) &d_kernel, KERNEL_SIZE);

CUDA_SAFE_CALL(cudaMemcpy(d_kernel, h_kernel, KERNEL_SIZE, cudaMemcpyHostToDevice));



//Output Image Array

float* outputImage;

cudaMalloc((void**) &outputImage, sizeof(sizeI));



dim3 Grid(8, 8, 1);

dim3 Block((int) width/Grid.x - 1, (int) height/Grid.y - 1);

BilateralFilter<<<Block, Grid>>> (outputImage, d_kernel, 2*sigmad, sigmar, width, height);



//Copying Output to device

float* output = (float*) malloc(CUDA_SAFE_CALL(cudaMemcpy(output, outputImage, sizeI, cudaMemcpyDeviceToHost));



//writing output image

char* out_file = "output.pgm";

CUT_SAFE_CALL(cutSavePGMf(out_file, output, width, height));



//stop the timer and print output

CUT_SAFE_CALL(cutStopTimer(timer));

printf("Processing time: %f (ms)\n", cutGetTimerValue(timer));

CUT_SAFE_CALL(cutDeleteTimer(timer));

return 0;

} [/codebox]