I am an undergraduate student at IIT Bombay and working on implementing image processing algorithms in GPU. I am writing a code to implement bilateral filter on GPU using the brute force method. But the program is giving segmentation error. Can someone please point out the error… i am new to CUDA and there are not many ppl here to help me out.
I am pasting my kernel and host codes here. I have my presentation next week and I need to show a working code. please help me out.
Code:
[codebox] #include <stdlib.h>
#include <stdio.h>
#include <cutil.h>
//Kernel
texture<float, 2, cudaReadModeElementType> tex;
global void BilateralFilter(float* outputImage, float* d_kernel, int kernelRad, float sigmar, int width, int height) {
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
float in = (float) i/width; float jn = (float) j/height;
float ival0 = tex2D(tex, in, jn);
float ival = 0.0;
float rangFilt = 0.0;
float weight = 0.0;
float out = 0.0;
float temp = 0.0;
for (int ti = -kernelRad; ti <= kernelRad; ti++) {
for (int tj = -kernelRad; tj <= kernelRad; tj++) {
in = (float) (i + ti)/width;
in = fmax(0.0, in);
in = fmin(in, 1.0);
jn = (float) (j + tj)/height;
jn = fmax(0.0, jn);
jn = fmin(1.0, jn);
ival = tex2D(tex, in, jn);
rangFilt = expf(-(ival - ival0)*(ival - ival0)/(sigmar*sigmar));
temp = rangFilt*d_kernel[(ti + kernelRad) + (2*kernelRad + 1)*(tj + kernelRad)];
out += temp*ival;
weight += temp;
}
}
outputImage[i + j*width] = out/weight;
}
//Main Program
int main() {
//Start the timer
unsigned int timer = 0;
CUT_SAFE_CALL(cutCreateTimer(&timer));
CUT_SAFE_CALL(cutStartTimer(timer));
//Loading input image
char* image_file = “test.pgm”;
float* image = NULL;
unsigned int width = 0, height = 0;
int sizeI = widthheightsizeof(float);
CUT_SAFE_CALL(cutLoadPGMf(image_file, &image, &width, &height));
//Binding Image to Texture Memory
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray* cu_array;
CUDA_SAFE_CALL(cudaMallocArray( &cu_array, &channelDesc, width, height));
CUDA_SAFE_CALL(cudaMemcpyToArray(cu_array, 0, 0, image, sizeI, cudaMemcpyHostToDevice));
// set texture parameters
tex.addressMode[0] = cudaAddressModeClamp; //out-of -range texture coordinates will be clamped to the valid range
tex.addressMode[1] = cudaAddressModeClamp;
tex.filterMode = cudaFilterModeLinear;
tex.normalized = true;
// Bind the array to the texture
CUDA_SAFE_CALL(cudaBindTextureToArray(tex, cu_array, channelDesc));
//Filter Characteristics
int sigmad = 1;
float sigmar = 0.1;
int kernel_width = 4*sigmad + 1;
int KERNEL_SIZE = kernel_width*kernel_width*sizeof(float);
float* h_kernel = (float*) malloc(KERNEL_SIZE);
for (int i = 0; i < kernel_width; i++) {
for (int j = 0; j < kernel_width; j++) {
h_kernel[i + j*kernel_width] = expf((-(i - 2*sigmad)*(i - 2*sigmad) - (j - 2*sigmad)*(j - 2*sigmad))/(sigmad*sigmad));
}
}
float* d_kernel;
cudaMalloc((void**) &d_kernel, KERNEL_SIZE);
CUDA_SAFE_CALL(cudaMemcpy(d_kernel, h_kernel, KERNEL_SIZE, cudaMemcpyHostToDevice));
//Output Image Array
float* outputImage;
cudaMalloc((void**) &outputImage, sizeof(sizeI));
dim3 Grid(8, 8, 1);
dim3 Block((int) width/Grid.x - 1, (int) height/Grid.y - 1);
BilateralFilter<<<Block, Grid>>> (outputImage, d_kernel, 2*sigmad, sigmar, width, height);
//Copying Output to device
float* output = (float*) malloc(CUDA_SAFE_CALL(cudaMemcpy(output, outputImage, sizeI, cudaMemcpyDeviceToHost));
//writing output image
char* out_file = "output.pgm";
CUT_SAFE_CALL(cutSavePGMf(out_file, output, width, height));
//stop the timer and print output
CUT_SAFE_CALL(cutStopTimer(timer));
printf("Processing time: %f (ms)\n", cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));
return 0;
} [/codebox]