Hi everyone,
im trying to implement an Image manipulation-Program, a simple bilateral Filter. My code look like this:
texture<float, 2, cudaReadModeElementType> texRefArray;
__global__ void bilateralFilter(float* dstData, int elemNr, int width, int height, float* maskData, int sigmaR){
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int curMask;
float intens, factor, val0, vali, wp=0, k=0;
int filterRadius=4;
-----------------val0 = tex2D(texRefArray, (x)/(float)width, (y)/(float)height);
----------------if(x>=filterRadius && y>=filterRadius && x<width-filterRadius && y<height-filterRadius)
{
for (int j=-filterRadius; j<=filterRadius; j++)
{
for (int i=-filterRadius; i<=filterRadius; i++)
{
vali = tex2D(texRefArray, (x+i)/(float)width, (y+j)/(float)height);
curMask = i + filterRadius + (j + filterRadius)*(filterRadius*2+1);
intens = val0-vali;
factor = exp(-0.5 * intens*intens/(sigmaR*sigmaR)) * maskData[curMask];
wp += factor * vali;
k += factor;
} //end for i
}//end for j
----------------dstData[x+y*elemNr] = val0; // wp/k;
}//end if(filterRadius)
} //end BilateralFilter
int main(int argc, char *argv[])
{
IplImage* srcImg_h;
IplImage* dstImg_h;
int height,width;
//load Image
srcImg_h=cvLoadImage(argv[1]);
if(!srcImg_h){
printf("Could not load image file: %s\n",argv[1]);
exit(0);
}//end if()
if(srcImg_h->nChannels == 3)
{
IplImage* tmp_img = cvCreateImage(cvGetSize(srcImg_h), IPL_DEPTH_8U, 1);
cvCvtColor(srcImg_h, tmp_img, CV_BGR2GRAY);
srcImg_h = tmp_img;
} //end if()
//create dst IMG
dstImg_h = cvCreateImage( cv::Size(srcImg_h->width, srcImg_h->height), IPL_DEPTH_8U, 1 );
//IMGData and Maskdata Device and Host
float* src_imgData_h = (float*) srcImg_h->imageData ;
float* dst_imgData_h =(float*) dstImg_h->imageData ;
float* src_imgData_d, *dst_imgData_d;
float* maskData_h, *maskData_d;
//Img Properties
width = srcImg_h->width;
height = srcImg_h->height;
size_t size = width*height*sizeof(float);
//filter Properties
int filterRad=4, sigmaR=5, sigmaS=1;
size_t maskSize = (filterRad*2+1)*(filterRad*2+1)*sizeof(float);
//Compute MaskData
maskData_h = (float *)malloc(maskSize);
printf("maskData: \n");
for(int j=-filterRad; j<=filterRad;j++){
for(int i=-filterRad; i<=filterRad; i++){
maskData_h[i+filterRad+(j+filterRad)*(filterRad*2+1)]=exp(-0.5 * pow(sqrt(i*i+j*j)/sigmaS,2));
printf(" %f " ,maskData_h[i+filterRad+(j+filterRad)*(filterRad*2+1)]);
}
printf("\n");
}
//maskData Host to Device
cudaMalloc((void **) &maskData_d, maskSize);
cudaMemcpy(maskData_d, maskData_h, maskSize, cudaMemcpyHostToDevice);
// ------------------------
// Device memory allocation
// ------------------------
// Pitch linear input data
float *d_idataPL;
size_t d_pitchBytes;
cudaMallocPitch((void**) &d_idataPL, &d_pitchBytes, width*sizeof(float), height);
// Array input data
cudaArray *d_idataArray;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaMallocArray(&d_idataArray, &channelDesc, width, height);
cudaMemcpyToArray(d_idataArray, 0, 0, src_imgData_h, size, cudaMemcpyHostToDevice);
size_t h_pitchBytes = width*sizeof(float);
// Array
texRefArray.normalized = 1;
texRefArray.filterMode = cudaFilterModePoint;
texRefArray.addressMode[0] = cudaAddressModeClamp;
texRefArray.addressMode[1] = cudaAddressModeClamp;
cudaBindTextureToArray(texRefArray, d_idataArray, channelDesc);
cudaMalloc((void **) &dst_imgData_d, size);
cudaMemset2D(dst_imgData_d, d_pitchBytes, 0, width*sizeof(float), height);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// execution configuration parameters
dim3 blockDim(16, 16, 1);
dim3 gridDim((width + blockDim.x - 1)/ blockDim.x, (height + blockDim.y - 1) / blockDim.y, 1);
// dim3 grid(width/TILE_DIM, height/TILE_DIM), threads(TILE_DIM, TILE_DIM);
// for (int i=0; i < width; i++) {
bilateralFilter<<<gridDim, blockDim, 0>>>(dst_imgData_d, d_pitchBytes/sizeof(float), width, height, maskData_d, sigmaR);
// }
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float timeArray;
cudaEventElapsedTime(&timeArray, start, stop);
cudaMemcpy2D(dst_imgData_h, h_pitchBytes, dst_imgData_d, d_pitchBytes, width*sizeof(float), height, cudaMemcpyDeviceToHost);
//Create Window and show Image
cvNamedWindow("Output", CV_WINDOW_AUTOSIZE);
cvMoveWindow("Output", 300, 100);
cvShowImage("Output", dstImg_h);
// wait for a key
cvWaitKey(0);
// release the image
cvReleaseImage(&dstImg_h);
cudaUnbindTexture(texRefArray);
cudaFreeArray(d_idataArray);
cudaFree(dst_imgData_d);
cudaEventDestroy(start);
cudaEventDestroy(stop);
}//end Main()
i have tryed to highlight the codesnippt, i have problems with.
First of all i want to explain what i am trying to do:
-
Load an Image and get the Imagedata from that
-
bind Texture to an Array and fill it with the Image-data
-
call my Kernel:
3.1 get the data from the texture and save it in my val0 – vali are the Data around my val0 multiplyed with the standard gaussian
3.2 do some calculations and save the result in dstData
4.copy back the dstdata and show me the Image
the whole code works so far if i set the dstData equal to val0 - i get the unchanged image, but if i try to calculate for example the mean i get completely wired Data and image. (only “salpt and pepper” - noises)
i would be glad if someone could look over my code and tell me what i am doing wrong.
best Regards
Ibi