problems reagarding binding an array to a texture

Hi everyone,

im trying to implement an Image manipulation-Program, a simple bilateral Filter. My code look like this:

texture<float, 2, cudaReadModeElementType> texRefArray;

__global__ void bilateralFilter(float* dstData, int elemNr, int width, int height, float* maskData, int sigmaR){

	int x = blockIdx.x * blockDim.x + threadIdx.x;

	int y = blockIdx.y * blockDim.y + threadIdx.y;

	int  curMask;

	float intens, factor, val0, vali, wp=0, k=0;

	int filterRadius=4;

-----------------val0 = 	tex2D(texRefArray, (x)/(float)width, (y)/(float)height);

----------------if(x>=filterRadius && y>=filterRadius && x<width-filterRadius && y<height-filterRadius)

	{

		for (int j=-filterRadius; j<=filterRadius; j++)

		{

	                for (int i=-filterRadius; i<=filterRadius; i++)

			{

			    vali = tex2D(texRefArray, (x+i)/(float)width, (y+j)/(float)height);

	                    curMask = i + filterRadius + (j + filterRadius)*(filterRadius*2+1);

	                    intens = val0-vali;

			    	factor = exp(-0.5 * intens*intens/(sigmaR*sigmaR)) * maskData[curMask];

	                    	wp += factor * vali;

	                    	k += factor;

	                } //end for i

		}//end for j

----------------dstData[x+y*elemNr]     = val0; // wp/k;

	}//end if(filterRadius)

} //end BilateralFilter

int main(int argc, char *argv[])

{

	IplImage* srcImg_h;

  	IplImage* dstImg_h;

	int height,width;

	//load Image

	srcImg_h=cvLoadImage(argv[1]);

	if(!srcImg_h){

		printf("Could not load image file: %s\n",argv[1]);

		exit(0);

	}//end if()

	if(srcImg_h->nChannels == 3)

	{

		IplImage* tmp_img = cvCreateImage(cvGetSize(srcImg_h), IPL_DEPTH_8U, 1);

		cvCvtColor(srcImg_h, tmp_img, CV_BGR2GRAY);

		srcImg_h = tmp_img;

	} //end if()

	//create dst IMG

	dstImg_h =  cvCreateImage( cv::Size(srcImg_h->width, srcImg_h->height), IPL_DEPTH_8U, 1 );

	//IMGData and Maskdata Device and Host

	float* src_imgData_h = (float*) srcImg_h->imageData ;

	float* dst_imgData_h =(float*) dstImg_h->imageData ;

	float* src_imgData_d, *dst_imgData_d;

	float* maskData_h, *maskData_d;

	//Img Properties

	width = srcImg_h->width;

	height = srcImg_h->height;

	size_t size = width*height*sizeof(float);

	//filter Properties

	int filterRad=4, sigmaR=5, sigmaS=1;

	size_t maskSize = (filterRad*2+1)*(filterRad*2+1)*sizeof(float);

	

	//Compute MaskData

	maskData_h = (float *)malloc(maskSize);

	printf("maskData: \n");

	for(int j=-filterRad; j<=filterRad;j++){

		for(int i=-filterRad; i<=filterRad; i++){

			maskData_h[i+filterRad+(j+filterRad)*(filterRad*2+1)]=exp(-0.5 * pow(sqrt(i*i+j*j)/sigmaS,2));

			printf(" %f  " ,maskData_h[i+filterRad+(j+filterRad)*(filterRad*2+1)]);

		}

		printf("\n");

	}

	//maskData Host to Device

	cudaMalloc((void **) &maskData_d, maskSize);

	cudaMemcpy(maskData_d, maskData_h, maskSize, cudaMemcpyHostToDevice);

	

	

	// ------------------------

	// Device memory allocation

	// ------------------------

	// Pitch linear input data

	float *d_idataPL;

	size_t d_pitchBytes;

	cudaMallocPitch((void**) &d_idataPL, &d_pitchBytes, width*sizeof(float), height);

	// Array input data

	cudaArray *d_idataArray;

	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

	cudaMallocArray(&d_idataArray, &channelDesc, width, height);

	cudaMemcpyToArray(d_idataArray, 0, 0, src_imgData_h, size, cudaMemcpyHostToDevice);

	size_t h_pitchBytes = width*sizeof(float);

	// Array

	texRefArray.normalized = 1;

	texRefArray.filterMode = cudaFilterModePoint;

	texRefArray.addressMode[0] = cudaAddressModeClamp;

	texRefArray.addressMode[1] = cudaAddressModeClamp;

	cudaBindTextureToArray(texRefArray, d_idataArray, channelDesc);

	cudaMalloc((void **) &dst_imgData_d, size);

	cudaMemset2D(dst_imgData_d, d_pitchBytes, 0, width*sizeof(float), height);

	cudaEvent_t start, stop;

	cudaEventCreate(&start);

	cudaEventCreate(&stop);

	cudaEventRecord(start, 0);

	// execution configuration parameters

	dim3 blockDim(16, 16, 1);

	dim3 gridDim((width + blockDim.x - 1)/ blockDim.x, (height + blockDim.y - 1) / blockDim.y, 1);

//	dim3 grid(width/TILE_DIM, height/TILE_DIM), threads(TILE_DIM, TILE_DIM);

//	for (int i=0; i < width; i++) {

		bilateralFilter<<<gridDim, blockDim, 0>>>(dst_imgData_d, d_pitchBytes/sizeof(float), width, height, maskData_d, sigmaR);

//	}

	cudaEventRecord(stop, 0);

	cudaEventSynchronize(stop);

	float timeArray;

	cudaEventElapsedTime(&timeArray, start, stop);    

	cudaMemcpy2D(dst_imgData_h, h_pitchBytes, dst_imgData_d, d_pitchBytes, width*sizeof(float), height, cudaMemcpyDeviceToHost);

	//Create Window and show Image

	cvNamedWindow("Output", CV_WINDOW_AUTOSIZE); 

	cvMoveWindow("Output", 300, 100);

	cvShowImage("Output", dstImg_h);

	// wait for a key

	cvWaitKey(0);

	// release the image

	cvReleaseImage(&dstImg_h);

	cudaUnbindTexture(texRefArray);

	cudaFreeArray(d_idataArray);

	cudaFree(dst_imgData_d);

	cudaEventDestroy(start);

	cudaEventDestroy(stop);

}//end Main()

i have tryed to highlight the codesnippt, i have problems with.

First of all i want to explain what i am trying to do:

  1. Load an Image and get the Imagedata from that

  2. bind Texture to an Array and fill it with the Image-data

  3. call my Kernel:

3.1 get the data from the texture and save it in my val0 – vali are the Data around my val0 multiplyed with the standard gaussian

3.2 do some calculations and save the result in dstData

4.copy back the dstdata and show me the Image

the whole code works so far if i set the dstData equal to val0 - i get the unchanged image, but if i try to calculate for example the mean i get completely wired Data and image. (only “salpt and pepper” - noises)

i would be glad if someone could look over my code and tell me what i am doing wrong.

best Regards

Ibi