CUDA image processing Accelaration tips anyone?

Hi everyone,

I have a problem with the results of image processing …

I try to implement the edge detection with Sobel, but as you can see, unwanted lines appear in the picture.

Does maybe someone knows why this is so?

The program is similar to the CUDA SDK, but I process about 1000 images with 3- way overlap and therefore can not use texture memory.

Here is a simple example on how to work

What is wrong?

__device__ unsigned char Sobel(unsigned char pix00,unsigned char pix01, unsigned char pix02,

					  unsigned char pix10,unsigned char pix11, unsigned char pix12,

					  unsigned char pix20, unsigned char pix21, unsigned char pix22){

					  short horizontal  = pix02 +2*pix12 +pix22 - pix00 -2*pix10 - pix20;

					  short vertical = pix00 + 2*pix01 + pix02 - pix20 -2*pix21 - pix22;

					  short suma = (short)(abs(horizontal) + abs(vertical));

					  if(suma<0)

						return 0;

					  else if(suma>0xff)

						return 0xff;

					  else

						  return (unsigned char)suma; 

}

__global__ void kernel( unsigned char *d_Data,int width, int hight){

	

	unsigned char pix00 = d_Data[threadIdx.x-1 + (blockIdx.x-1)*width];

	unsigned char pix01 = d_Data[threadIdx.x+0 + (blockIdx.x-1)*width];

	unsigned char pix02 = d_Data[threadIdx.x+1 + (blockIdx.x-1)*width];

	unsigned char pix10 = d_Data[threadIdx.x-1 + (blockIdx.x+0)*width];

	unsigned char pix11 = d_Data[threadIdx.x+0 + (blockIdx.x+0)*width];

	unsigned char pix12 = d_Data[threadIdx.x+1 + (blockIdx.x+0)*width];

	unsigned char pix20 = d_Data[threadIdx.x-1 + (blockIdx.x+1)*width];

	unsigned char pix21 = d_Data[threadIdx.x+0 + (blockIdx.x+1)*width];

	unsigned char pix22 = d_Data[threadIdx.x+1 + (blockIdx.x+1)*width];

	d_Data[threadIdx.x + blockIdx.x*width ] = Sobel(pix00, pix01, pix02, 

							pix10, pix11, pix12,

							pix20, pix21, pix22);

	

}

int main ( int argc, char ** argv ){

	unsigned char* h_Data;

	unsigned char* d_Data;

	unsigned char* image;

	char *name_image_load="tony10070.pgm";

	char *name_image_save="tony10070_res.pgm";

	int size;

	unsigned int hight;

	unsigned int width;

	// load image in unsigned char* image 

	size=width*hight;

	cudaMallocHost((void ** )&h_Data, size*sizeof(unsigned char));

	

	cudaMalloc ((void ** ) &d_Data, size*sizeof(unsigned char));

	

	cudaMemcpy(h_Data, image, size, cudaMemcpyHostToHost);

	cudaMemcpy(d_Data, h_Data, size, cudaMemcpyHostToDevice);

	

	kernel<<<hight, width>>>(d_Data, width, hight);

	cudaMemcpy(h_Data,d_Data, size, cudaMemcpyDeviceToHost);

	

	cudaThreadSynchronize();

	

	// save image

	cudaFree (d_Data);

	cudaFreeHost(h_Data);

	return 0; 

}

Thank you very much