Problem with Sobel Filter unwanted lines appear in the picture

Hi everyone,

I have a problem with the results of image processing …

I try to implement the edge detection with Sobel, but as you can see, unwanted lines appear in the picture.

Does maybe someone knows why this is so?

The program is similar to the CUDA SDK, but I process about 1000 images with 3- way overlap and therefore can not use texture memory.

Here is a simple example on how to work (for one image)

What is wrong ?

__device__ unsigned char Sobel(unsigned char pix00,unsigned char pix01, unsigned char pix02,

                                     unsigned char pix10,unsigned char pix11, unsigned char pix12,

                                     unsigned char pix20, unsigned char pix21, unsigned char pix22){

short horizontal  = pix02 +2*pix12 +pix22 - pix00 -2*pix10 - pix20;

          short vertical = pix00 + 2*pix01 + pix02 - pix20 -2*pix21 - pix22;

short suma = (short)(abs(horizontal) + abs(vertical));

          if(suma<0)

             return 0;

          else if(suma>0xff)

             return 0xff;

          else

             return (unsigned char)suma; 

}

__global__ void kernel( unsigned char *d_Data,int width, int hight){

unsigned char pix00 = d_Data[threadIdx.x-1 + (blockIdx.x-1)*width];

        unsigned char pix01 = d_Data[threadIdx.x+0 + (blockIdx.x-1)*width];

        unsigned char pix02 = d_Data[threadIdx.x+1 + (blockIdx.x-1)*width];

        unsigned char pix10 = d_Data[threadIdx.x-1 + (blockIdx.x+0)*width];

        unsigned char pix11 = d_Data[threadIdx.x+0 + (blockIdx.x+0)*width];

        unsigned char pix12 = d_Data[threadIdx.x+1 + (blockIdx.x+0)*width];

        unsigned char pix20 = d_Data[threadIdx.x-1 + (blockIdx.x+1)*width];

        unsigned char pix21 = d_Data[threadIdx.x+0 + (blockIdx.x+1)*width];

        unsigned char pix22 = d_Data[threadIdx.x+1 + (blockIdx.x+1)*width];

d_Data[threadIdx.x + blockIdx.x*width ] = Sobel(pix00, pix01, pix02, 

                                                        pix10, pix11, pix12,

                                                        pix20, pix21, pix22);

}

int main ( int argc, char ** argv ){

unsigned char* h_Data;

        unsigned char* d_Data;

        unsigned char* image;

char *name_image_load="tony10070.pgm";

        char *name_image_save="tony10070_res.pgm";

int size;

        unsigned int hight;

        unsigned int width;

// load image in unsigned char* image 

size=width*hight;

cudaMallocHost((void ** )&h_Data, size*sizeof(unsigned char));

cudaMalloc ((void ** ) &d_Data, size*sizeof(unsigned char));

cudaMemcpy(h_Data, image, size, cudaMemcpyHostToHost);

        cudaMemcpy(d_Data, h_Data, size, cudaMemcpyHostToDevice);

kernel<<<hight, width>>>(d_Data, width, hight);

cudaMemcpy(h_Data,d_Data, size, cudaMemcpyDeviceToHost);

cudaThreadSynchronize();

// save image

cudaFree (d_Data);

        cudaFreeHost(h_Data);

return 0; 

}

image.jpg

image.jpg

Hi everyone,

I have a problem with the results of image processing …

I try to implement the edge detection with Sobel, but as you can see, unwanted lines appear in the picture.

Does maybe someone knows why this is so?

The program is similar to the CUDA SDK, but I process about 1000 images with 3- way overlap and therefore can not use texture memory.

Here is a simple example on how to work (for one image)

What is wrong ?

__device__ unsigned char Sobel(unsigned char pix00,unsigned char pix01, unsigned char pix02,

                                     unsigned char pix10,unsigned char pix11, unsigned char pix12,

                                     unsigned char pix20, unsigned char pix21, unsigned char pix22){

short horizontal  = pix02 +2*pix12 +pix22 - pix00 -2*pix10 - pix20;

          short vertical = pix00 + 2*pix01 + pix02 - pix20 -2*pix21 - pix22;

short suma = (short)(abs(horizontal) + abs(vertical));

          if(suma<0)

             return 0;

          else if(suma>0xff)

             return 0xff;

          else

             return (unsigned char)suma; 

}

__global__ void kernel( unsigned char *d_Data,int width, int hight){

unsigned char pix00 = d_Data[threadIdx.x-1 + (blockIdx.x-1)*width];

        unsigned char pix01 = d_Data[threadIdx.x+0 + (blockIdx.x-1)*width];

        unsigned char pix02 = d_Data[threadIdx.x+1 + (blockIdx.x-1)*width];

        unsigned char pix10 = d_Data[threadIdx.x-1 + (blockIdx.x+0)*width];

        unsigned char pix11 = d_Data[threadIdx.x+0 + (blockIdx.x+0)*width];

        unsigned char pix12 = d_Data[threadIdx.x+1 + (blockIdx.x+0)*width];

        unsigned char pix20 = d_Data[threadIdx.x-1 + (blockIdx.x+1)*width];

        unsigned char pix21 = d_Data[threadIdx.x+0 + (blockIdx.x+1)*width];

        unsigned char pix22 = d_Data[threadIdx.x+1 + (blockIdx.x+1)*width];

d_Data[threadIdx.x + blockIdx.x*width ] = Sobel(pix00, pix01, pix02, 

                                                        pix10, pix11, pix12,

                                                        pix20, pix21, pix22);

}

int main ( int argc, char ** argv ){

unsigned char* h_Data;

        unsigned char* d_Data;

        unsigned char* image;

char *name_image_load="tony10070.pgm";

        char *name_image_save="tony10070_res.pgm";

int size;

        unsigned int hight;

        unsigned int width;

// load image in unsigned char* image 

size=width*hight;

cudaMallocHost((void ** )&h_Data, size*sizeof(unsigned char));

cudaMalloc ((void ** ) &d_Data, size*sizeof(unsigned char));

cudaMemcpy(h_Data, image, size, cudaMemcpyHostToHost);

        cudaMemcpy(d_Data, h_Data, size, cudaMemcpyHostToDevice);

kernel<<<hight, width>>>(d_Data, width, hight);

cudaMemcpy(h_Data,d_Data, size, cudaMemcpyDeviceToHost);

cudaThreadSynchronize();

// save image

cudaFree (d_Data);

        cudaFreeHost(h_Data);

return 0; 

}

That looks simple: you have mismatched block/threads! Typically you should have block of 16x16 pixels and 256 threads in each block. Ever take care that you are in the image and that the data are in the image !

The following is an example of my own of subtracting one image to another :

extern "C"

void CuAfix_Subtract(

							float *WorkD,float *WorkS,uint Pitch,

							uint cu,uint lu,

							cudaStream_t stream)

{

uint fPitch;

dim3 grid(iDivUp(cu, global_TileC), iDivUp(lu, global_TileL));

dim3 threads(global_TileC, global_TileL);

	fPitch=Pitch/sizeof(float);

	CuAfix_Subtract_K<<<grid,threads,0,stream>>>(

			WorkD,WorkS,fPitch,

			cu,lu);

}

__global__

void CuAfix_Subtract_K(

							float *WorkD,float *WorkS,uint fPitch,

							 uint cu,uint lu)

{

uint  ix,iy;

float w;

	ix = blockIdx.x * blockDim.x + threadIdx.x;

	iy = blockIdx.y * blockDim.y + threadIdx.y;

	if(ix < cu && iy < lu){

		

		w=WorkD[ix+iy*fPitch];

		WorkD[ix+iy*fPitch]=w-WorkS[ix+iy*fPitch];

	}

}

global_TileC and global_TileL can be tuned. Start with 16 x 16

Yves

That looks simple: you have mismatched block/threads! Typically you should have block of 16x16 pixels and 256 threads in each block. Ever take care that you are in the image and that the data are in the image !

The following is an example of my own of subtracting one image to another :

extern "C"

void CuAfix_Subtract(

							float *WorkD,float *WorkS,uint Pitch,

							uint cu,uint lu,

							cudaStream_t stream)

{

uint fPitch;

dim3 grid(iDivUp(cu, global_TileC), iDivUp(lu, global_TileL));

dim3 threads(global_TileC, global_TileL);

	fPitch=Pitch/sizeof(float);

	CuAfix_Subtract_K<<<grid,threads,0,stream>>>(

			WorkD,WorkS,fPitch,

			cu,lu);

}

__global__

void CuAfix_Subtract_K(

							float *WorkD,float *WorkS,uint fPitch,

							 uint cu,uint lu)

{

uint  ix,iy;

float w;

	ix = blockIdx.x * blockDim.x + threadIdx.x;

	iy = blockIdx.y * blockDim.y + threadIdx.y;

	if(ix < cu && iy < lu){

		

		w=WorkD[ix+iy*fPitch];

		WorkD[ix+iy*fPitch]=w-WorkS[ix+iy*fPitch];

	}

}

global_TileC and global_TileL can be tuned. Start with 16 x 16

Yves