Hello everyone, i have been facing a problem which i can’t resolve … pls help if some one can
Problem description:
I’m writing image processing CUDA code…
initially, i lunched threads such that one pixel per thread… so a thread is responsible for final output of assigned pixel and writing data back to my output array as “byte by byte”.
__device__ inline Byte
ComputeSpatial (const float InTemp[3][3])
{
float temp_val = 0 ;
unsigned int temp;
temp_val = ( InTemp[0][0] + ( InTemp[0][1]*2.0f ) + InTemp[0][2] + ( InTemp[1][0]*2.0f ) + (InTemp[1][1]*4.0f) +
( InTemp[1][2]*2.0f) + InTemp[2][0] + ( InTemp[2][1]*2.0f ) + InTemp[2][2] );
//convert it into unsigned char
temp = static_cast <unsigned int>(rint(temp_val*255.0f));
temp = ( temp + 8 ) >> 4 ;
return ( static_cast <Byte> ( temp ) );
}
__device__ inline Byte
FetchPixelsSpatial (const unsigned int g_Row_Index , const unsigned int g_Col_Index, float InTemp[3][3] )
{
InTemp[0][0] = tex2D( In_texImage, g_Col_Index-1, g_Row_Index-1 );
InTemp[0][1] = tex2D( In_texImage, g_Col_Index, g_Row_Index-1 );
InTemp[0][2] = tex2D( In_texImage, g_Col_Index+1, g_Row_Index-1 );
InTemp[1][0] = tex2D( In_texImage, g_Col_Index-1, g_Row_Index );
InTemp[1][1] = tex2D( In_texImage, g_Col_Index, g_Row_Index );
InTemp[1][2] = tex2D( In_texImage, g_Col_Index+1, g_Row_Index );
InTemp[2][0] = tex2D( In_texImage, g_Col_Index-1, g_Row_Index+1 );
InTemp[2][1] = tex2D( In_texImage, g_Col_Index, g_Row_Index+1 );
InTemp[2][2] = tex2D( In_texImage, g_Col_Index+1, g_Row_Index+1 );
return ( ComputeSpatial (InTemp) );
}
//Spatial filter kernel
__global__ void
Spatial_Filter ( Byte *Spatial_Filtered, const int Width, const int Height)
{
//calculate global memory location index
const unsigned int g_Col_Index = __umul24 (blockIdx.x , blockDim.x ) + threadIdx.x ;
const unsigned int g_Row_Index = __umul24 (blockIdx.y , blockDim.y ) + threadIdx.y ;
//for intermediate calculation
float InTemp[3][3] ;
//Boundary Conditions
if (g_Col_Index >= Width || g_Row_Index >= Height )
return ;
//image boundary condtion
if ( ( g_Row_Index != 0 ) && ( g_Col_Index != 0 ) && (g_Row_Index != Height - 1 ) && ( g_Col_Index != Width - 1 ) )
{
Spatial_Filtered [ __umul24( g_Row_Index , Width ) + g_Col_Index ] = FetchPixelsSpatial (g_Row_Index , g_Col_Index, InTemp);
}
return ;
}
and the grid is such that ;
blockWidth = 32 and blockheight = 8
gridwidth = width/blockwidth and gridheight = height/blockheight
assume there is integer division over here…
now instead of these, i’m trying to write 4 byte at a time for that
i redesign my grid as
blockWidth = 32 and blockheight = 8
gridwidth = (width/4)/blockwidth and gridheight = height/blockheight
assume there is integer division over here…
means horizontally 4 times less blocks…
and the code is
__device__ inline Byte
ComputeSpatial (const float InTemp[3][3])
{
float temp_val = 0 ;
unsigned int temp;
temp_val = ( InTemp[0][0] + ( InTemp[0][1]*2.0f ) + InTemp[0][2] + ( InTemp[1][0]*2.0f ) + (InTemp[1][1]*4.0f) +
( InTemp[1][2]*2.0f) + InTemp[2][0] + ( InTemp[2][1]*2.0f ) + InTemp[2][2] );
//convert it into unsigned char
temp = static_cast <unsigned int>(rint(temp_val*255.0f));
temp = ( temp + 8 ) >> 4 ;
return ( static_cast <Byte> ( temp ) );
}
__device__ inline Byte
FetchPixelsSpatial (const unsigned int g_Row_Index , const unsigned int g_Col_Index, float InTemp[3][3] )
{
InTemp[0][0] = tex2D( In_texImage, g_Col_Index-1, g_Row_Index-1 );
InTemp[0][1] = tex2D( In_texImage, g_Col_Index, g_Row_Index-1 );
InTemp[0][2] = tex2D( In_texImage, g_Col_Index+1, g_Row_Index-1 );
InTemp[1][0] = tex2D( In_texImage, g_Col_Index-1, g_Row_Index );
InTemp[1][1] = tex2D( In_texImage, g_Col_Index, g_Row_Index );
InTemp[1][2] = tex2D( In_texImage, g_Col_Index+1, g_Row_Index );
InTemp[2][0] = tex2D( In_texImage, g_Col_Index-1, g_Row_Index+1 );
InTemp[2][1] = tex2D( In_texImage, g_Col_Index, g_Row_Index+1 );
InTemp[2][2] = tex2D( In_texImage, g_Col_Index+1, g_Row_Index+1 );
return ( ComputeSpatial (InTemp) );
}
__device__ inline unsigned int
Spatial4Pixels (const unsigned int g_Row_Index , const unsigned int g_Col_Index, float InTemp[3][3])
{
//fetch pixels
Byte p0 = FetchPixelsSpatial (g_Row_Index , g_Col_Index*4 + 0, InTemp );
Byte p1 = FetchPixelsSpatial (g_Row_Index , g_Col_Index*4 + 1, InTemp );
Byte p2 = FetchPixelsSpatial (g_Row_Index , g_Col_Index*4 + 2, InTemp );
Byte p3 = FetchPixelsSpatial (g_Row_Index , g_Col_Index*4 + 3, InTemp );
unsigned int op = (p0 | (p1 << 8) | (p2 << 16) | (p3 << 24)) ;
return op;
}
//Spatial filter kernel
__global__ void
Spatial_Filter ( unsigned int *Spatial_Filtered, const int Width, const int Height)
{
//calculate global memory location index
const unsigned int g_Col_Index = __umul24 (blockIdx.x , blockDim.x ) + threadIdx.x ;
const unsigned int g_Row_Index = __umul24 (blockIdx.y , blockDim.y ) + threadIdx.y ;
//for intermediate calculation
float InTemp[3][3] ;
//Boundary Conditions
if (g_Col_Index >= Width || g_Row_Index >= Height )
return ;
//image boundary condtion
if ( ( g_Row_Index != 0 ) && ( g_Col_Index != 0 ) && (g_Row_Index != Height - 1 ) && ( g_Col_Index != Width - 1 ) )
{
Spatial_Filtered [ __umul24( g_Row_Index , Width ) + g_Col_Index ] = Spatial4Pixels(g_Row_Index , g_Col_Index, InTemp);
}
return ;
}
But the output is incorrect for last three pixels (p1,p2,p3)some where (means for some pixels it is correct and for some not correct)