4 byte at a time problem in image

Hello everyone, i have been facing a problem which i can’t resolve … pls help if some one can

Problem description:
I’m writing image processing CUDA code…
initially, i lunched threads such that one pixel per thread… so a thread is responsible for final output of assigned pixel and writing data back to my output array as “byte by byte”.

__device__ inline Byte
ComputeSpatial (const float InTemp[3][3])
{
		float temp_val = 0 ;
		unsigned int temp; 
		temp_val =   (  InTemp[0][0] + ( InTemp[0][1]*2.0f  )  + InTemp[0][2] + ( InTemp[1][0]*2.0f ) + (InTemp[1][1]*4.0f) + 
						( InTemp[1][2]*2.0f)  + InTemp[2][0]  + ( InTemp[2][1]*2.0f ) + InTemp[2][2] );
		
		//convert it into unsigned char
		temp = static_cast <unsigned int>(rint(temp_val*255.0f)); 
		temp = ( temp + 8 ) >> 4 ; 
		return ( static_cast <Byte> ( temp ) ); 

}

__device__ inline Byte
FetchPixelsSpatial (const unsigned int g_Row_Index , const unsigned int g_Col_Index, float InTemp[3][3] )
{
		InTemp[0][0]	=  tex2D( In_texImage,  g_Col_Index-1, g_Row_Index-1 ); 
		InTemp[0][1]	=  tex2D( In_texImage,  g_Col_Index, g_Row_Index-1   ); 
		InTemp[0][2]	=  tex2D( In_texImage,  g_Col_Index+1, g_Row_Index-1 ); 
		InTemp[1][0]	=  tex2D( In_texImage,  g_Col_Index-1, g_Row_Index   ); 
		InTemp[1][1]	=  tex2D( In_texImage,  g_Col_Index,   g_Row_Index   ); 
		InTemp[1][2]	=  tex2D( In_texImage,  g_Col_Index+1, g_Row_Index   ); 
		InTemp[2][0]	=  tex2D( In_texImage,  g_Col_Index-1, g_Row_Index+1 ); 
		InTemp[2][1]	=  tex2D( In_texImage,  g_Col_Index, g_Row_Index+1   ); 
		InTemp[2][2]	=  tex2D( In_texImage,  g_Col_Index+1, g_Row_Index+1 );
		return ( ComputeSpatial (InTemp) );
		
}

//Spatial filter kernel
__global__ void 
Spatial_Filter (  Byte *Spatial_Filtered, const int Width, const int Height)
{
	//calculate global memory location index
	const unsigned int g_Col_Index    = __umul24 (blockIdx.x , blockDim.x ) + threadIdx.x ; 
	const unsigned int g_Row_Index    = __umul24 (blockIdx.y , blockDim.y ) + threadIdx.y ;
	//for intermediate calculation 
	float InTemp[3][3] ; 
	
	
	//Boundary Conditions 
	if (g_Col_Index >= Width || g_Row_Index >= Height )
			return ; 
	
	
	//image boundary condtion
	if ( ( g_Row_Index != 0 ) && ( g_Col_Index != 0 ) && (g_Row_Index != Height - 1 ) &&  ( g_Col_Index != Width - 1 ) )
	{	

		Spatial_Filtered [ __umul24( g_Row_Index , Width ) + g_Col_Index ] = FetchPixelsSpatial (g_Row_Index , g_Col_Index, InTemp); 
	}

	return ;
}

and the grid is such that ;
blockWidth = 32 and blockheight = 8
gridwidth = width/blockwidth and gridheight = height/blockheight
assume there is integer division over here…

now instead of these, i’m trying to write 4 byte at a time for that
i redesign my grid as
blockWidth = 32 and blockheight = 8
gridwidth = (width/4)/blockwidth and gridheight = height/blockheight
assume there is integer division over here…
means horizontally 4 times less blocks…

and the code is

__device__ inline Byte
ComputeSpatial (const float InTemp[3][3])
{
		float temp_val = 0 ;
		unsigned int temp; 
		temp_val =   (  InTemp[0][0] + ( InTemp[0][1]*2.0f  )  + InTemp[0][2] + ( InTemp[1][0]*2.0f ) + (InTemp[1][1]*4.0f) + 
						( InTemp[1][2]*2.0f)  + InTemp[2][0]  + ( InTemp[2][1]*2.0f ) + InTemp[2][2] );
		
		//convert it into unsigned char
		temp = static_cast <unsigned int>(rint(temp_val*255.0f)); 
		temp = ( temp + 8 ) >> 4 ; 
		return ( static_cast <Byte> ( temp ) ); 

}

__device__ inline Byte
FetchPixelsSpatial (const unsigned int g_Row_Index , const unsigned int g_Col_Index, float InTemp[3][3] )
{
		InTemp[0][0]	=  tex2D( In_texImage,  g_Col_Index-1, g_Row_Index-1 ); 
		InTemp[0][1]	=  tex2D( In_texImage,  g_Col_Index, g_Row_Index-1   ); 
		InTemp[0][2]	=  tex2D( In_texImage,  g_Col_Index+1, g_Row_Index-1 ); 
		InTemp[1][0]	=  tex2D( In_texImage,  g_Col_Index-1, g_Row_Index   ); 
		InTemp[1][1]	=  tex2D( In_texImage,  g_Col_Index,   g_Row_Index   ); 
		InTemp[1][2]	=  tex2D( In_texImage,  g_Col_Index+1, g_Row_Index   ); 
		InTemp[2][0]	=  tex2D( In_texImage,  g_Col_Index-1, g_Row_Index+1 ); 
		InTemp[2][1]	=  tex2D( In_texImage,  g_Col_Index, g_Row_Index+1   ); 
		InTemp[2][2]	=  tex2D( In_texImage,  g_Col_Index+1, g_Row_Index+1 );
		return ( ComputeSpatial (InTemp) );
		
}

__device__ inline unsigned int
Spatial4Pixels (const unsigned int g_Row_Index , const unsigned int g_Col_Index, float InTemp[3][3])
{
	//fetch pixels 
    Byte p0 = FetchPixelsSpatial (g_Row_Index , g_Col_Index*4 + 0, InTemp ); 

    Byte p1 = FetchPixelsSpatial (g_Row_Index , g_Col_Index*4 + 1, InTemp );

    Byte p2 = FetchPixelsSpatial (g_Row_Index , g_Col_Index*4 + 2, InTemp );

    Byte p3 = FetchPixelsSpatial (g_Row_Index , g_Col_Index*4 + 3, InTemp );

    unsigned int op = (p0 | (p1 << 8) | (p2 << 16) | (p3 << 24)) ;

    return op;

}

//Spatial filter kernel
__global__ void 
Spatial_Filter (  unsigned int *Spatial_Filtered, const int Width, const int Height)
{
	//calculate global memory location index
	const unsigned int g_Col_Index    = __umul24 (blockIdx.x , blockDim.x ) + threadIdx.x ; 
	const unsigned int g_Row_Index    = __umul24 (blockIdx.y , blockDim.y ) + threadIdx.y ;
	//for intermediate calculation 
	float InTemp[3][3] ; 
	
	
	//Boundary Conditions 
	if (g_Col_Index >= Width || g_Row_Index >= Height )
			return ; 
	
	
	//image boundary condtion
	if ( ( g_Row_Index != 0 ) && ( g_Col_Index != 0 ) && (g_Row_Index != Height - 1 ) &&  ( g_Col_Index != Width - 1 ) )
	{	

		Spatial_Filtered [ __umul24( g_Row_Index , Width ) + g_Col_Index ] = Spatial4Pixels(g_Row_Index , g_Col_Index, InTemp); 
	}

	return ;
}

But the output is incorrect for last three pixels (p1,p2,p3)some where (means for some pixels it is correct and for some not correct)

Problem has Solved