How to optimize it further

I have the following kernel, which I need to optimize further:

// Applies the 3x3 weight mask
//     1 2 1
//     2 4 2
//     1 2 1
// (weights sum to 16) to the neighbourhood in InTemp and quantises the result.
//
// InTemp : 3x3 neighbourhood, indexed [row][column], centre at [1][1].
// Returns: the weighted sum scaled by 255, rounded to an integer, then divided
//          by 16 with round-half-up, narrowed to Byte.
// NOTE(review): samples are presumably normalised to [0, 1] so that the result
// fits in a Byte - confirm against how the texture is set up.
__device__ inline Byte
ComputeSpatial (const float InTemp[3][3])
{
		// Accumulate in row-major order; the summation order is kept fixed so the
		// floating-point result does not depend on re-association.
		float weightedSum = InTemp[0][0];
		weightedSum += InTemp[0][1] * 2.0f;
		weightedSum += InTemp[0][2];
		weightedSum += InTemp[1][0] * 2.0f;
		weightedSum += InTemp[1][1] * 4.0f;
		weightedSum += InTemp[1][2] * 2.0f;
		weightedSum += InTemp[2][0];
		weightedSum += InTemp[2][1] * 2.0f;
		weightedSum += InTemp[2][2];

		// Scale by 255 and round to the nearest integer.
		const unsigned int scaled = static_cast <unsigned int>( rint( weightedSum * 255.0f ) );

		// Divide by the weight total (16) with rounding: add half, shift right by 4.
		const unsigned int normalised = ( scaled + 8 ) >> 4;

		return static_cast <Byte>( normalised );
}

// Reads the 3x3 neighbourhood centred on (g_Row_Index, g_Col_Index) from the
// texture In_texImage into InTemp and returns ComputeSpatial(InTemp).
//
// g_Row_Index : row (y) of the centre pixel.
// g_Col_Index : column (x) of the centre pixel.
// InTemp      : caller-provided scratch array, fully overwritten; indexed
//               [row][column] with the centre at [1][1].
//
// The coordinates are converted to float once and the +/-1 offsets are applied
// in float. Doing the offset in unsigned arithmetic would wrap to 4294967295
// when an index is 0 and hand the texture unit a huge coordinate; in float the
// neighbour of index 0 is simply -1.0f, which the texture's addressing mode
// then resolves. For any index >= 1 the coordinates are unchanged.
// NOTE(review): In_texImage is declared outside this block; its addressing and
// filter modes are not visible here - confirm that integer-valued coordinates
// are what the texture setup expects.
__device__ inline Byte
FetchPixelsSpatial (const unsigned int g_Row_Index , const unsigned int g_Col_Index, float InTemp[3][3] )
{
		const float colCentre = static_cast <float>( g_Col_Index );
		const float rowCentre = static_cast <float>( g_Row_Index );

		const float colLeft   = colCentre - 1.0f;
		const float colRight  = colCentre + 1.0f;
		const float rowAbove  = rowCentre - 1.0f;
		const float rowBelow  = rowCentre + 1.0f;

		// Row above the centre
		InTemp[0][0]	=  tex2D( In_texImage,  colLeft,   rowAbove  );
		InTemp[0][1]	=  tex2D( In_texImage,  colCentre, rowAbove  );
		InTemp[0][2]	=  tex2D( In_texImage,  colRight,  rowAbove  );

		// Centre row
		InTemp[1][0]	=  tex2D( In_texImage,  colLeft,   rowCentre );
		InTemp[1][1]	=  tex2D( In_texImage,  colCentre, rowCentre );
		InTemp[1][2]	=  tex2D( In_texImage,  colRight,  rowCentre );

		// Row below the centre
		InTemp[2][0]	=  tex2D( In_texImage,  colLeft,   rowBelow  );
		InTemp[2][1]	=  tex2D( In_texImage,  colCentre, rowBelow  );
		InTemp[2][2]	=  tex2D( In_texImage,  colRight,  rowBelow  );

		return ( ComputeSpatial (InTemp) );
}

// Spatial filter kernel.
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel; thread
// (x, y) handles column x and row y. Threads that fall outside the image are
// discarded, so the grid may be rounded up to cover the whole image.
// No shared memory is used.
//
// Spatial_Filtered : output buffer, row-major, Width elements per row; each
//                    filtered value is stored as one unsigned int.
// Width, Height    : image dimensions in pixels.
//
// Only interior pixels are written. The one-pixel border (first/last row and
// column) is left untouched, so the caller is responsible for its contents.
__global__ void 
Spatial_Filter (  unsigned int *Spatial_Filtered, const int Width, const int Height)
{
	// Global pixel coordinates of this thread. A plain 32-bit multiply is used:
	// on compute capability 2.x and later it is a native instruction, whereas
	// the 24-bit multiply intrinsics are emulated with several instructions.
	const unsigned int g_Col_Index    = blockIdx.x * blockDim.x + threadIdx.x ; 
	const unsigned int g_Row_Index    = blockIdx.y * blockDim.y + threadIdx.y ;

	// An image smaller than 3x3 has no interior pixel. This also rejects zero
	// or negative dimensions before they are compared with unsigned indices.
	if ( Width < 3 || Height < 3 )
			return ; 

	const unsigned int lastCol = static_cast <unsigned int>( Width )  - 1u ;
	const unsigned int lastRow = static_cast <unsigned int>( Height ) - 1u ;

	// Skip threads outside the image as well as those on the image border.
	if ( g_Col_Index == 0u || g_Row_Index == 0u || g_Col_Index >= lastCol || g_Row_Index >= lastRow )
			return ; 

	// Scratch storage for the 3x3 neighbourhood, filled by FetchPixelsSpatial.
	float InTemp[3][3] ; 

	// Flat output index computed in size_t so that row * Width cannot overflow
	// 32 bits on very large images.
	const size_t outIndex = static_cast <size_t>( g_Row_Index ) * static_cast <size_t>( Width ) + g_Col_Index ;

	Spatial_Filtered [ outIndex ] = FetchPixelsSpatial(g_Row_Index , g_Col_Index, InTemp); 
}

Block width = 32 and block height = 8.
Grid width = width / block width and grid height = height / block height.
Assume all divisions are integer divisions.

How can I optimize it further?

Just a first idea, but maybe try flattening the InTemp array, as shown in talonmies’ reply here:
http://stackoverflow.com/questions/5631115/2d-array-on-cuda Although the gain might be negligible given the small array size… not sure, but it is worth a shot. For that matter, I’d also try passing that array by reference (after flattening) and see whether it makes any difference.

I see a lot of casting also, but not sure how that affects optimization… perhaps others can comment.

Edit: I see InTemp is actually defined in the kernel; I thought it was copied over from a host variable. Not sure whether the above advice will help in that case.