How to cache image data from texture memory in shared memory

Currently I'm using texture memory, and I found that the number of reads from the texture is very large, since I read the same data again and again. So I thought I should cache the texture data in shared memory and operate on it there, but I can't find a way to load the data.
Currently my kernel is as follows:

// Applies the fixed 3x3 smoothing mask
//
//     1 2 1
//     2 4 2
//     1 2 1      (weights sum to 16)
//
// to the neighbourhood in InTemp and returns the result as a Byte.
//
// InTemp : 3x3 neighbourhood, indexed [row][column].
//          NOTE(review): the scaling by 255 suggests samples are expected in
//          [0, 1] -- confirm against the texture setup.
// Returns: the weighted sum scaled by 255, rounded, divided by 16 with
//          rounding, and narrowed to Byte.
__device__ inline Byte
ComputeSpatial (const float InTemp[3][3])
{
		// Accumulate strictly left to right, top row first, so the
		// floating-point rounding sequence stays the same.
		float weighted_sum = InTemp[0][0] ;
		weighted_sum += InTemp[0][1] * 2.0f ;
		weighted_sum += InTemp[0][2] ;
		weighted_sum += InTemp[1][0] * 2.0f ;
		weighted_sum += InTemp[1][1] * 4.0f ;
		weighted_sum += InTemp[1][2] * 2.0f ;
		weighted_sum += InTemp[2][0] ;
		weighted_sum += InTemp[2][1] * 2.0f ;
		weighted_sum += InTemp[2][2] ;

		// Scale to the 0..255 range and round to the nearest integer.
		const unsigned int scaled = static_cast <unsigned int>( rint( weighted_sum * 255.0f ) ) ;

		// Divide by the weight total (16); adding half the divisor first
		// makes the integer division round to nearest.
		const unsigned int normalised = ( scaled + 8u ) / 16u ;

		return static_cast <Byte>( normalised ) ;
}

// Reads the 3x3 neighbourhood centred on (g_Row_Index, g_Col_Index) from the
// texture In_texImage into InTemp and returns ComputeSpatial(InTemp).
//
// g_Row_Index : row (y coordinate) of the centre pixel.
// g_Col_Index : column (x coordinate) of the centre pixel.
// InTemp      : caller-provided 3x3 scratch array; overwritten with the
//               fetched samples, indexed [row offset][column offset].
// Returns     : the filtered value produced by ComputeSpatial.
__device__ inline Byte
FetchPixelsSpatial (const unsigned int g_Row_Index , const unsigned int g_Col_Index, float InTemp[3][3] )
{
		// Top-left corner of the window. The arithmetic is unsigned, so an
		// index of 0 wraps around instead of going negative.
		// NOTE(review): presumably callers never pass border pixels -- confirm.
		const unsigned int first_row = g_Row_Index - 1u ;
		const unsigned int first_col = g_Col_Index - 1u ;

		#pragma unroll
		for ( unsigned int dy = 0u ; dy < 3u ; ++dy )
		{
			#pragma unroll
			for ( unsigned int dx = 0u ; dx < 3u ; ++dx )
			{
				// tex2D takes x (column) first, then y (row).
				InTemp[dy][dx] = tex2D( In_texImage, first_col + dx, first_row + dy ) ;
			}
		}

		return ComputeSpatial( InTemp ) ;
}

//Spatial filter kernel
//
// Each thread handles one pixel: the x dimension of the launch maps to the
// column and the y dimension to the row. For every interior pixel the
// 3x3-filtered value returned by FetchPixelsSpatial is stored in
// Spatial_Filtered at [row * Width + column]. Pixels on the outermost rows and
// columns are NOT written, so those output elements keep their prior contents.
//
// Spatial_Filtered : output buffer, one unsigned int per pixel, row-major with
//                    a row length of Width elements.
// Width, Height    : image dimensions in pixels; the kernel does nothing when
//                    either is <= 0.
// Launch           : the 2D grid must cover at least Width x Height threads;
//                    surplus threads exit early.
__global__ void 
Spatial_Filter (  unsigned int *Spatial_Filtered, const int Width, const int Height)
{
	// Reject non-positive dimensions before they are converted to unsigned;
	// otherwise a negative value would turn into a huge bound and let every
	// thread past the range check below.
	if ( Width <= 0 || Height <= 0 )
			return ;

	const unsigned int width  = static_cast <unsigned int>( Width ) ;
	const unsigned int height = static_cast <unsigned int>( Height ) ;

	//calculate global pixel coordinates
	// Plain 32-bit multiplies are used instead of __umul24: the intrinsic only
	// multiplies the low 24 bits of each operand and is no faster than a
	// native multiply on compute capability 2.0 and later.
	const unsigned int g_Col_Index    = blockIdx.x * blockDim.x + threadIdx.x ; 
	const unsigned int g_Row_Index    = blockIdx.y * blockDim.y + threadIdx.y ;

	//Boundary Conditions: threads that fall outside the image do nothing
	if ( g_Col_Index >= width || g_Row_Index >= height )
			return ; 

	//image boundary condition: the 3x3 window needs a neighbour on every
	//side, so the outermost rows and columns are skipped
	const bool on_border = ( g_Row_Index == 0u ) || ( g_Col_Index == 0u ) ||
	                       ( g_Row_Index == height - 1u ) || ( g_Col_Index == width - 1u ) ;
	if ( on_border )
			return ;

	//for intermediate calculation 
	float InTemp[3][3] ; 

	Spatial_Filtered [ g_Row_Index * width + g_Col_Index ] = FetchPixelsSpatial(g_Row_Index , g_Col_Index, InTemp); 
}

Can someone please help me cache data from texture memory in shared memory for faster processing?