How can I further improve performance?

Hi, everyone. I can't think of another way to improve the performance of this kernel. Can anyone give me some ideas? The following is my code.

// Multiplies every input sample by a per-tap coefficient and sums each run of
// N_constant consecutive products into one output sample:
//
//   dst[g] = (sum over k < N_constant of src[g*N_constant + k] * coeff_constant[k]
//             + offset_constant) >> shift_constant
//
// Parameters:
//   dst    - output samples; (*width)*(*height) entries are written.
//   src    - input samples; (*width)*(*height)*N_constant entries are read.
//   height - pointer to the output height (dereferenced inside the kernel).
//   width  - pointer to the output width  (dereferenced inside the kernel).
//
// N_constant, coeff_constant, offset_constant and shift_constant are defined
// outside this kernel (presumably __constant__ symbols - confirm at their
// definition); the tap count and coefficients are not passed as parameters.
//
// Launch preconditions:
//   * 1D launch, one thread per input sample.
//   * blockDim.x <= 512 (capacity of the shared staging buffer).
//   * blockDim.x is a multiple of N_constant, so a group never straddles blocks.
//   * N_constant is a power of two; the halving loop below only sums every tap
//     in that case.
//
// Static shared memory: 512 + 256 ints per block.
__global__ void MultiplyCoef_all(short* dst, short const *src, int *height , int *width)
{
   __shared__ int sharebuffer[512];
   __shared__ int NewSharebuffer[256];

   const int tid = threadIdx.x;
   const int i = blockDim.x*blockIdx.x + tid;

   // Threads past the end of the input stage a zero instead of skipping the
   // kernel body, so that every thread of the block reaches every barrier.
   int product = 0;
   if (i < (*width)*(*height)*N_constant)
      product = src[i]*coeff_constant[tid%N_constant];
   sharebuffer[tid] = product;
   __syncthreads();   // all products must be visible before the first pairwise add

   // Pairwise reduction with compaction: every step halves the number of
   // partial sums per group and packs them contiguously, so the active threads
   // are always the first (blockDim.x >> shift) of the block. Reading from one
   // buffer while writing the other needs only one barrier per step and no
   // copy-back.
   int *in = sharebuffer;
   int *out = NewSharebuffer;
   for (int stride = N_constant, shift = 1; stride > 1; stride >>= 1, shift++)
   {
      const int half = stride/2;
      if (tid < (blockDim.x >> shift))
      {
         // Index of the first operand: start of this thread's group in the
         // current layout plus its offset inside the lower half of the group.
         const int first = half*((2*tid)/stride) + tid;
         out[tid] = in[first] + in[first + half];
      }
      __syncthreads();   // uniform: executed by every thread of the block

      int *swap = in;
      in = out;
      out = swap;
   }

   // After the loop in[g] holds the full sum of group g of this block.
   // Guard on the OUTPUT index so a partially filled last block never writes
   // past the end of dst.
   const int groupsPerBlock = blockDim.x/N_constant;
   const int outIdx = tid + groupsPerBlock*blockIdx.x;
   if (tid < groupsPerBlock && outIdx < (*width)*(*height))
      dst[outIdx] = (in[tid]+offset_constant) >> shift_constant;
}

Run the NVIDIA profiler on the kernel to see where the time actually goes before optimizing further.