Hi, everyone. I can’t think of another way to improve the performance; can anyone give me some ideas? The following is my code.
// MultiplyCoef_all
//
// Multiplies each element of `src` by a per-position coefficient from
// `coeff_constant`, sums every run of N_constant consecutive products, adds
// `offset_constant`, arithmetic-shifts right by `shift_constant`, and stores
// one result per run into `dst`.
//
// Parameters:
//   dst    - output, one short per group of N_constant inputs.
//   src    - input, (*width) * (*height) * N_constant shorts are read.
//   height - pointer dereferenced inside the kernel (must be device-readable).
//   width  - pointer dereferenced inside the kernel (must be device-readable).
//
// Launch layout: 1D grid, 1D blocks; thread i handles src[i].
//
// Preconditions (inherited from the original halving scheme):
//   - blockDim.x <= 512 (size of the static shared buffer).
//   - blockDim.x is a multiple of N_constant, so groups never straddle blocks
//     and tid % N_constant equals the element's position inside its group.
//   - N_constant is a power of two; the pairwise halving only covers every
//     element of a group in that case.
//
// Changes relative to the previous version:
//   - All threads of the block reach every __syncthreads(); out-of-range
//     threads contribute a zero instead of skipping the barrier.
//   - A barrier now separates the initial shared-memory store from the first
//     cross-thread read, and each reduction step from the next.
//   - The reduction is done in place, removing the second shared buffer (which
//     was indexed past its 256 entries) and the per-step copy-back.
//   - The final store is skipped for groups that lie beyond the input range.
__global__ void MultiplyCoef_all(short* dst, short const *src, int *height , int *width)//, int N , const *coeff)
{
    __shared__ int sharebuffer[512];

    const int tid = threadIdx.x;
    const int blockBase = blockDim.x * blockIdx.x;
    const int i = blockBase + tid;
    const int groupSize = N_constant;
    const int total = (*width) * (*height) * groupSize;

    // Threads past the end of the input store 0 so they can still take part
    // in the barriers below without affecting any sum.
    sharebuffer[tid] = (i < total) ? src[i] * coeff_constant[tid % groupSize] : 0;
    __syncthreads();

    // In-place tree reduction inside each group of `groupSize` consecutive
    // slots. In a given step only lanes [0, half) write and they read lanes
    // [half, 2*half), so no slot is read and written in the same step.
    const int lane = tid % groupSize;
    for (int half = groupSize >> 1; half > 0; half >>= 1)
    {
        if (lane < half)
            sharebuffer[tid] += sharebuffer[tid + half];
        __syncthreads();   // reached by every thread of the block
    }

    // Each group's sum now sits in the group's first slot. The first
    // blockDim.x / groupSize threads write one result each; groups whose
    // first element is outside the input are skipped.
    const int groupsPerBlock = blockDim.x / groupSize;
    if (tid < groupsPerBlock)
    {
        const int firstElem = blockBase + tid * groupSize;
        if (firstElem < total)
            dst[tid + groupsPerBlock * blockIdx.x] = (sharebuffer[tid * groupSize] + offset_constant) >> shift_constant;
    }
}