Hello, everyone. I wrote some CUDA code like follows.I don’t know why every time my result is different, I have already appended __syncthreads().Can anybody give me the suggestions, thanks in advance.

```
__global__ void MultiplyCoef_all(short* dst, short const *src, int *height , int *width)//, int N , const *coeff)
{
__shared__ int sharebuffer[512];
int i=blockDim.x*blockIdx.x+threadIdx.x;
int tid = threadIdx.x;
int Thread_Threshold;
int ThreadCount;
if(i < (*width)*(*height)*N_constant)
{
sharebuffer[tid] = src[i]*coeff_constant[tid%N_constant];
__syncthreads();
for( int stride = N_constant; stride > 1; stride>>=1)
{
Thread_Threshold = blockDim.x >> 1;
ThreadCount = (stride/2)*((2*tid)/stride)+tid;
if( tid < Thread_Threshold)
sharebuffer[tid] = sharebuffer[ThreadCount]+sharebuffer[ThreadCount+stride/2];
__syncthreads();
}
int ReturnStride = (blockDim.x/N_constant);
if( tid <ReturnStride)
dst[tid+ReturnStride*blockIdx.x] = (sharebuffer[tid]+offset_constant) >> shift_constant;
__syncthreads();
}
}
```