I’m using Visual Studio 2017 and CUDA 11.1 to do an algorithm, which is very similar with reduction algorithm. And when I use debug mode to compile, the result looks like good. But when I use release mode to compile, calculation becomes very fast but the result sometimes becomes very strange.

And then I make a for loop of this algorithm and find that even in debug mode the result sometimes also becomes very strange. Algorithm is something like this( an algorithm to calculate mean value of image)

```
template<typename T>
__device__ void customAdd(T* sdata, T* g_odata) {
int tx = threadIdx.x;
int ty = threadIdx.y;
int tid = ty * blockDim.x + tx;
// do reduction in shared mem
if (tid < 512) { sdata[tid] += sdata[tid + 512]; } __syncthreads();
if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads();
if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads();
if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads();
if (tid < 32) { sdata[tid] += sdata[tid + 32]; }__syncthreads();
if (tid < 16) { sdata[tid] += sdata[tid + 16]; }__syncthreads();
if (tid < 8) { sdata[tid] += sdata[tid + 8]; }__syncthreads();
if (tid < 4) { sdata[tid] += sdata[tid + 4]; }__syncthreads();
if (tid < 2) { sdata[tid] += sdata[tid + 2]; }__syncthreads();
if (tid < 1) { sdata[tid] += sdata[tid + 1]; }__syncthreads();
// write result for this block to global mem
if (tid == 0) { atomicAdd(g_odata, sdata[tid]); }
}
#pragma region ProcImage
__global__ void cuda_defcan1() {
int tx = threadIdx.x;
int ty = threadIdx.y;
int tid = ty * blockDim.x + tx;
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
if ((y >= ROW) || (x >= COL)) {
return;
}
/* definite canonicalization */
int margine = CANMARGIN / 2;
int condition = ((x >= margine && y >= margine) &&
(x < COL - margine) && (y < ROW - margine) &&
d_image1[y][x] != WHITE);
double this_pixel = condition * (double)d_image1[y][x];
__shared__ double sdata[3][32*32];
sdata[0][tid] = this_pixel;
sdata[1][tid] = this_pixel * this_pixel;
sdata[2][tid] = condition;
__syncthreads();
customAdd(sdata[0], d_cuda_defcan_vars);
customAdd(sdata[1], d_cuda_defcan_vars + 1);
customAdd(sdata[2], d_cuda_defcan_vars + 2);
}
```