Reduction results differs every time

I write this code:

#include <iostream>
#include <fstream>

template <typename T1, typename T2>
__global__ void vec_rm(T1 *dev_in, T1 *dev_out, T2 *dev_size)
{
    extern __shared__ double arr[];
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < *dev_size)
    {
    int tid = threadIdx.x;
    arr[tid] = dev_in[i];
    __syncthreads();
    for (int s = 1; s < blockDim.x; s*= 2)
    {
        if (tid % (2*s) == 0)
        {
            arr[tid] += arr[tid+s];
        }
        __syncthreads();
    }

    printf("dev_in[%d] = %lf\n", i, dev_in[i]);
    if (tid == 0)
    {
    dev_out[blockIdx.x] = arr[0];
    printf("dev_block[%d] = %lf\n", blockIdx.x, arr[0]);
    }
    }
}

int main()
{
    double a[] = {1,2,3,90,28,45,-8};
    int size = 7, TC = 3, BL = size / TC,*dev_size;
    double sum = 0,*dev_a, *dev_result;
    double result[BL];
    cudaMalloc((void**)&dev_result, BL*sizeof(double));
    if (size%TC == 0)
    {
        cudaMalloc( (void**)&dev_a, size*sizeof(double));
        cudaMemcpy( dev_a, a, size*sizeof(double), cudaMemcpyHostToDevice);
        cudaMalloc((void**)&dev_size, sizeof(int));
        cudaMemcpy(dev_size, &size, sizeof(int), cudaMemcpyHostToDevice);
        vec_rm<<<BL,TC, TC*sizeof(double)>>>(dev_a, dev_result, dev_size);
    }
    else
    {
        int gpu_size = size/TC*TC;
        cudaMalloc( (void**)&dev_a, gpu_size*sizeof(double));
        cudaMemcpy( dev_a, a, gpu_size*sizeof(double), cudaMemcpyHostToDevice);
        cudaMalloc((void**)&dev_size, sizeof(int));
        cudaMemcpy(dev_size, &gpu_size, sizeof(int), cudaMemcpyHostToDevice);
        vec_rm<<<BL,TC, TC*sizeof(double)>>>(dev_a, dev_result, dev_size);
        for (int k = gpu_size; k < size; k++)
        {
            sum += a[k];
        }
        std::cout << "CPU sum = " << sum << std::endl;
    }
    cudaMemcpy(result, dev_result, BL*sizeof(double), cudaMemcpyDeviceToHost);
    for (int k = 0; k < BL; ++k)
    {
        sum += result[k];
    }
    std::cout << "CPU+GPU sum = " << sum << std::endl;
    return 0;
}

And I get different result every launch, I try to find out error, but I don’t see race condition or something else.

There is a working code here:
https://devtalk.nvidia.com/default/topic/1038617/understanding-and-adjusting-mark-harriss-array-reduction/?offset=8#5277985

You can just copy/paste/compile/compare to yours and fix accordingly.
Or even better, use Thrust’s reduction.

If you want to start to understand what is wrong with your code:

  1. Run your code with cuda-memcheck
  2. Apply the method here:
    https://stackoverflow.com/questions/27277365/unspecified-launch-failure-on-memcpy/27278218#27278218
    to start to debug your code
  3. After fixing your code, when cuda-memcheck reports no errors, then check your code with the cuda-memcheck racecheck tool. Refer to the cuda-memcheck documentation to learn how to use this sub-tool:

https://docs.nvidia.com/cuda/cuda-memcheck/index.html

In the future, my recommendation is that you do proper CUDA error checking (google that) and run your code with cuda-memcheck, before asking others for help. Even if you don’t understand the error output, it will be useful to others who may try to help you.