Strange issue when accessing memory with a for loop

I created a smaller application to reproduce the issue while keeping it as close to the original as possible.

Simply put, I have a for loop in the code that gives the wrong results. This occurs only on the first pass and not the second. I have found two ways of fixing it that doesn’t make any sense to me.

This does not work. In all these examples, step = 1 and r = 10. It starts giving the wrong value when y = 2.

for (int y = 1; y < (r + 1); y++)
{
    sum = sum + input[(r + y) * step] - input[(r - y) * step];
    sum_output[y * step] = sum;
}

First, changing iterator from int to uint16_t or uint32_t. This will work and give the expected results.

for (uint16_t y = 1; y < (r + 1); y++)
{
    sum = sum + input[(r + y) * step] - input[(r - y) * step];
    sum_output[y * step] = sum;
}

The next scenario is to add a second iterator. This also results in the correct output.

int i = r - 1;
for (int y = 1; y < (r + 1); y++)
{
    sum = sum + input[(r + y) * step] - input[(i) * step];
    sum_output[y * step] = sum;
    i--;
}

The full code is the following.

    #include "sum.cuh"
    
    using namespace cuda_example::test;
    
    namespace sum {
    
    __device__ void d_sum(float *sum_output, const float *input, const int step, const int max, const int r)
    {
        float sum = 0;
        // do top/left edge
        for (int y = 0; y < r; y++)
        {
            sum = sum + 2 * input[y * step];
        }
        sum += input[r * step];
        sum_output[0] = sum;
        int j = r - 1;
        for (int y = 1; y < (r + 1); y++)
        {
            sum = sum + input[(r + y) * step] - input[(j) * step];
            sum_output[y * step] = sum;
            j--;
        }
        // main loop
        for (int y = (r + 1); y < (max - r); y++)
        {
            sum = sum + input[(y + r) * step] - input[((y - r - 1) * step)];
            sum_output[y * step] = sum;
        }
        // do bottom/right edge
        int i = 0;
        for (int y = max - r; y < max; y++)
        {            
            sum = sum + input[((max - 1) - i) * step] - input[((y - r - 1) * step)];
            sum_output[y * step] = sum;
            i++;
        }
    }
    
    __global__ void d_sum_x_global(float *od, const float *id, const int w, const int h, const int r)
    {
        unsigned int y = blockIdx.x * blockDim.x + threadIdx.x;
        if (y >= h) { return; }
        d_sum(&od[y * w], &id[y * w], 1, w, r);
    }
    
    __global__ void d_sum_y_global(float *od, const float *id, const int w, const int h, const int r)
    {
        unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
        if (x >= w) { return; }
        d_sum(&od[x], &id[x], w, h, r);
    }
        
    }
    
    sum_test::sum_test(uint16_t width, uint16_t height, uint16_t thread_count)
    {
        rows = height;
        cols = width;
    nthreads = thread_count;
    size = width * height;
    // allocate device_vectors
    middleman_buffer = thrust::device_vector<float>{size};

    // initialize pointers
        middleman_ptr = thrust::raw_pointer_cast(middleman_buffer.data());
    }
    
    void sum_test::run_filter(float* output_ptr, const float* input_ptr, const int radius) {

    sum::d_sum_x_global<<< (rows + nthreads - 1) / nthreads, nthreads, 0 >>>
        (middleman_ptr, input_ptr, cols, rows, radius);
    
    sum::d_sum_y_global<<< (cols + nthreads - 1) / nthreads, nthreads, 0 >>>
        (output_ptr, middleman_ptr, cols, rows, radius);

    }

Test Code (using opencv and gtest)

    auto im = read_csv("../../test/input.csv", CV_16U);
    auto key = GetParam();
    uint32_t count = im.cols * im.rows;

    int t = 1024;
    int b = (count + t - 1) / t;

    gridSize.emplace(key, t);
    blockSize.emplace(key, b);

    float tolerance = 100;
    int radius = 10;

    auto r = im.rows;
    auto c = im.cols;

    auto float_mat = cv::Mat{r, c, CV_32F};
    im.convertTo(float_mat, CV_32F);

    auto d_input = thrust::device_vector<float>(r * c);
    auto dp_input = thrust::raw_pointer_cast(d_input.data());
    cudaMemcpy(dp_input, float_mat.data, r * c * sizeof(float), cudaMemcpyHostToDevice);

    auto d_actual = thrust::device_vector<float>(r * c);
    auto dp_actual = thrust::raw_pointer_cast(d_actual.data());

    sum_test test(c, r, 1);
    test.run_filter(dp_actual, dp_input, radius);

    auto actual = cv::Mat(r, c, CV_32F);
    cudaMemcpy(actual.data, dp_actual, r * c * sizeof(float), cudaMemcpyDeviceToHost);

    auto expected = read_csv("../../test/output.csv", CV_32F);
    evaluate(expected, actual, 0, tolerance);