I created a smaller application to reproduce the issue while keeping it as close to the original as possible.
Simply put, I have a for loop in the code that gives the wrong results. This occurs only on the first pass and not the second. I have found two ways of fixing it, neither of which makes any sense to me.
The following loop does not work. In all of these examples, step = 1 and r = 10. It starts giving the wrong value when y = 2.
for (int y = 1; y < (r + 1); y++)
{
sum = sum + input[(r + y) * step] - input[(r - y) * step];
sum_output[y * step] = sum;
}
The first fix is changing the iterator type from int to uint16_t or uint32_t. This works and gives the expected results.
for (uint16_t y = 1; y < (r + 1); y++)
{
sum = sum + input[(r + y) * step] - input[(r - y) * step];
sum_output[y * step] = sum;
}
The second fix is to add a separate decrementing index and use it in place of the computed (r - y). This also results in the correct output.
int i = r - 1;
for (int y = 1; y < (r + 1); y++)
{
sum = sum + input[(r + y) * step] - input[(i) * step];
sum_output[y * step] = sum;
i--;
}
The complete code is as follows.
#include "sum.cuh"

#include <cstdio>
using namespace cuda_example::test;
namespace sum {
// Computes a running window sum of width (2*r + 1) along one axis of a 2D
// array, one output value per element. `step` is the stride between
// consecutive elements along the filtered axis (1 for the row pass, the row
// width for the column pass); `max` is the element count along that axis.
// Precondition (not checked): max >= 2*r + 1, otherwise the edge loops below
// index out of bounds — TODO confirm callers guarantee this.
__device__ void d_sum(float *sum_output, const float *input, const int step, const int max, const int r)
{
float sum = 0;
// do top/left edge
// Seed the window for position 0: elements 0..r-1 are counted twice and
// element r once — looks like a mirrored-boundary convention.
// NOTE(review): verify this matches the intended boundary handling.
for (int y = 0; y < r; y++)
{
sum = sum + 2 * input[y * step];
}
sum += input[r * step];
sum_output[0] = sum;
// Slide the window through positions 1..r while still inside the mirrored
// region: add the new trailing element (r + y) and drop the mirrored
// leading element, tracked by j counting down from r - 1.
// NOTE(review): the post reports that writing the dropped index directly as
// (r - y) produced wrong results on the first pass; the separate
// decrementing counter j is the workaround kept here.
int j = r - 1;
for (int y = 1; y < (r + 1); y++)
{
sum = sum + input[(r + y) * step] - input[(j) * step];
sum_output[y * step] = sum;
j--;
}
// main loop
// Interior positions: the full window fits, so add input[y + r] and drop
// input[y - r - 1] on each step.
for (int y = (r + 1); y < (max - r); y++)
{
sum = sum + input[(y + r) * step] - input[((y - r - 1) * step)];
sum_output[y * step] = sum;
}
// do bottom/right edge
// Trailing edge: mirror the far boundary by re-adding elements from the end
// (max-1, max-2, ...) while continuing to drop the normal leading element.
int i = 0;
for (int y = max - r; y < max; y++)
{
sum = sum + input[((max - 1) - i) * step] - input[((y - r - 1) * step)];
sum_output[y * step] = sum;
i++;
}
}
// One thread per row: thread y filters row y of a w x h image with unit
// stride. Expects a 1D launch covering at least h threads; excess threads
// exit via the guard.
__global__ void d_sum_x_global(float *od, const float *id, const int w, const int h, const int r)
{
unsigned int y = blockIdx.x * blockDim.x + threadIdx.x;
if (y >= h) { return; }
d_sum(&od[y * w], &id[y * w], 1, w, r);
}
// One thread per column: thread x filters column x using stride w between
// vertically adjacent elements. Expects a 1D launch covering at least w
// threads; excess threads exit via the guard.
__global__ void d_sum_y_global(float *od, const float *id, const int w, const int h, const int r)
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x >= w) { return; }
d_sum(&od[x], &id[x], w, h, r);
}
}
// Builds the filter context for a width x height image processed with
// `thread_count` threads per block, and allocates the intermediate device
// buffer used between the horizontal and vertical passes.
sum_test::sum_test(uint16_t width, uint16_t height, uint16_t thread_count)
{
    rows = height;
    cols = width;
    nthreads = thread_count;
    // Widen before multiplying: uint16_t * uint16_t is performed in (signed)
    // int, and 65535 * 65535 overflows int — undefined behavior. The cast
    // keeps the product exact regardless of the declared type of `size`.
    size = static_cast<size_t>(width) * height;
    // allocate device_vectors
    // Parenthesized construction, not braces: with list-initialization an
    // initializer_list<float> constructor (present in newer Thrust) could be
    // selected, building a one-element vector instead of `size` elements.
    middleman_buffer = thrust::device_vector<float>(size);
    // initialize pointers
    middleman_ptr = thrust::raw_pointer_cast(middleman_buffer.data());
}
// Runs the separable sum filter: a row-wise pass into the intermediate
// buffer, then a column-wise pass into `output_ptr`. Both launches go to the
// default stream, so the second pass is ordered after the first.
// `input_ptr` and `output_ptr` must be device pointers to rows*cols floats.
void sum_test::run_filter(float* output_ptr, const float* input_ptr, const int radius) {
    // Ceil-divide so the grid covers every row/column even when the count is
    // not a multiple of nthreads.
    const unsigned int row_blocks = (rows + nthreads - 1) / nthreads;
    const unsigned int col_blocks = (cols + nthreads - 1) / nthreads;

    sum::d_sum_x_global<<< row_blocks, nthreads, 0 >>>
        (middleman_ptr, input_ptr, cols, rows, radius);
    // Kernel launches return no status; query the runtime so a bad launch
    // configuration does not fail silently.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "d_sum_x_global launch failed: %s\n", cudaGetErrorString(err));
        return;
    }

    sum::d_sum_y_global<<< col_blocks, nthreads, 0 >>>
        (output_ptr, middleman_ptr, cols, rows, radius);
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "d_sum_y_global launch failed: %s\n", cudaGetErrorString(err));
    }
}
Test code (using OpenCV and GoogleTest):
// Test body (appears to be the interior of a parameterized gtest case; the
// enclosing function and fixture are not shown here).
auto im = read_csv("../../test/input.csv", CV_16U);
auto key = GetParam();
// Total pixel count of the input image.
uint32_t count = im.cols * im.rows;
int t = 1024;
int b = (count + t - 1) / t;
// NOTE(review): t is threads-per-block and b is the block count, yet t is
// stored under gridSize and b under blockSize — these look swapped; confirm
// against how the maps are consumed.
gridSize.emplace(key, t);
blockSize.emplace(key, b);
float tolerance = 100;
int radius = 10;
auto r = im.rows;
auto c = im.cols;
auto float_mat = cv::Mat{r, c, CV_32F};
im.convertTo(float_mat, CV_32F);
auto d_input = thrust::device_vector<float>(r * c);
auto dp_input = thrust::raw_pointer_cast(d_input.data());
// Assumes float_mat is stored contiguously (true for a freshly created Mat).
cudaMemcpy(dp_input, float_mat.data, r * c * sizeof(float), cudaMemcpyHostToDevice);
auto d_actual = thrust::device_vector<float>(r * c);
auto dp_actual = thrust::raw_pointer_cast(d_actual.data());
// NOTE(review): thread_count is hard-coded to 1 (one thread per block), not
// the t computed above — presumably deliberate for debugging; confirm.
sum_test test(c, r, 1);
test.run_filter(dp_actual, dp_input, radius);
auto actual = cv::Mat(r, c, CV_32F);
// cudaMemcpy on the default stream is synchronizing, so both kernels have
// finished before the result is read back.
cudaMemcpy(actual.data, dp_actual, r * c * sizeof(float), cudaMemcpyDeviceToHost);
auto expected = read_csv("../../test/output.csv", CV_32F);
evaluate(expected, actual, 0, tolerance);