Speeding up a kernel with atomicAdd: results do not match

Good Afternoon,

I am trying to accelerate a CUDA kernel whose performance is being dragged down by atomicAdd calls. This kernel is fairly simple and is more or less a toy to build upon later. I have tried to create another version of this kernel that uses shared memory in an attempt to lower the number of atomicAdd calls. However, when I compare the results from the original CUDA kernel with the modified CUDA kernel, they do not match. In fact, with a total output size of 8092, everything beyond index 4043 in my resulting array is 0.

The modified and original CUDA kernels are called as follows (to show the size of the shared memory allocation), where output_size and input_size are both 8092:

uint32_t output_size = 8092;
uint32_t input_size = 8092;
uint32_t m_sz = output_size * sizeof(float2);
dim3 griddim(512,1,1);
dim3 blkdim(1,512,1);
uint32_t region = 3;
float3 * a = ...
float3 * b = ...
float2 * d_out_mod = ...
float2 * d_out_org = ...
mod_kernel<<<griddim,blkdim,m_sz,0>>>(a,b,input_size,output_size,d_out_mod,region,start,stop);
org_kernel<<<griddim,blkdim,0,0>>>(a,b,input_size,output_size,d_out_org,region,start,stop);

The original and modified CUDA kernels are shown below:

// The original CUDA kernel
__global__ void org_kernel(const float3 * pos_a, const float3 * pos_b, 
                              const uint32_t input_size, const uint32_t output_size, 
                              float2 * result, const int region, 
                              const uint32_t first_idx_x, const uint32_t last_idx_x)
{
    uint32_t thridx_x = threadIdx.x + blockDim.x*blockIdx.x + first_idx_x;
    uint32_t thridx_y = threadIdx.y + blockDim.y*blockIdx.y;
    uint32_t stride_x = blockDim.x*gridDim.x;
    uint32_t stride_y = blockDim.y*gridDim.y;

    float3 distance3 = make_float3(0);
    float distance = 0;

    uint32_t output_start, output_end;

    for(uint32_t x = thridx_x; x < last_idx_x; x += stride_x){
        // distance calcs
        distance3 = pos_a[x] - pos_b[x];
        distance = length(distance3);
        output_start = __fdiv_rz(output_size,2) + __fdiv_rz(distance,output_size) - region;
        output_end = output_start + region;
        
        for(uint32_t y = thridx_y; y < output_size; y += stride_y){
            float val = 1;
            if((y < output_end) && (y >= output_start)){
                atomicAdd(&result[y].x, val);
                atomicAdd(&result[y].y, val / 2);
            }                        
        }
    }
}
__global__ void mod_kernel(const float3 * pos_a, const float3 * pos_b, 
                                 const uint32_t input_size, const uint32_t output_size, 
                                 float2 * result, const int region, 
                                 const uint32_t first_idx_x, const uint32_t last_idx_x)
{

    extern __shared__ float2 shared_result[];
    uint32_t thridx_x = threadIdx.x + blockDim.x * blockIdx.x + first_idx_x;
    uint32_t thridx_y = threadIdx.y + blockDim.y * blockIdx.y;
    uint32_t stride_x = blockDim.x * gridDim.x;
    uint32_t stride_y = blockDim.y * gridDim.y;

    float3 distance3 = make_float3(0);
    float distance = 0;

    uint32_t output_start, output_end;

    // Initialize shared memory
    for (uint32_t y = threadIdx.y; y < output_size; y += blockDim.y) {
        shared_result[y] = make_float2(0, 0);
    }
    __syncthreads();

    // Accumulators for local results
    float2 local_accum = make_float2(0, 0);

    for (uint32_t x = thridx_x; x < last_idx_x; x += stride_x) {
        // Distance calculations
        distance3 = pos_a[x] - pos_b[x];
        distance = length(distance3);
        output_start = __fdiv_rz(output_size, 2) + __fdiv_rz(distance, output_size) - region;
        output_end = output_start + region;

        for (uint32_t y = thridx_y; y < output_size; y += stride_y) {
            if ((y < output_end) && (y >= output_start)) {
                local_accum.x += 1.0f;
                local_accum.y += 1.0f / 2.0f;
            }
        }
    }
    __syncthreads();

    // Update shared memory with accumulated results
    for (uint32_t y = threadIdx.y; y < output_size; y += blockDim.y) {
        atomicAdd(&shared_result[y].x, local_accum.x);
        atomicAdd(&shared_result[y].y, local_accum.y);
    }
    __syncthreads();

    // Write results back to global memory
    for (uint32_t y = threadIdx.y; y < output_size; y += blockDim.y) {
        atomicAdd(&result[y].x, shared_result[y].x);
        atomicAdd(&result[y].y, shared_result[y].y);
    }
}

I apologize for the long post, but I seem to be going in circles on this one and would very much appreciate any help or hints in the right direction.

Thank you in advance.

I suggest providing a complete test case showing the issue, one that others could work with without having to add things.

Are you doing rigorous, proper CUDA error checking?
Have you run your code with compute-sanitizer in both cases?
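
For example, a minimal sketch (not your exact GPU_ERROR_CHECK macro) that checks the launch itself and the subsequent execution separately, using your launch from above:

mod_kernel<<<griddim,blkdim,m_sz,0>>>(a,b,input_size,output_size,d_out_mod,region,start,stop);
// catches configuration/launch problems (e.g. invalid argument)
cudaError_t launch_err = cudaGetLastError();
if (launch_err != cudaSuccess) fprintf(stderr, "launch: %s\n", cudaGetErrorString(launch_err));
// catches errors that occur while the kernel executes
cudaError_t exec_err = cudaDeviceSynchronize();
if (exec_err != cudaSuccess) fprintf(stderr, "exec: %s\n", cudaGetErrorString(exec_err));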

Thank you @Robert_Crovella for the fast response.

Yes, I am using CUDA error checking. The input to the code:

size_t n_vals = 8092;
size_t n_float3 = n_vals;
size_t float3sz = n_float3 * sizeof(float3);
size_t output_sz = n_vals * sizeof(float2);
float3 * h_a, * h_b;
float3 * d_a, *d_b;
float2 * d_out_org, *d_out_mod;
h_a = (float3*)malloc(float3sz);
h_b = (float3*)malloc(float3sz);
for (size_t p = 0; p <  n_float3; ++p) {
    h_a[p] = make_float3(1,1,1);
    h_b[p] = make_float3(0.1,0.1,0.1);
}
GPU_ERROR_CHECK(cudaMalloc(&d_a, float3sz));
GPU_ERROR_CHECK(cudaMalloc(&d_b, float3sz));
GPU_ERROR_CHECK(cudaMemcpy(d_a, h_a, float3sz, cudaMemcpyHostToDevice));
GPU_ERROR_CHECK(cudaMemcpy(d_b, h_b, float3sz, cudaMemcpyHostToDevice));
GPU_ERROR_CHECK(cudaMalloc(&d_out_org, output_sz));
GPU_ERROR_CHECK(cudaMemset(d_out_org, 0, output_sz));
GPU_ERROR_CHECK(cudaMalloc(&d_out_mod, output_sz));
GPU_ERROR_CHECK(cudaMemset(d_out_mod, 0, output_sz));
GPU_ERROR_CHECK(cudaDeviceSynchronize());

looks fine, as far as it goes.

I wouldn’t be able to spend any time on it without the things I suggested. Others may spot something.

That’s not the only difference in the modified kernel. The modified kernel also introduces a per-thread local_accum, and I do not see how the use of that in the inner-most loop is possibly equivalent to the use of atomicAdd in the original kernel.

If this were my code, I would start off debugging with a small configuration (say 1 block, with 32 threads, writing to output of size 64) and instrument the code to trace intermediate results.
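
For instance, something along these lines (a hypothetical debug harness, just to illustrate the idea):

// Tiny configuration: 1 block of 32 threads, output of size 64
dim3 dbg_grid(1, 1, 1);
dim3 dbg_blk(1, 32, 1);
uint32_t dbg_n = 64;
org_kernel<<<dbg_grid, dbg_blk>>>(d_a, d_b, dbg_n, dbg_n, d_out_org, region, 0, dbg_n);
// inside the kernels, a guarded printf can trace the computed range per x, e.g.
// if (threadIdx.y == 0) printf("x=%u start=%u end=%u\n", x, output_start, output_end);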

You could try doing atomicAdds of local_accum directly into global memory instead of going through shared memory and see if the results are correct then. Also, do not do it in a loop, otherwise you would add the same value several times. With shared memory, you should also make sure that only one thread/warp adds each shared memory element to global memory, not every thread that contributed to the sum.

Hi @Robert_Crovella, this is an interesting problem. I apologize, but I am unfamiliar with using compute-sanitizer. Can you briefly explain how this would work?

Hi @njuffa,

Starting with a small 1 block with 32 threads seems like a good approach.

Can you briefly show how the local_accum per-thread variable could be properly defined to be more equivalent to the atomicAdd?

Hi @Curefab,

Interesting ideas.

Can you elaborate on your suggestion of doing atomicAdd of local_accum directly into global memory, instead of using shared memory and without using a loop?

I don’t know what this code is trying to do. This is your code, you know what it is supposed to accomplish, and you are fully capable of working out various code transformations and assessing their validity by yourself. It will, however, require some investment of time to do that.

I just looked at the innermost loops and do not see/understand how these could possibly be functionally equivalent. If you disagree, you could try to explain to a rubber duck how they actually do the same thing.

Side remark: This highly unusual code (not necessarily wrong, but potentially so) caused raised eyebrows when I came across it:

output_start = __fdiv_rz(output_size,2) + __fdiv_rz(distance,output_size) - region;

It is documented.

Suppose your app is called ./my_executable (on Linux). To use compute-sanitizer you would do:

compute-sanitizer ./my_executable

There are many examples on many forums.
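
Since the modified kernel uses shared memory, the racecheck tool can also be worth a run (it checks for shared-memory data hazards):

compute-sanitizer --tool racecheck ./my_executable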

Of course, you need an executable. I tried to create an executable from what is provided here but there was enough missing that I gave up.

You are correct @njuffa. I guess I haven't properly explained the situation.

Essentially, I would like to iterate over a 2-dimensional space, storing one summed/calculated value in the x-component of the resulting float2 array and another in the y-component. The distance between the input positional float3 arrays, i.e., pos_a and pos_b, is used to calculate the proper range of indices in the output result array (a float2 array).

What follows is a brief overview/description of my algorithm:

  1. The x-direction is iterated from the current threadIdx.x to the last x-direction index, e.g., last_idx_x

    • Calculate distance for the current index in the position float3 arrays
    • Use the newly calculated distance to define a valid range in the output float2 array, e.g., the result array (the weird-looking output_start = __fdiv_rz(output_size,2) + __fdiv_rz(distance,output_size) - region;), whereby region is just a way to try to ensure the range is not at the ends of the array.
  2. The y-direction is iterated from the current threadIdx.y to the last y-direction index, e.g., output_size

    • The local variable val = 1 is added to the x-component of the current result element (a float2) if the current index is within the previously defined range, and val / 2 is added to the y-component
    • This is done via the atomicAdd calls, which are really just conceptual placeholders for an approach I could build on later.

It is these atomicAdd calls that cause the performance drag, which is why I was trying to modify the kernel to use shared memory, hoping to speed up execution while still keeping the results consistent with the original kernel.

I apologize for not being clearer in my question; any ideas/hints would be greatly appreciated.
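
For reference, here is a serial, host-side sketch of what I expect both kernels to compute (a hypothetical helper for illustration only; it mirrors the arithmetic in org_kernel, ignoring the round-toward-zero intrinsics):

// CPU reference: for each x, add 1 to result[y].x and 0.5 to result[y].y
// for every y in the range [output_start, output_end) derived from the distance
void reference(const float3 * pos_a, const float3 * pos_b,
               uint32_t output_size, int region,
               uint32_t first_idx_x, uint32_t last_idx_x, float2 * result)
{
    for (uint32_t x = first_idx_x; x < last_idx_x; ++x) {
        float dx = pos_a[x].x - pos_b[x].x;
        float dy = pos_a[x].y - pos_b[x].y;
        float dz = pos_a[x].z - pos_b[x].z;
        float distance = sqrtf(dx * dx + dy * dy + dz * dz);
        uint32_t output_start = (uint32_t)(output_size / 2.0f + distance / output_size) - region;
        uint32_t output_end   = output_start + region;
        for (uint32_t y = output_start; y < output_end && y < output_size; ++y) {
            result[y].x += 1.0f;
            result[y].y += 0.5f;
        }
    }
}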

Please show a minimal code example that can be compiled and run by others.

Basically, you have a matrix of size 8092 * 8092 where each row has a contiguous range of 1s starting at some column offset, and you want to compute the per-column sums. Is that correct?

Okay, the full working code (driver.cu):

#include <cuda.h>
#include <stdio.h>
#include <chrono>
#include <stdexcept>
#include <string>
#include <vector>
#include "helper_math.cuh"

#define GPU_ERROR_CHECK(ans) do{gpuAssert((ans),__FILE__,__LINE__);}while(0)
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %s\n", cudaGetErrorString(code), file, line);
        printf("\nCUDA KERNEL ERROR: CUDA kernel reports error: %s\n", cudaGetErrorString(code));
        if (abort) exit(code);
    }
}
__global__ void org_kernel(const float3 * pos_a, const float3 * pos_b, 
                              const uint32_t input_size, const uint32_t output_size, 
                              float2 * result, const int region, 
                              const uint32_t first_idx_x, const uint32_t last_idx_x)
{
    uint32_t thridx_x = threadIdx.x + blockDim.x*blockIdx.x + first_idx_x;
    uint32_t thridx_y = threadIdx.y + blockDim.y*blockIdx.y;
    uint32_t stride_x = blockDim.x*gridDim.x;
    uint32_t stride_y = blockDim.y*gridDim.y;

    float3 distance3 = make_float3(0);
    float distance = 0;

    uint32_t output_start, output_end;

    for(uint32_t x = thridx_x; x < last_idx_x; x += stride_x){
        // distance calcs
        distance3 = pos_a[x] - pos_b[x];
        distance = length(distance3);
        output_start = __fdiv_rz(output_size,2) + __fdiv_rz(distance,output_size) - region;
        output_end = output_start + region;
        
        for(uint32_t y = thridx_y; y < output_size; y += stride_y){
            float val = 1;
            if((y < output_end) && (y >= output_start)){
                atomicAdd(&result[y].x, val);
                atomicAdd(&result[y].y, val / 2);
            }                        
        }
    }
}
__global__ void mod_kernel(const float3 * pos_a, const float3 * pos_b, 
                                 const uint32_t input_size, const uint32_t output_size, 
                                 float2 * result, const int region, 
                                 const uint32_t first_idx_x, const uint32_t last_idx_x)
{

    extern __shared__ float2 shared_result[];
    uint32_t thridx_x = threadIdx.x + blockDim.x * blockIdx.x + first_idx_x;
    uint32_t thridx_y = threadIdx.y + blockDim.y * blockIdx.y;
    uint32_t stride_x = blockDim.x * gridDim.x;
    uint32_t stride_y = blockDim.y * gridDim.y;

    float3 distance3 = make_float3(0);
    float distance = 0;

    uint32_t output_start, output_end;

    // Initialize shared memory
    for (uint32_t y = threadIdx.y; y < output_size; y += blockDim.y) {
        shared_result[y] = make_float2(0, 0);
    }
    __syncthreads();

    // Accumulators for local results
    float2 local_accum = make_float2(0, 0);

    for (uint32_t x = thridx_x; x < last_idx_x; x += stride_x) {
        // Distance calculations
        distance3 = pos_a[x] - pos_b[x];
        distance = length(distance3);
        output_start = __fdiv_rz(output_size, 2) + __fdiv_rz(distance, output_size) - region;
        output_end = output_start + region;

        for (uint32_t y = thridx_y; y < output_size; y += stride_y) {
            if ((y < output_end) && (y >= output_start)) {
                local_accum.x += 1.0f;
                local_accum.y += 1.0f / 2.0f;
            }
        }
    }
    __syncthreads();

    // Update shared memory with accumulated results
    for (uint32_t y = threadIdx.y; y < output_size; y += blockDim.y) {
        atomicAdd(&shared_result[y].x, local_accum.x);
        atomicAdd(&shared_result[y].y, local_accum.y);
    }
    __syncthreads();

    // Write results back to global memory
    for (uint32_t y = threadIdx.y; y < output_size; y += blockDim.y) {
        atomicAdd(&result[y].x, shared_result[y].x);
        atomicAdd(&result[y].y, shared_result[y].y);
    }
}

bool evalArraysEqual(float2* d_org, float2* d_mod, uint32_t n) {
    if (d_org == nullptr || d_mod == nullptr) {
        throw std::invalid_argument("Arrays are NULL.");
    }
    if (n < 1) {
        throw std::invalid_argument("Invalid array length.");
    }
    float2 *h_org, *h_mod;
    size_t sz = n * sizeof(float2);
    h_org = (float2*)malloc(sz);
    h_mod = (float2*)malloc(sz);
    GPU_ERROR_CHECK(cudaMemcpy(h_org, d_org, sz, cudaMemcpyDeviceToHost));
    GPU_ERROR_CHECK(cudaMemcpy(h_mod, d_mod, sz, cudaMemcpyDeviceToHost));
    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    for (uint32_t i = 0; i < n; ++i) {
        if (h_org[i].x != h_mod[i].x || h_org[i].y != h_mod[i].y) {
            printf"\ti (%i): original (%f,%f), modified (%f,%f)\n", i, h_org[i].x, h_org[i].y, h_mod[i].x, h_mod[i].y);
            return false;
        }
    }
    free(h_org);
    free(h_mod);
    return true;
}
int main(int argc, char* argv[]) {
    dim3 ngrid(512,1,1);
    dim3 nblks(1,512,1);
    uint32_t region = 3;
    size_t n_vals = 8092;
    size_t n_float3 = n_vals;
    size_t float3sz = n_float3 * sizeof(float3);
    output_sz = n_vals * sizeof(float2);
    float3 *h_a, *h_b;
    float3 * d_a, *d_b;
    h_a = (float3*)malloc(float3sz);
    h_b = (float3*)malloc(float3sz);
    for (size_t p = 0; p <  n_float3; ++p) {
         h_a[p] = make_float3(1,1,1);
         h_b[p] = make_float3(0.1,0.1,0.1);
    }
    GPU_ERROR_CHECK(cudaMalloc(&d_a, float3sz));
    GPU_ERROR_CHECK(cudaMalloc(&d_b, float3sz));
    GPU_ERROR_CHECK(cudaMemcpy(d_a, h_a, float3sz, cudaMemcpyHostToDevice));
    GPU_ERROR_CHECK(cudaMemcpy(d_b, h_b, float3sz, cudaMemcpyHostToDevice));
   float2 *d_out_mod, *d_out_org;
   GPU_ERROR_CHECK(cudaMalloc(&d_out_org, output_sz, ));
   GPU_ERROR_CHECK(cudaMalloc(&d_out_mod, output_sz));
   GPU_ERROR_CHECK(cudaMemset(d_out_org, 0, output_sz));
   GPU_ERROR_CHECK(cudaMemset(d_out_mod, 0, output_sz));

    org_kernel<<<ngrid,nblks,0,0>>>(d_a, d_b, n_vals, n_vals, d_out_org, region, 0, 8092);
    uint32_t sm_sz = n_vals * sizeof(float2);
    mod_kernel<<<ngrid,nblks,sm_sz,0>>>(d_a, d_b, n_vals, n_vals, d_out_mod, region, 0, 8092);

    if (evalArraysEqual(d_out_org, d_out_mod, n_vals)) {
        printf("Fidelity achieved.\n");
    }else {
        printf("Original and Modified kernel results do not match.\n");
    }
    GPU_ERROR_CHECK(cudaPeekAtLastError());
    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    GPU_ERROR_CHECK(cudaDeviceReset());
    return 0;
}

I compile as follows: nvcc driver.cu -o xTest

When I run the compiled executable xTest, I get the following output:

i (4043): original (8092.000000,4096.000000), modified (0.000000,0.000000)
Original and Modified kernel results do not match.

I also get a GPUassert: invalid argument at the line GPU_ERROR_CHECK(cudaPeekAtLastError()); in the driver.cu file.

The code does not compile (cuda 12.4)
For example, file helper_math.cuh is missing

Sorry. Below is the helper_math.cuh file.

/**
 * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/*
 *  This file implements common mathematical operations on vector types
 *  (float3, float4 etc.) since these are not provided as standard by CUDA.
 *
 *  The syntax is modeled on the Cg standard library.
 *
 *  This is part of the Helper library includes
 *
 *    Thanks to Linh Hah for additions and fixes.
 */

#ifndef HELPER_MATH_H
#define HELPER_MATH_H

#include "cuda_runtime.h"

typedef unsigned int uint;
typedef unsigned short ushort;

#ifndef EXIT_WAIVED
#define EXIT_WAIVED 2
#endif

#ifndef __CUDACC__
#include <math.h>

////////////////////////////////////////////////////////////////////////////////
// host implementations of CUDA functions
////////////////////////////////////////////////////////////////////////////////

inline float fminf(float a, float b)
{
    return a < b ? a : b;
}

inline float fmaxf(float a, float b)
{
    return a > b ? a : b;
}

inline int max(int a, int b)
{
    return a > b ? a : b;
}

inline int min(int a, int b)
{
    return a < b ? a : b;
}

inline float rsqrtf(float x)
{
    return 1.0f / sqrtf(x);
}
#endif

////////////////////////////////////////////////////////////////////////////////
// constructors
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 make_float2(float s)
{
    return make_float2(s, s);
}
inline __host__ __device__ float2 make_float2(float3 a)
{
    return make_float2(a.x, a.y);
}
inline __host__ __device__ float2 make_float2(int2 a)
{
    return make_float2(float(a.x), float(a.y));
}
inline __host__ __device__ float2 make_float2(uint2 a)
{
    return make_float2(float(a.x), float(a.y));
}

inline __host__ __device__ int2 make_int2(int s)
{
    return make_int2(s, s);
}
inline __host__ __device__ int2 make_int2(int3 a)
{
    return make_int2(a.x, a.y);
}
inline __host__ __device__ int2 make_int2(uint2 a)
{
    return make_int2(int(a.x), int(a.y));
}
inline __host__ __device__ int2 make_int2(float2 a)
{
    return make_int2(int(a.x), int(a.y));
}

inline __host__ __device__ uint2 make_uint2(uint s)
{
    return make_uint2(s, s);
}
inline __host__ __device__ uint2 make_uint2(uint3 a)
{
    return make_uint2(a.x, a.y);
}
inline __host__ __device__ uint2 make_uint2(int2 a)
{
    return make_uint2(uint(a.x), uint(a.y));
}

inline __host__ __device__ float3 make_float3(float s)
{
    return make_float3(s, s, s);
}
inline __host__ __device__ float3 make_float3(float2 a)
{
    return make_float3(a.x, a.y, 0.0f);
}
inline __host__ __device__ float3 make_float3(float2 a, float s)
{
    return make_float3(a.x, a.y, s);
}
inline __host__ __device__ float3 make_float3(float4 a)
{
    return make_float3(a.x, a.y, a.z);
}
inline __host__ __device__ float3 make_float3(int3 a)
{
    return make_float3(float(a.x), float(a.y), float(a.z));
}
inline __host__ __device__ float3 make_float3(uint3 a)
{
    return make_float3(float(a.x), float(a.y), float(a.z));
}

inline __host__ __device__ int3 make_int3(int s)
{
    return make_int3(s, s, s);
}
inline __host__ __device__ int3 make_int3(int2 a)
{
    return make_int3(a.x, a.y, 0);
}
inline __host__ __device__ int3 make_int3(int2 a, int s)
{
    return make_int3(a.x, a.y, s);
}
inline __host__ __device__ int3 make_int3(uint3 a)
{
    return make_int3(int(a.x), int(a.y), int(a.z));
}
inline __host__ __device__ int3 make_int3(float3 a)
{
    return make_int3(int(a.x), int(a.y), int(a.z));
}

inline __host__ __device__ uint3 make_uint3(uint s)
{
    return make_uint3(s, s, s);
}
inline __host__ __device__ uint3 make_uint3(uint2 a)
{
    return make_uint3(a.x, a.y, 0);
}
inline __host__ __device__ uint3 make_uint3(uint2 a, uint s)
{
    return make_uint3(a.x, a.y, s);
}
inline __host__ __device__ uint3 make_uint3(uint4 a)
{
    return make_uint3(a.x, a.y, a.z);
}
inline __host__ __device__ uint3 make_uint3(int3 a)
{
    return make_uint3(uint(a.x), uint(a.y), uint(a.z));
}

inline __host__ __device__ float4 make_float4(float s)
{
    return make_float4(s, s, s, s);
}
inline __host__ __device__ float4 make_float4(float3 a)
{
    return make_float4(a.x, a.y, a.z, 0.0f);
}
inline __host__ __device__ float4 make_float4(float3 a, float w)
{
    return make_float4(a.x, a.y, a.z, w);
}
inline __host__ __device__ float4 make_float4(int4 a)
{
    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
}
inline __host__ __device__ float4 make_float4(uint4 a)
{
    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
}

inline __host__ __device__ int4 make_int4(int s)
{
    return make_int4(s, s, s, s);
}
inline __host__ __device__ int4 make_int4(int3 a)
{
    return make_int4(a.x, a.y, a.z, 0);
}
inline __host__ __device__ int4 make_int4(int3 a, int w)
{
    return make_int4(a.x, a.y, a.z, w);
}
inline __host__ __device__ int4 make_int4(uint4 a)
{
    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
}
inline __host__ __device__ int4 make_int4(float4 a)
{
    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
}


inline __host__ __device__ uint4 make_uint4(uint s)
{
    return make_uint4(s, s, s, s);
}
inline __host__ __device__ uint4 make_uint4(uint3 a)
{
    return make_uint4(a.x, a.y, a.z, 0);
}
inline __host__ __device__ uint4 make_uint4(uint3 a, uint w)
{
    return make_uint4(a.x, a.y, a.z, w);
}
inline __host__ __device__ uint4 make_uint4(int4 a)
{
    return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w));
}

////////////////////////////////////////////////////////////////////////////////
// negate
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 operator-(float2 &a)
{
    return make_float2(-a.x, -a.y);
}
inline __host__ __device__ int2 operator-(int2 &a)
{
    return make_int2(-a.x, -a.y);
}
inline __host__ __device__ float3 operator-(float3 &a)
{
    return make_float3(-a.x, -a.y, -a.z);
}
inline __host__ __device__ int3 operator-(int3 &a)
{
    return make_int3(-a.x, -a.y, -a.z);
}
inline __host__ __device__ float4 operator-(float4 &a)
{
    return make_float4(-a.x, -a.y, -a.z, -a.w);
}
inline __host__ __device__ int4 operator-(int4 &a)
{
    return make_int4(-a.x, -a.y, -a.z, -a.w);
}

////////////////////////////////////////////////////////////////////////////////
// addition
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 operator+(float2 a, float2 b)
{
    return make_float2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void operator+=(float2 &a, float2 b)
{
    a.x += b.x;
    a.y += b.y;
}
inline __host__ __device__ float2 operator+(float2 a, float b)
{
    return make_float2(a.x + b, a.y + b);
}
inline __host__ __device__ float2 operator+(float b, float2 a)
{
    return make_float2(a.x + b, a.y + b);
}
inline __host__ __device__ void operator+=(float2 &a, float b)
{
    a.x += b;
    a.y += b;
}

inline __host__ __device__ int2 operator+(int2 a, int2 b)
{
    return make_int2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void operator+=(int2 &a, int2 b)
{
    a.x += b.x;
    a.y += b.y;
}
inline __host__ __device__ int2 operator+(int2 a, int b)
{
    return make_int2(a.x + b, a.y + b);
}
inline __host__ __device__ int2 operator+(int b, int2 a)
{
    return make_int2(a.x + b, a.y + b);
}
inline __host__ __device__ void operator+=(int2 &a, int b)
{
    a.x += b;
    a.y += b;
}

inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)
{
    return make_uint2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void operator+=(uint2 &a, uint2 b)
{
    a.x += b.x;
    a.y += b.y;
}
inline __host__ __device__ uint2 operator+(uint2 a, uint b)
{
    return make_uint2(a.x + b, a.y + b);
}
inline __host__ __device__ uint2 operator+(uint b, uint2 a)
{
    return make_uint2(a.x + b, a.y + b);
}
inline __host__ __device__ void operator+=(uint2 &a, uint b)
{
    a.x += b;
    a.y += b;
}


inline __host__ __device__ float3 operator+(float3 a, float3 b)
{
    return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ void operator+=(float3 &a, float3 b)
{
    a.x += b.x;
    a.y += b.y;
    a.z += b.z;
}
inline __host__ __device__ float3 operator+(float3 a, float b)
{
    return make_float3(a.x + b, a.y + b, a.z + b);
}
inline __host__ __device__ void operator+=(float3 &a, float b)
{
    a.x += b;
    a.y += b;
    a.z += b;
}

inline __host__ __device__ int3 operator+(int3 a, int3 b)
{
    return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ void operator+=(int3 &a, int3 b)
{
    a.x += b.x;
    a.y += b.y;
    a.z += b.z;
}
inline __host__ __device__ int3 operator+(int3 a, int b)
{
    return make_int3(a.x + b, a.y + b, a.z + b);
}
inline __host__ __device__ void operator+=(int3 &a, int b)
{
    a.x += b;
    a.y += b;
    a.z += b;
}

inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
{
    return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
{
    a.x += b.x;
    a.y += b.y;
    a.z += b.z;
}
inline __host__ __device__ uint3 operator+(uint3 a, uint b)
{
    return make_uint3(a.x + b, a.y + b, a.z + b);
}
inline __host__ __device__ void operator+=(uint3 &a, uint b)
{
    a.x += b;
    a.y += b;
    a.z += b;
}

inline __host__ __device__ int3 operator+(int b, int3 a)
{
    return make_int3(a.x + b, a.y + b, a.z + b);
}
inline __host__ __device__ uint3 operator+(uint b, uint3 a)
{
    return make_uint3(a.x + b, a.y + b, a.z + b);
}
inline __host__ __device__ float3 operator+(float b, float3 a)
{
    return make_float3(a.x + b, a.y + b, a.z + b);
}

inline __host__ __device__ float4 operator+(float4 a, float4 b)
{
    return make_float4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
}
inline __host__ __device__ void operator+=(float4 &a, float4 b)
{
    a.x += b.x;
    a.y += b.y;
    a.z += b.z;
    a.w += b.w;
}
inline __host__ __device__ float4 operator+(float4 a, float b)
{
    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
}
inline __host__ __device__ float4 operator+(float b, float4 a)
{
    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
}
inline __host__ __device__ void operator+=(float4 &a, float b)
{
    a.x += b;
    a.y += b;
    a.z += b;
    a.w += b;
}

inline __host__ __device__ int4 operator+(int4 a, int4 b)
{
    return make_int4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
}
inline __host__ __device__ void operator+=(int4 &a, int4 b)
{
    a.x += b.x;
    a.y += b.y;
    a.z += b.z;
    a.w += b.w;
}
inline __host__ __device__ int4 operator+(int4 a, int b)
{
    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
}
inline __host__ __device__ int4 operator+(int b, int4 a)
{
    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
}
inline __host__ __device__ void operator+=(int4 &a, int b)
{
    a.x += b;
    a.y += b;
    a.z += b;
    a.w += b;
}

inline __host__ __device__ uint4 operator+(uint4 a, uint4 b)
{
    return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
}
inline __host__ __device__ void operator+=(uint4 &a, uint4 b)
{
    a.x += b.x;
    a.y += b.y;
    a.z += b.z;
    a.w += b.w;
}
inline __host__ __device__ uint4 operator+(uint4 a, uint b)
{
    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
}
inline __host__ __device__ uint4 operator+(uint b, uint4 a)
{
    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
}
inline __host__ __device__ void operator+=(uint4 &a, uint b)
{
    a.x += b;
    a.y += b;
    a.z += b;
    a.w += b;
}

////////////////////////////////////////////////////////////////////////////////
// subtract
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 operator-(float2 a, float2 b)
{
    return make_float2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ void operator-=(float2 &a, float2 b)
{
    a.x -= b.x;
    a.y -= b.y;
}
inline __host__ __device__ float2 operator-(float2 a, float b)
{
    return make_float2(a.x - b, a.y - b);
}
inline __host__ __device__ float2 operator-(float b, float2 a)
{
    return make_float2(b - a.x, b - a.y);
}
inline __host__ __device__ void operator-=(float2 &a, float b)
{
    a.x -= b;
    a.y -= b;
}

inline __host__ __device__ int2 operator-(int2 a, int2 b)
{
    return make_int2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ void operator-=(int2 &a, int2 b)
{
    a.x -= b.x;
    a.y -= b.y;
}
inline __host__ __device__ int2 operator-(int2 a, int b)
{
    return make_int2(a.x - b, a.y - b);
}
inline __host__ __device__ int2 operator-(int b, int2 a)
{
    return make_int2(b - a.x, b - a.y);
}
inline __host__ __device__ void operator-=(int2 &a, int b)
{
    a.x -= b;
    a.y -= b;
}

inline __host__ __device__ uint2 operator-(uint2 a, uint2 b)
{
    return make_uint2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ void operator-=(uint2 &a, uint2 b)
{
    a.x -= b.x;
    a.y -= b.y;
}
inline __host__ __device__ uint2 operator-(uint2 a, uint b)
{
    return make_uint2(a.x - b, a.y - b);
}
inline __host__ __device__ uint2 operator-(uint b, uint2 a)
{
    return make_uint2(b - a.x, b - a.y);
}
inline __host__ __device__ void operator-=(uint2 &a, uint b)
{
    a.x -= b;
    a.y -= b;
}

inline __host__ __device__ float3 operator-(float3 a, float3 b)
{
    return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ void operator-=(float3 &a, float3 b)
{
    a.x -= b.x;
    a.y -= b.y;
    a.z -= b.z;
}
inline __host__ __device__ float3 operator-(float3 a, float b)
{
    return make_float3(a.x - b, a.y - b, a.z - b);
}
inline __host__ __device__ float3 operator-(float b, float3 a)
{
    return make_float3(b - a.x, b - a.y, b - a.z);
}
inline __host__ __device__ void operator-=(float3 &a, float b)
{
    a.x -= b;
    a.y -= b;
    a.z -= b;
}

inline __host__ __device__ int3 operator-(int3 a, int3 b)
{
    return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ void operator-=(int3 &a, int3 b)
{
    a.x -= b.x;
    a.y -= b.y;
    a.z -= b.z;
}
inline __host__ __device__ int3 operator-(int3 a, int b)
{
    return make_int3(a.x - b, a.y - b, a.z - b);
}
inline __host__ __device__ int3 operator-(int b, int3 a)
{
    return make_int3(b - a.x, b - a.y, b - a.z);
}
inline __host__ __device__ void operator-=(int3 &a, int b)
{
    a.x -= b;
    a.y -= b;
    a.z -= b;
}

inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
{
    return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
{
    a.x -= b.x;
    a.y -= b.y;
    a.z -= b.z;
}
inline __host__ __device__ uint3 operator-(uint3 a, uint b)
{
    return make_uint3(a.x - b, a.y - b, a.z - b);
}
inline __host__ __device__ uint3 operator-(uint b, uint3 a)
{
    return make_uint3(b - a.x, b - a.y, b - a.z);
}
inline __host__ __device__ void operator-=(uint3 &a, uint b)
{
    a.x -= b;
    a.y -= b;
    a.z -= b;
}

inline __host__ __device__ float4 operator-(float4 a, float4 b)
{
    return make_float4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
}
inline __host__ __device__ void operator-=(float4 &a, float4 b)
{
    a.x -= b.x;
    a.y -= b.y;
    a.z -= b.z;
    a.w -= b.w;
}
inline __host__ __device__ float4 operator-(float4 a, float b)
{
    return make_float4(a.x - b, a.y - b, a.z - b,  a.w - b);
}
inline __host__ __device__ void operator-=(float4 &a, float b)
{
    a.x -= b;
    a.y -= b;
    a.z -= b;
    a.w -= b;
}

inline __host__ __device__ int4 operator-(int4 a, int4 b)
{
    return make_int4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
}
inline __host__ __device__ void operator-=(int4 &a, int4 b)
{
    a.x -= b.x;
    a.y -= b.y;
    a.z -= b.z;
    a.w -= b.w;
}
inline __host__ __device__ int4 operator-(int4 a, int b)
{
    return make_int4(a.x - b, a.y - b, a.z - b,  a.w - b);
}
inline __host__ __device__ int4 operator-(int b, int4 a)
{
    return make_int4(b - a.x, b - a.y, b - a.z, b - a.w);
}
inline __host__ __device__ void operator-=(int4 &a, int b)
{
    a.x -= b;
    a.y -= b;
    a.z -= b;
    a.w -= b;
}

inline __host__ __device__ uint4 operator-(uint4 a, uint4 b)
{
    return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
}
inline __host__ __device__ void operator-=(uint4 &a, uint4 b)
{
    a.x -= b.x;
    a.y -= b.y;
    a.z -= b.z;
    a.w -= b.w;
}
inline __host__ __device__ uint4 operator-(uint4 a, uint b)
{
    return make_uint4(a.x - b, a.y - b, a.z - b,  a.w - b);
}
inline __host__ __device__ uint4 operator-(uint b, uint4 a)
{
    return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w);
}
inline __host__ __device__ void operator-=(uint4 &a, uint b)
{
    a.x -= b;
    a.y -= b;
    a.z -= b;
    a.w -= b;
}

////////////////////////////////////////////////////////////////////////////////
// multiply
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 operator*(float2 a, float2 b)
{
    return make_float2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ void operator*=(float2 &a, float2 b)
{
    a.x *= b.x;
    a.y *= b.y;
}
inline __host__ __device__ float2 operator*(float2 a, float b)
{
    return make_float2(a.x * b, a.y * b);
}
inline __host__ __device__ float2 operator*(float b, float2 a)
{
    return make_float2(b * a.x, b * a.y);
}
inline __host__ __device__ void operator*=(float2 &a, float b)
{
    a.x *= b;
    a.y *= b;
}

inline __host__ __device__ int2 operator*(int2 a, int2 b)
{
    return make_int2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ void operator*=(int2 &a, int2 b)
{
    a.x *= b.x;
    a.y *= b.y;
}
inline __host__ __device__ int2 operator*(int2 a, int b)
{
    return make_int2(a.x * b, a.y * b);
}
inline __host__ __device__ int2 operator*(int b, int2 a)
{
    return make_int2(b * a.x, b * a.y);
}
inline __host__ __device__ void operator*=(int2 &a, int b)
{
    a.x *= b;
    a.y *= b;
}

inline __host__ __device__ uint2 operator*(uint2 a, uint2 b)
{
    return make_uint2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ void operator*=(uint2 &a, uint2 b)
{
    a.x *= b.x;
    a.y *= b.y;
}
inline __host__ __device__ uint2 operator*(uint2 a, uint b)
{
    return make_uint2(a.x * b, a.y * b);
}
inline __host__ __device__ uint2 operator*(uint b, uint2 a)
{
    return make_uint2(b * a.x, b * a.y);
}
inline __host__ __device__ void operator*=(uint2 &a, uint b)
{
    a.x *= b;
    a.y *= b;
}

inline __host__ __device__ float3 operator*(float3 a, float3 b)
{
    return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ void operator*=(float3 &a, float3 b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
}
inline __host__ __device__ float3 operator*(float3 a, float b)
{
    return make_float3(a.x * b, a.y * b, a.z * b);
}
inline __host__ __device__ float3 operator*(float b, float3 a)
{
    return make_float3(b * a.x, b * a.y, b * a.z);
}
inline __host__ __device__ void operator*=(float3 &a, float b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
}

inline __host__ __device__ int3 operator*(int3 a, int3 b)
{
    return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ void operator*=(int3 &a, int3 b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
}
inline __host__ __device__ int3 operator*(int3 a, int b)
{
    return make_int3(a.x * b, a.y * b, a.z * b);
}
inline __host__ __device__ int3 operator*(int b, int3 a)
{
    return make_int3(b * a.x, b * a.y, b * a.z);
}
inline __host__ __device__ void operator*=(int3 &a, int b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
}

inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
{
    return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ void operator*=(uint3 &a, uint3 b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
}
inline __host__ __device__ uint3 operator*(uint3 a, uint b)
{
    return make_uint3(a.x * b, a.y * b, a.z * b);
}
inline __host__ __device__ uint3 operator*(uint b, uint3 a)
{
    return make_uint3(b * a.x, b * a.y, b * a.z);
}
inline __host__ __device__ void operator*=(uint3 &a, uint b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
}

inline __host__ __device__ float4 operator*(float4 a, float4 b)
{
    return make_float4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
}
inline __host__ __device__ void operator*=(float4 &a, float4 b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
    a.w *= b.w;
}
inline __host__ __device__ float4 operator*(float4 a, float b)
{
    return make_float4(a.x * b, a.y * b, a.z * b,  a.w * b);
}
inline __host__ __device__ float4 operator*(float b, float4 a)
{
    return make_float4(b * a.x, b * a.y, b * a.z, b * a.w);
}
inline __host__ __device__ void operator*=(float4 &a, float b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
    a.w *= b;
}

inline __host__ __device__ int4 operator*(int4 a, int4 b)
{
    return make_int4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
}
inline __host__ __device__ void operator*=(int4 &a, int4 b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
    a.w *= b.w;
}
inline __host__ __device__ int4 operator*(int4 a, int b)
{
    return make_int4(a.x * b, a.y * b, a.z * b,  a.w * b);
}
inline __host__ __device__ int4 operator*(int b, int4 a)
{
    return make_int4(b * a.x, b * a.y, b * a.z, b * a.w);
}
inline __host__ __device__ void operator*=(int4 &a, int b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
    a.w *= b;
}

inline __host__ __device__ uint4 operator*(uint4 a, uint4 b)
{
    return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
}
inline __host__ __device__ void operator*=(uint4 &a, uint4 b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
    a.w *= b.w;
}
inline __host__ __device__ uint4 operator*(uint4 a, uint b)
{
    return make_uint4(a.x * b, a.y * b, a.z * b,  a.w * b);
}
inline __host__ __device__ uint4 operator*(uint b, uint4 a)
{
    return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w);
}
inline __host__ __device__ void operator*=(uint4 &a, uint b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
    a.w *= b;
}

////////////////////////////////////////////////////////////////////////////////
// divide
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 operator/(float2 a, float2 b)
{
    return make_float2(a.x / b.x, a.y / b.y);
}
inline __host__ __device__ void operator/=(float2 &a, float2 b)
{
    a.x /= b.x;
    a.y /= b.y;
}
inline __host__ __device__ float2 operator/(float2 a, float b)
{
    return make_float2(a.x / b, a.y / b);
}
inline __host__ __device__ void operator/=(float2 &a, float b)
{
    a.x /= b;
    a.y /= b;
}
inline __host__ __device__ float2 operator/(float b, float2 a)
{
    return make_float2(b / a.x, b / a.y);
}

inline __host__ __device__ float3 operator/(float3 a, float3 b)
{
    return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
}
inline __host__ __device__ void operator/=(float3 &a, float3 b)
{
    a.x /= b.x;
    a.y /= b.y;
    a.z /= b.z;
}
inline __host__ __device__ float3 operator/(float3 a, float b)
{
    return make_float3(a.x / b, a.y / b, a.z / b);
}
inline __host__ __device__ void operator/=(float3 &a, float b)
{
    a.x /= b;
    a.y /= b;
    a.z /= b;
}
inline __host__ __device__ float3 operator/(float b, float3 a)
{
    return make_float3(b / a.x, b / a.y, b / a.z);
}

inline __host__ __device__ float4 operator/(float4 a, float4 b)
{
    return make_float4(a.x / b.x, a.y / b.y, a.z / b.z,  a.w / b.w);
}
inline __host__ __device__ void operator/=(float4 &a, float4 b)
{
    a.x /= b.x;
    a.y /= b.y;
    a.z /= b.z;
    a.w /= b.w;
}
inline __host__ __device__ float4 operator/(float4 a, float b)
{
    return make_float4(a.x / b, a.y / b, a.z / b,  a.w / b);
}
inline __host__ __device__ void operator/=(float4 &a, float b)
{
    a.x /= b;
    a.y /= b;
    a.z /= b;
    a.w /= b;
}
inline __host__ __device__ float4 operator/(float b, float4 a)
{
    return make_float4(b / a.x, b / a.y, b / a.z, b / a.w);
}

////////////////////////////////////////////////////////////////////////////////
// min
////////////////////////////////////////////////////////////////////////////////

inline  __host__ __device__ float2 fminf(float2 a, float2 b)
{
    return make_float2(fminf(a.x,b.x), fminf(a.y,b.y));
}
inline __host__ __device__ float3 fminf(float3 a, float3 b)
{
    return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z));
}
inline  __host__ __device__ float4 fminf(float4 a, float4 b)
{
    return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w));
}

inline __host__ __device__ int2 min(int2 a, int2 b)
{
    return make_int2(min(a.x,b.x), min(a.y,b.y));
}
inline __host__ __device__ int3 min(int3 a, int3 b)
{
    return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
}
inline __host__ __device__ int4 min(int4 a, int4 b)
{
    return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
}

inline __host__ __device__ uint2 min(uint2 a, uint2 b)
{
    return make_uint2(min(a.x,b.x), min(a.y,b.y));
}
inline __host__ __device__ uint3 min(uint3 a, uint3 b)
{
    return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
}
inline __host__ __device__ uint4 min(uint4 a, uint4 b)
{
    return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
}

////////////////////////////////////////////////////////////////////////////////
// max
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 fmaxf(float2 a, float2 b)
{
    return make_float2(fmaxf(a.x,b.x), fmaxf(a.y,b.y));
}
inline __host__ __device__ float3 fmaxf(float3 a, float3 b)
{
    return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z));
}
inline __host__ __device__ float4 fmaxf(float4 a, float4 b)
{
    return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w));
}

inline __host__ __device__ int2 max(int2 a, int2 b)
{
    return make_int2(max(a.x,b.x), max(a.y,b.y));
}
inline __host__ __device__ int3 max(int3 a, int3 b)
{
    return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
}
inline __host__ __device__ int4 max(int4 a, int4 b)
{
    return make_int4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
}

inline __host__ __device__ uint2 max(uint2 a, uint2 b)
{
    return make_uint2(max(a.x,b.x), max(a.y,b.y));
}
inline __host__ __device__ uint3 max(uint3 a, uint3 b)
{
    return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
}
inline __host__ __device__ uint4 max(uint4 a, uint4 b)
{
    return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
}

////////////////////////////////////////////////////////////////////////////////
// lerp
// - linear interpolation between a and b, based on value t in [0, 1] range
////////////////////////////////////////////////////////////////////////////////

inline __device__ __host__ float lerp(float a, float b, float t)
{
    return a + t*(b-a);
}
inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
{
    return a + t*(b-a);
}
inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
{
    return a + t*(b-a);
}
inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
{
    return a + t*(b-a);
}

////////////////////////////////////////////////////////////////////////////////
// clamp
// - clamp the value v to be in the range [a, b]
////////////////////////////////////////////////////////////////////////////////

inline __device__ __host__ float clamp(float f, float a, float b)
{
    return fmaxf(a, fminf(f, b));
}
inline __device__ __host__ int clamp(int f, int a, int b)
{
    return max(a, min(f, b));
}
inline __device__ __host__ uint clamp(uint f, uint a, uint b)
{
    return max(a, min(f, b));
}

inline __device__ __host__ float2 clamp(float2 v, float a, float b)
{
    return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
}
inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
{
    return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
}
inline __device__ __host__ float3 clamp(float3 v, float a, float b)
{
    return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}
inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
{
    return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
}
inline __device__ __host__ float4 clamp(float4 v, float a, float b)
{
    return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
}
inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b)
{
    return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
}

inline __device__ __host__ int2 clamp(int2 v, int a, int b)
{
    return make_int2(clamp(v.x, a, b), clamp(v.y, a, b));
}
inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b)
{
    return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
}
inline __device__ __host__ int3 clamp(int3 v, int a, int b)
{
    return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}
inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
{
    return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
}
inline __device__ __host__ int4 clamp(int4 v, int a, int b)
{
    return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
}
inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b)
{
    return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
}

inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b)
{
    return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b));
}
inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b)
{
    return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
}
inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
{
    return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}
inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
{
    return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
}
inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b)
{
    return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
}
inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b)
{
    return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
}

////////////////////////////////////////////////////////////////////////////////
// dot product
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float dot(float2 a, float2 b)
{
    return a.x * b.x + a.y * b.y;
}
inline __host__ __device__ float dot(float3 a, float3 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z;
}
inline __host__ __device__ float dot(float4 a, float4 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}

inline __host__ __device__ int dot(int2 a, int2 b)
{
    return a.x * b.x + a.y * b.y;
}
inline __host__ __device__ int dot(int3 a, int3 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z;
}
inline __host__ __device__ int dot(int4 a, int4 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}

inline __host__ __device__ uint dot(uint2 a, uint2 b)
{
    return a.x * b.x + a.y * b.y;
}
inline __host__ __device__ uint dot(uint3 a, uint3 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z;
}
inline __host__ __device__ uint dot(uint4 a, uint4 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}

////////////////////////////////////////////////////////////////////////////////
// length
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float length(float2 v)
{
    return sqrtf(dot(v, v));
}
inline __host__ __device__ float length(float3 v)
{
    return sqrtf(dot(v, v));
}
inline __host__ __device__ float length(float4 v)
{
    return sqrtf(dot(v, v));
}

////////////////////////////////////////////////////////////////////////////////
// normalize
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 normalize(float2 v)
{
    float invLen = rsqrtf(dot(v, v));
    return v * invLen;
}
inline __host__ __device__ float3 normalize(float3 v)
{
    float invLen = rsqrtf(dot(v, v));
    return v * invLen;
}
inline __host__ __device__ float4 normalize(float4 v)
{
    float invLen = rsqrtf(dot(v, v));
    return v * invLen;
}

////////////////////////////////////////////////////////////////////////////////
// floor
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 floorf(float2 v)
{
    return make_float2(floorf(v.x), floorf(v.y));
}
inline __host__ __device__ float3 floorf(float3 v)
{
    return make_float3(floorf(v.x), floorf(v.y), floorf(v.z));
}
inline __host__ __device__ float4 floorf(float4 v)
{
    return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w));
}

////////////////////////////////////////////////////////////////////////////////
// frac - returns the fractional portion of a scalar or each vector component
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float fracf(float v)
{
    return v - floorf(v);
}
inline __host__ __device__ float2 fracf(float2 v)
{
    return make_float2(fracf(v.x), fracf(v.y));
}
inline __host__ __device__ float3 fracf(float3 v)
{
    return make_float3(fracf(v.x), fracf(v.y), fracf(v.z));
}
inline __host__ __device__ float4 fracf(float4 v)
{
    return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w));
}

////////////////////////////////////////////////////////////////////////////////
// fmod
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 fmodf(float2 a, float2 b)
{
    return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y));
}
inline __host__ __device__ float3 fmodf(float3 a, float3 b)
{
    return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z));
}
inline __host__ __device__ float4 fmodf(float4 a, float4 b)
{
    return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w));
}

////////////////////////////////////////////////////////////////////////////////
// absolute value
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 fabs(float2 v)
{
    return make_float2(fabs(v.x), fabs(v.y));
}
inline __host__ __device__ float3 fabs(float3 v)
{
    return make_float3(fabs(v.x), fabs(v.y), fabs(v.z));
}
inline __host__ __device__ float4 fabs(float4 v)
{
    return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w));
}

inline __host__ __device__ int2 abs(int2 v)
{
    return make_int2(abs(v.x), abs(v.y));
}
inline __host__ __device__ int3 abs(int3 v)
{
    return make_int3(abs(v.x), abs(v.y), abs(v.z));
}
inline __host__ __device__ int4 abs(int4 v)
{
    return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w));
}

////////////////////////////////////////////////////////////////////////////////
// reflect
// - returns reflection of incident ray I around surface normal N
// - N should be normalized, reflected vector's length is equal to length of I
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float3 reflect(float3 i, float3 n)
{
    return i - 2.0f * n * dot(n,i);
}

////////////////////////////////////////////////////////////////////////////////
// cross product
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float3 cross(float3 a, float3 b)
{
    return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
}

////////////////////////////////////////////////////////////////////////////////
// smoothstep
// - returns 0 if x < a
// - returns 1 if x > b
// - otherwise returns smooth interpolation between 0 and 1 based on x
////////////////////////////////////////////////////////////////////////////////

inline __device__ __host__ float smoothstep(float a, float b, float x)
{
    float y = clamp((x - a) / (b - a), 0.0f, 1.0f);
    return (y*y*(3.0f - (2.0f*y)));
}
inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x)
{
    float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
    return (y*y*(make_float2(3.0f) - (make_float2(2.0f)*y)));
}
inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x)
{
    float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
    return (y*y*(make_float3(3.0f) - (make_float3(2.0f)*y)));
}
inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x)
{
    float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
    return (y*y*(make_float4(3.0f) - (make_float4(2.0f)*y)));
}

#endif

Please fix all other compilation errors and update your driver.cu file above.

Okay. Thank you again for all your help/assistance.

Below is the latest driver.cu file with the helper_math.cuh file no longer used. I am using CUDA version 12.4.

I get no compilation errors when building as follows: nvcc driver.cu -o xTest.

When I execute xTest I get the following output:

i (4043): atomic (8092.000000,4046.000000), shared memory (0.000000,0.000000)
Atomic and Optimized kernels do not match.
GPUassert: invalid argument driver.cu 216

The driver.cu code:

#include <cuda.h>
#include <chrono>
#include <stdio.h>
#include <string>
#include <sstream>
#include <vector>
#include <stdexcept>

// for easy gpu error checking
#define GPU_ERROR_CHECK(ans) do{gpuAssert((ans),__FILE__,__LINE__);}while(0)
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
	  fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
	  printf("\nCUDA KERNEL ERROR: CUDA Kernel reports error: %s\n",cudaGetErrorString(code));
	  if (abort) exit(code);
   }
}

inline __host__ __device__ float dot(float3 a, float3 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z;
}

inline __host__ __device__ float length(float3 v)
{
    return sqrtf(dot(v, v));
}

__global__ void mod_kernel(const float3 * pos_a, const float3 * pos_b, const uint32_t input_size,
                           const uint32_t output_size, float2 * result, const int region,
                           const uint32_t first_idx_x, const uint32_t last_idx_x) {
    // External shared memory
    extern __shared__ float2 shared_result[];

    uint32_t thridx_x = threadIdx.x + blockDim.x * blockIdx.x + first_idx_x;
    uint32_t thridx_y = threadIdx.y + blockDim.y * blockIdx.y;
    uint32_t stride_x = blockDim.x * gridDim.x;
    uint32_t stride_y = blockDim.y * gridDim.y;

    float3 distance3 = make_float3(0.0f, 0.0f, 0.0f);

    float distance = 0;
    uint32_t output_start, output_end;

    // Initialize shared memory
    for (uint32_t y = threadIdx.x; y < output_size; y += blockDim.x) {
        shared_result[y] = make_float2(0, 0);
    }
    __syncthreads();

    for (uint32_t x = thridx_x; x < last_idx_x; x += stride_x) {
        distance3.x = pos_a[x].x - pos_b[x].x;
        distance3.y = pos_a[x].y - pos_b[x].y;
        distance3.z = pos_a[x].z - pos_b[x].z;

        distance = length(distance3);
        output_start = __fdiv_rz(output_size,2) + __fdiv_rz(distance,output_size) - region;
        output_end = output_start + region;

        for (uint32_t y = thridx_y; y < output_size; y += stride_y) {
            float val = 1;
            if ((y < output_end) && (y >= output_start)) {
                atomicAdd(&shared_result[y].x, val);
                atomicAdd(&shared_result[y].y, val / 2);
            }
        }
    }
    __syncthreads();
    
    // Write results from shared memory to global memory
    for (uint32_t y = threadIdx.x; y < output_size; y += blockDim.x) {
        atomicAdd(&result[y].x, shared_result[y].x);
        atomicAdd(&result[y].y, shared_result[y].y);
    }
}

/* 
    Atomic Kernel 
    Takes distance between two float3 arrays.
    Uses that to determine relevant range of output array.
    Uses atomics to sum everything. 
    @param pos_a input float3 array containing position data
    @param pos_b input float3 array containing position data
    @param input_size Size of input array
    @param output_size Size of output array (should be same as input_size)
    @param result Result float2 array
    @param region Buffer region for output array (to enforce multiple atomic ops at once)
*/
__global__ void org_kernel(const float3 * pos_a, const float3 * pos_b, 
                           const uint32_t input_size, const uint32_t output_size, 
                           float2 * result, const int region, 
                           const uint32_t first_idx_x, const uint32_t last_idx_x)
{
    uint32_t thridx_x = threadIdx.x + blockDim.x*blockIdx.x + first_idx_x;
    uint32_t thridx_y = threadIdx.y + blockDim.y*blockIdx.y;
    uint32_t stride_x = blockDim.x*gridDim.x;
    uint32_t stride_y = blockDim.y*gridDim.y;

    float3 distance3 = make_float3(0.0f, 0.0f, 0.0f);
    float distance = 0;

    uint32_t output_start, output_end;

    for(uint32_t x = thridx_x; x < last_idx_x; x += stride_x){
        // distance calcs
        distance3.x = pos_a[x].x - pos_b[x].x;
        distance3.y = pos_a[x].y - pos_b[x].y;
        distance3.z = pos_a[x].z - pos_b[x].z;

        distance = length(distance3);
        output_start = __fdiv_rz(output_size,2) + __fdiv_rz(distance,output_size) - region;
        output_end = output_start + region;
        
        for(uint32_t y = thridx_y; y < output_size; y += stride_y){
            float val = 1;
            if((y < output_end) && (y >= output_start)){
                atomicAdd(&result[y].x, val);
                atomicAdd(&result[y].y, val / 2);
            }                        
        }
    }
}

bool eval_arrays_equal(float2 * d_atomic, float2 * d_shmem, uint32_t n) {
    if (d_atomic == nullptr || d_shmem == nullptr) {
        throw std::invalid_argument("Arrays are NULL.");
    }
    if (n < 1) {
        throw std::invalid_argument("Invalid array length, less than 1.");
    }

    float2 * h_atomic;
    float2 * h_shmem;
    size_t sz = n * sizeof(float2);
    h_atomic = (float2*)malloc(sz);
    h_shmem  = (float2*)malloc(sz);

    GPU_ERROR_CHECK(cudaMemcpy(h_atomic, d_atomic, sz, cudaMemcpyDeviceToHost));
    GPU_ERROR_CHECK(cudaMemcpy(h_shmem,  d_shmem,  sz, cudaMemcpyDeviceToHost));

    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    for (uint32_t i = 0; i < n; ++i) {
        if (h_atomic[i].x != h_shmem[i].x || h_atomic[i].y != h_shmem[i].y) {
            printf("\ti (%u): atomic (%f,%f), shared memory (%f,%f)\n", i, h_atomic[i].x, h_atomic[i].y, h_shmem[i].x, h_shmem[i].y);
            free(h_atomic);
            free(h_shmem);
            return false;
        }
    }

    free(h_atomic);
    free(h_shmem);

    // Every element is equal
    return true;
}

int main(int argc, char * argv[]) {
    dim3 nthreads(512,1,1);
    dim3 nblocks(1,512,1);

    uint32_t n_values = 8092;
    uint32_t region = 3;

    uint32_t n_float3s = n_values;
    uint32_t float3_sz = n_float3s * sizeof(float3);
    uint32_t output_sz = n_values * sizeof(float2);

    // Allocate host & device side
    float2 *d_output_array_atomic, *d_output_array_shmem;
    GPU_ERROR_CHECK(cudaMalloc(&d_output_array_atomic, output_sz));
    GPU_ERROR_CHECK(cudaMemset(d_output_array_atomic, 0, output_sz));
    GPU_ERROR_CHECK(cudaMalloc(&d_output_array_shmem, output_sz));
    GPU_ERROR_CHECK(cudaMemset(d_output_array_shmem, 0, output_sz));

    // Float3s
    float3 *pos_a, *pos_b;
    float3 *d_pos_a, *d_pos_b;
    pos_a = (float3*)malloc(float3_sz);
    pos_b = (float3*)malloc(float3_sz);
    for(size_t p = 0; p < n_float3s; ++p){
        pos_a[p] = make_float3(1,1,1);
        pos_b[p] = make_float3(0.1,0.1,0.1);
    }
    GPU_ERROR_CHECK(cudaMalloc(&d_pos_a, float3_sz));
    GPU_ERROR_CHECK(cudaMalloc(&d_pos_b, float3_sz));
    GPU_ERROR_CHECK(cudaMemcpy(d_pos_a, pos_a, float3_sz, cudaMemcpyHostToDevice));
    GPU_ERROR_CHECK(cudaMemcpy(d_pos_b, pos_b, float3_sz, cudaMemcpyHostToDevice));
    GPU_ERROR_CHECK(cudaDeviceSynchronize());

    /*
        ATOMIC
    */
    org_kernel<<<nthreads,nblocks,0,0>>>(d_pos_a, d_pos_b, n_values, n_values, d_output_array_atomic, region, 0, n_values);
    GPU_ERROR_CHECK(cudaDeviceSynchronize());

    /*
        ATOMIC SHARED MEMORY
    */
    mod_kernel<<<nthreads,nblocks,output_sz,0>>>(d_pos_a, d_pos_b, n_values, n_values, d_output_array_shmem, region, 0, n_values);
    GPU_ERROR_CHECK(cudaDeviceSynchronize());

    // Check for fidelity
    if (eval_arrays_equal(d_output_array_atomic, d_output_array_shmem, n_values)) {
        printf("Fidelity achieved.\n");
    }else {
        printf("Atomic and Optimized kernels do not match.\n");
    }

    GPU_ERROR_CHECK(cudaPeekAtLastError());
    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    GPU_ERROR_CHECK(cudaDeviceReset());

    return 0;
}

Doh. I found the issue (I think): the shared-memory initialization and write-back loops were iterating in the x-direction when they should have been iterating in the y-direction.

However, it now appears that the Atomic and Optimized kernel outputs do not match for any value of n_values greater than 6144. I am guessing that this has something to do with shared memory limitations(?).
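One way to check that guess is to compare the dynamic shared-memory size requested at launch against the device's default per-block limit. A minimal sketch (using cudaDeviceGetAttribute and the same GPU_ERROR_CHECK macro as in driver.cu):

// Sketch: compare the requested dynamic shared memory against the
// device's default per-block limit (typically 48 KB).
int dev = 0, max_shmem_default = 0;
GPU_ERROR_CHECK(cudaGetDevice(&dev));
GPU_ERROR_CHECK(cudaDeviceGetAttribute(&max_shmem_default,
                                       cudaDevAttrMaxSharedMemoryPerBlock, dev));
size_t requested = (size_t)n_values * sizeof(float2);  // what mod_kernel asks for at launch
printf("requested %zu bytes, default limit %d bytes per block\n", requested, max_shmem_default);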

Below is the updated driver.cu file:

#include <cuda.h>
#include <chrono>
#include <stdio.h>
#include <string>
#include <sstream>
#include <vector>
#include <stdexcept>
#include <cstdlib>

// for easy gpu error checking
#define GPU_ERROR_CHECK(ans) do{gpuAssert((ans),__FILE__,__LINE__);}while(0)
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
	  fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
	  printf("\nCUDA KERNEL ERROR: CUDA Kernel reports error: %s\n",cudaGetErrorString(code));
	  if (abort) exit(code);
   }
}

inline __host__ __device__ float dot(float3 a, float3 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z;
}

inline __host__ __device__ float length(float3 v)
{
    return sqrtf(dot(v, v));
}

__global__ void mod_kernel(const float3 * pos_a, const float3 * pos_b, const uint32_t input_size,
                           const uint32_t output_size, float2 * result, const int region,
                           const uint32_t first_idx_x, const uint32_t last_idx_x) {
    // External shared memory
    extern __shared__ float2 shared_result[];

    uint32_t thridx_x = threadIdx.x + blockDim.x * blockIdx.x + first_idx_x;
    uint32_t thridx_y = threadIdx.y + blockDim.y * blockIdx.y;
    uint32_t stride_x = blockDim.x * gridDim.x;
    uint32_t stride_y = blockDim.y * gridDim.y;

    float3 distance3 = make_float3(0.0f, 0.0f, 0.0f);

    float distance = 0;
    uint32_t output_start, output_end;

    // Initialize shared memory
    for (uint32_t y = threadIdx.y; y < output_size; y += blockDim.y) {
        shared_result[y] = make_float2(0, 0);
    }
    __syncthreads();

    for (uint32_t x = thridx_x; x < last_idx_x; x += stride_x) {
        distance3.x = pos_a[x].x - pos_b[x].x;
        distance3.y = pos_a[x].y - pos_b[x].y;
        distance3.z = pos_a[x].z - pos_b[x].z;

        distance = length(distance3);
        //output_start = __fdiv_rz(output_size,2) + __fdiv_rz(distance,output_size);
        //output_end = output_start;
        output_start = __fdiv_rz(output_size,2) + __fdiv_rz(distance,output_size) - region;
        output_end = output_start + region;

        for (uint32_t y = thridx_y; y < output_size; y += stride_y) {
            float val = 1;
            if ((y < output_end) && (y >= output_start)) {
                atomicAdd(&shared_result[y].x, val);
                atomicAdd(&shared_result[y].y, val / 2);
            }
        }
    }
    __syncthreads();
    
    // Write results from shared memory to global memory
    for (uint32_t y = threadIdx.y; y < output_size; y += blockDim.y) {
        atomicAdd(&result[y].x, shared_result[y].x);
        atomicAdd(&result[y].y, shared_result[y].y);
    }
}

/* 
    Atomic Kernel 
    Takes distance between two float3 arrays.
    Uses that to determine relevant range of output array.
    Uses atomics to sum everything. 
    @param pos_a input float3 array containing position data
    @param pos_b input float3 array containing position data
    @param input_size Size of input array
    @param output_size Size of output array (should be same as input_size)
    @param result Result float2 array
    @param region Buffer region for output array (to enforce multiple atomic ops at once)
*/
__global__ void org_kernel(const float3 * pos_a, const float3 * pos_b, 
                           const uint32_t input_size, const uint32_t output_size, 
                           float2 * result, const int region, 
                           const uint32_t first_idx_x, const uint32_t last_idx_x)
{
    uint32_t thridx_x = threadIdx.x + blockDim.x*blockIdx.x + first_idx_x;
    uint32_t thridx_y = threadIdx.y + blockDim.y*blockIdx.y;
    uint32_t stride_x = blockDim.x*gridDim.x;
    uint32_t stride_y = blockDim.y*gridDim.y;

    float3 distance3 = make_float3(0.0f, 0.0f, 0.0f);
    float distance = 0;

    uint32_t output_start, output_end;

    for(uint32_t x = thridx_x; x < last_idx_x; x += stride_x){
        // distance calcs
        distance3.x = pos_a[x].x - pos_b[x].x;
        distance3.y = pos_a[x].y - pos_b[x].y;
        distance3.z = pos_a[x].z - pos_b[x].z;

        distance = length(distance3);
        //output_start = __fdiv_rz(output_size, 2) + __fdiv_rz(distance, output_size);
        //output_end = output_start; 
        output_start = __fdiv_rz(output_size,2) + __fdiv_rz(distance,output_size) - region;
        output_end = output_start + region;
        
        for(uint32_t y = thridx_y; y < output_size; y += stride_y){
            float val = 1;
            if((y < output_end) && (y >= output_start)){
                atomicAdd(&result[y].x, val);
                atomicAdd(&result[y].y, val / 2);
            }                        
        }
    }
}

void eval_arrays(float2 * d_atomic, float2 * d_shmem, uint32_t n) {
    if (d_atomic == nullptr || d_shmem == nullptr) {
        throw std::invalid_argument("Arrays are NULL.");
    }
    if (n < 1) {
        throw std::invalid_argument("Invalid array length, less than 1.");
    }
    float2 * h_atomic;
    float2 * h_shmem;
    size_t sz = n * sizeof(float2);
    h_atomic = (float2*)malloc(sz);
    h_shmem  = (float2*)malloc(sz);

    GPU_ERROR_CHECK(cudaMemcpy(h_atomic, d_atomic, sz, cudaMemcpyDeviceToHost));
    GPU_ERROR_CHECK(cudaMemcpy(h_shmem,  d_shmem,  sz, cudaMemcpyDeviceToHost));
    GPU_ERROR_CHECK(cudaDeviceSynchronize());

    printf("Idx:\t ATOMIC\t\t\tSHARED_MEM_ATOMIC:\n");
    for (uint32_t i = 0; i < n; ++i) {
        printf("i (%i):\t(%f,%f)\t(%f,%f)\n", i, h_atomic[i].x, h_atomic[i].y, h_shmem[i].x, h_shmem[i].y);
    }

    free(h_atomic);
    free(h_shmem);
}

bool eval_arrays_equal(float2 * d_atomic, float2 * d_shmem, uint32_t n) {
    if (d_atomic == nullptr || d_shmem == nullptr) {
        throw std::invalid_argument("Arrays are NULL.");
    }
    if (n < 1) {
        throw std::invalid_argument("Invalid array length, less than 1.");
    }

    float2 * h_atomic;
    float2 * h_shmem;
    size_t sz = n * sizeof(float2);
    h_atomic = (float2*)malloc(sz);
    h_shmem  = (float2*)malloc(sz);

    GPU_ERROR_CHECK(cudaMemcpy(h_atomic, d_atomic, sz, cudaMemcpyDeviceToHost));
    GPU_ERROR_CHECK(cudaMemcpy(h_shmem,  d_shmem,  sz, cudaMemcpyDeviceToHost));

    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    for (uint32_t i = 0; i < n; ++i) {
        if (h_atomic[i].x != h_shmem[i].x || h_atomic[i].y != h_shmem[i].y) {
            printf("\ti (%u): atomic (%f,%f), shared memory (%f,%f)\n", i, h_atomic[i].x, h_atomic[i].y, h_shmem[i].x, h_shmem[i].y);
            free(h_atomic);
            free(h_shmem);
            return false;
        }
    }

    free(h_atomic);
    free(h_shmem);

    // Every element is equal
    return true;
}

int main(int argc, char * argv[]) {
    dim3 nthreads(512,1,1);
    dim3 nblocks(1,512,1);

    // AT THIS VALUE THE RESULTS NO LONGER MATCH !!!!
    uint32_t n_values = 6145;
    
    // *************************
    // THESE WORK !!!!!!!!!
    // *************************
    //uint32_t n_values = 32;
    //uint32_t n_values = 1024;
    //uint32_t n_values = 1027;
    //uint32_t n_values = 2048;
    //uint32_t n_values = 2049;
    //uint32_t n_values = 4096;
    //uint32_t n_values = 4097;
    //uint32_t n_values = 5000;
    //uint32_t n_values = 5007;
    //uint32_t n_values = 6139;
    //uint32_t n_values = 6144;

    uint32_t region = 3;

    uint32_t n_float3s = n_values;
    uint32_t float3_sz = n_float3s * sizeof(float3);
    uint32_t output_sz = n_values * sizeof(float2);

    // Allocate host & device side
    float2 *d_output_array_atomic, *d_output_array_shmem;
    GPU_ERROR_CHECK(cudaMalloc(&d_output_array_atomic, output_sz));
    GPU_ERROR_CHECK(cudaMemset(d_output_array_atomic, 0, output_sz));
    GPU_ERROR_CHECK(cudaMalloc(&d_output_array_shmem, output_sz));
    GPU_ERROR_CHECK(cudaMemset(d_output_array_shmem, 0, output_sz));

    // Float3s
    float3 *pos_a, *pos_b;
    float3 *d_pos_a, *d_pos_b;
    pos_a = (float3*)malloc(float3_sz);
    pos_b = (float3*)malloc(float3_sz);
    for(size_t p = 0; p < n_float3s; ++p){
        pos_a[p] = make_float3(1,1,1);
        pos_b[p] = make_float3(0.1,0.1,0.1);
    }
    GPU_ERROR_CHECK(cudaMalloc(&d_pos_a, float3_sz));
    GPU_ERROR_CHECK(cudaMalloc(&d_pos_b, float3_sz));
    GPU_ERROR_CHECK(cudaMemcpy(d_pos_a, pos_a, float3_sz, cudaMemcpyHostToDevice));
    GPU_ERROR_CHECK(cudaMemcpy(d_pos_b, pos_b, float3_sz, cudaMemcpyHostToDevice));
    GPU_ERROR_CHECK(cudaDeviceSynchronize());

    /*
        ATOMIC
    */
    org_kernel<<<nthreads,nblocks,0,0>>>(d_pos_a, d_pos_b, n_values, n_values, d_output_array_atomic, region, 0, n_values);
    GPU_ERROR_CHECK(cudaDeviceSynchronize());

    /*
        ATOMIC SHARED MEMORY
    */
    mod_kernel<<<nthreads,nblocks,output_sz,0>>>(d_pos_a, d_pos_b, n_values, n_values, d_output_array_shmem, region, 0, n_values);
    GPU_ERROR_CHECK(cudaDeviceSynchronize());

    // Check for fidelity
    if (eval_arrays_equal(d_output_array_atomic, d_output_array_shmem, n_values)) {
        printf("Fidelity achieved.\n");
    }else {
        printf("Atomic and Optimized kernels do not match.\n");
        return EXIT_FAILURE;
    }

    // Output array results
    //eval_arrays(d_output_array_atomic, d_output_array_shmem, n_values);

    GPU_ERROR_CHECK(cudaPeekAtLastError());
    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    GPU_ERROR_CHECK(cudaDeviceReset());

    return EXIT_SUCCESS;
}

An error like that (the "GPUassert: invalid argument" you reported) should not be ignored. If you get an error like that, you should immediately track it down.

I think that is probably at least one issue you have. Shared memory (unless you do something special, which you haven't) is limited to 48 KB per threadblock, and 6144 x 8 bytes = 48 KB, which is exactly where your results stop matching.
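If the GPU supports it (Volta-class and newer parts generally do), a kernel can opt in to more than 48 KB of dynamic shared memory with cudaFuncSetAttribute before it is launched. A minimal sketch, reusing the mod_kernel and output_sz names from the driver.cu above:

// Sketch: opt mod_kernel in to more than 48 KB of dynamic shared memory,
// up to the device's opt-in limit (device-dependent, e.g. 96 KB or more).
int dev = 0, max_optin = 0;
GPU_ERROR_CHECK(cudaGetDevice(&dev));
GPU_ERROR_CHECK(cudaDeviceGetAttribute(&max_optin,
                                       cudaDevAttrMaxSharedMemoryPerBlockOptin, dev));
if (output_sz <= (uint32_t)max_optin) {
    GPU_ERROR_CHECK(cudaFuncSetAttribute(mod_kernel,
                                         cudaFuncAttributeMaxDynamicSharedMemorySize,
                                         (int)output_sz));
    // ...then launch mod_kernel with output_sz bytes of dynamic shared memory as before.
}

Whether 8092 float2 elements (about 63 KB) fit even after opting in depends on the specific GPU.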

That is not the error-checking I would recommend (for clarity) around a kernel launch, but to each their own.
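For example, one common pattern (not necessarily the exact one being alluded to) is to check cudaGetLastError() immediately after the launch and then check the result of the synchronize:

mod_kernel<<<nthreads, nblocks, output_sz, 0>>>(d_pos_a, d_pos_b, n_values, n_values,
                                                d_output_array_shmem, region, 0, n_values);
GPU_ERROR_CHECK(cudaGetLastError());       // catches launch/configuration errors, e.g. too much shared memory
GPU_ERROR_CHECK(cudaDeviceSynchronize());  // catches errors raised while the kernel runs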

For future reference, in CUDA the number of blocks in a kernel launch is the first kernel config argument and the number of threads per block is the second, so your nthreads and nblocks variables are swapped relative to how they are actually being used. At the moment neither one (both are ~512) exceeds any limit, but you may get into confusing situations later if you change those values.
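For example, renaming the dim3 variables from the driver.cu above by their role keeps the same launch geometry while making the intent explicit:

dim3 grid(512, 1, 1);   // number of blocks    -> first kernel config argument
dim3 block(1, 512, 1);  // threads per block   -> second kernel config argument
org_kernel<<<grid, block>>>(d_pos_a, d_pos_b, n_values, n_values,
                            d_output_array_atomic, region, 0, n_values);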