Optimizing CUDA kernel with atomics

Good afternoon,

I realize this may be considered a re-post, but the subject has changed now that I am able to achieve results that match the original CUDA kernel.

Anyway, I would still like to squeeze more performance out of my mod_kernel compared to the corresponding org_kernel, if possible.

Could anyone look at the following C++ CUDA code and tell me whether there is any obvious place where mod_kernel could be accelerated relative to org_kernel?

The mod_kernel is faster (1.5x to 2.0x) than org_kernel for input element counts below 131,048. For element counts above 131,048, however, performance is about the same, maybe slightly slower. Perhaps this is as fast as the defined algorithm can go?
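
(For reference, if I am counting correctly, each input element in the code below updates region = 3 output bins with two atomicAdds per bin, so the total atomic traffic is roughly 2 * region * n_values adds - on the order of 786,000 atomics at 131,048 elements - and it grows linearly with the element count.)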

As an aside, my permissions around nvprof are somewhat restricted and I have to use the basic command line - I am unable to execute things such as nvprof --metrics achieved_occupancy,warp_execution_efficiency.
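
Since those metrics are unavailable to me, I time the kernels on the host with std::chrono (see the code below). An equivalent per-kernel timing sketch using CUDA events - assuming the same launch configuration and the GPU_ERROR_CHECK macro from the full listing - would look something like this:

    // Sketch: timing one launch of org_kernel with CUDA events instead of std::chrono.
    cudaEvent_t ev_start, ev_stop;
    GPU_ERROR_CHECK(cudaEventCreate(&ev_start));
    GPU_ERROR_CHECK(cudaEventCreate(&ev_stop));

    GPU_ERROR_CHECK(cudaEventRecord(ev_start, 0));
    org_kernel<<<nthreads, nblocks, 0, 0>>>(d_pos_a, d_pos_b, n_values, n_values,
                                            d_out_org, region, first_idx_x, last_idx_x);
    GPU_ERROR_CHECK(cudaEventRecord(ev_stop, 0));
    GPU_ERROR_CHECK(cudaEventSynchronize(ev_stop));

    float elapsed_ms = 0.0f;   // GPU-side elapsed time in milliseconds
    GPU_ERROR_CHECK(cudaEventElapsedTime(&elapsed_ms, ev_start, ev_stop));

    GPU_ERROR_CHECK(cudaEventDestroy(ev_start));
    GPU_ERROR_CHECK(cudaEventDestroy(ev_stop));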

Many thanks again for any help/hints. I have been learning a lot about CUDA performance, both its benefits and its limitations, from your guidance.

#include <cuda.h>
#include <chrono>
#include <stdio.h>
#include <string>
#include <sstream>
#include <vector>
#include <stdexcept>
#include <cstdlib>
#include <cmath>

// for easy gpu error checking
#define GPU_ERROR_CHECK(ans) do{gpuAssert((ans),__FILE__,__LINE__);}while(0)
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      printf("\nCUDA KERNEL ERROR: CUDA Kernel reports error: %s\n",cudaGetErrorString(code));
      if (abort) exit(code);
   }
}

inline __host__ __device__ float dot(float3 a, float3 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z;
}

inline __host__ __device__ float length(float3 v)
{
    return sqrtf(dot(v, v));
}

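// Simple test payload used by both kernels: returns (val, val/2).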
inline __device__ float2 myKernel(float val) {
    return make_float2(val, val / 2);
}

__global__ void mod_kernel(const float3 * pos_a, const float3 * pos_b, 
                           const uint32_t input_size, const uint32_t output_size, 
                           float2 * result, const int region, 
                           const uint32_t first_idx_x, const uint32_t last_idx_x)
{
    extern __shared__ float3 shared_mem[];
    float3* shared_pos_a = shared_mem;
    float3* shared_pos_b = shared_mem + blockDim.x;

    uint32_t thridx_x = threadIdx.x + blockDim.x * blockIdx.x + first_idx_x;
    uint32_t thridx_y = threadIdx.y + blockDim.y * blockIdx.y;
    uint32_t stride_x = blockDim.x * gridDim.x;
    uint32_t stride_y = blockDim.y * gridDim.y;

    float3 distance3 = make_float3(0.0f, 0.0f, 0.0f);
    float distance = 0;

    uint32_t output_start, output_end;

    for(uint32_t x = thridx_x; x < last_idx_x; x += stride_x){
        if (x >= input_size) break;

        // Load data into shared memory
        if (threadIdx.x < blockDim.x) {
            shared_pos_a[threadIdx.x] = pos_a[x];
            shared_pos_b[threadIdx.x] = pos_b[x];
        }
        __syncthreads();

        // distance calcs
        distance3.x = shared_pos_a[threadIdx.x].x - shared_pos_b[threadIdx.x].x;
        distance3.y = shared_pos_a[threadIdx.x].y - shared_pos_b[threadIdx.x].y;
        distance3.z = shared_pos_a[threadIdx.x].z - shared_pos_b[threadIdx.x].z;

        distance = length(distance3);
        output_start = __fdiv_rz(output_size, 2) + __fdiv_rz(distance, output_size) - region;
        output_end = output_start + region;
        
        for(uint32_t y = thridx_y; y < output_size; y += stride_y){
            float val = 1;
            if((y < output_end) && (y >= output_start)){
                float2 lval = myKernel(val);
                atomicAdd(&result[y].x, lval.x);
                atomicAdd(&result[y].y, lval.y);
            }                        
        }
        __syncthreads();
    }
}

__global__ void org_kernel(const float3 * pos_a, const float3 * pos_b, 
                           const uint32_t input_size, const uint32_t output_size, 
                           float2 * result, const int region, 
                           const uint32_t first_idx_x, const uint32_t last_idx_x)
{
    uint32_t thridx_x = threadIdx.x + blockDim.x * blockIdx.x + first_idx_x;
    uint32_t thridx_y = threadIdx.y + blockDim.y * blockIdx.y;
    uint32_t stride_x = blockDim.x * gridDim.x;
    uint32_t stride_y = blockDim.y * gridDim.y;

    float3 distance3 = make_float3(0.0f, 0.0f, 0.0f);
    float distance = 0;

    uint32_t output_start, output_end;

    for(uint32_t x = thridx_x; x < last_idx_x; x += stride_x){
        // distance calcs
        distance3.x = pos_a[x].x - pos_b[x].x;
        distance3.y = pos_a[x].y - pos_b[x].y;
        distance3.z = pos_a[x].z - pos_b[x].z;

        distance = length(distance3);
        output_start = __fdiv_rz(output_size, 2) + __fdiv_rz(distance, output_size) - region;
        output_end = output_start + region;
        
        for(uint32_t y = thridx_y; y < output_size; y += stride_y){
            float val = 1;
            if((y < output_end) && (y >= output_start)){
                float2 lval = myKernel(val);
                atomicAdd(&result[y].x, lval.x);
                atomicAdd(&result[y].y, lval.y);
            }                        
        }
    }
}

bool eval_arrays_equal(float2 * d_atomic, float2 * d_mod, uint32_t n) {
    if (d_atomic == nullptr || d_mod == nullptr) {
        throw std::invalid_argument("Arrays are NULL.");
    }
    if (n < 1) {
        throw std::invalid_argument("Invalid array length, less than 1.");
    }

    float2 * h_atomic;
    float2 * h_mod;
    size_t sz = n * sizeof(float2);
    h_atomic = (float2*)malloc(sz);
    h_mod  = (float2*)malloc(sz);

    GPU_ERROR_CHECK(cudaMemcpy(h_atomic, d_atomic, sz, cudaMemcpyDeviceToHost));
    GPU_ERROR_CHECK(cudaMemcpy(h_mod, d_mod, sz, cudaMemcpyDeviceToHost));

    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    bool equal = true;
    for (uint32_t i = 0; i < n; ++i) {
        if (h_atomic[i].x != h_mod[i].x || h_atomic[i].y != h_mod[i].y) {
            printf("\ti (%u): atomic (%f,%f), modified (%f,%f)\n", i, h_atomic[i].x, h_atomic[i].y, h_mod[i].x, h_mod[i].y);
            equal = false;
            break;
        }
    }

    // Free host buffers on both the match and mismatch paths
    free(h_atomic);
    free(h_mod);

    return equal;
}

int main(int argc, char * argv[]) {
    if (argc != 2) {
        fprintf(stderr, "\nPass the number of array elements via command line as follows:\n");
        fprintf(stderr, "./xTest <num_elems>\n\n");
        return EXIT_FAILURE;
    }

    // Dimensions
    dim3 nthreads(512,1,1);
    dim3 nblocks(1,512,1);
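    // Note: in the launches below, nthreads is passed as the first <<<>>> argument
    // (the grid dimension) and nblocks as the second (the block dimension).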

    uint32_t n_values = static_cast<uint32_t>(std::stoi(argv[1]));
    uint32_t region = 3;

    uint32_t n_float3s = n_values;
    uint32_t float3_sz = n_float3s * sizeof(float3);
    uint32_t output_sz = n_values * sizeof(float2);

    // Allocate host & device side
    float2 *d_out_org;
    float2 *d_out_mod;
    GPU_ERROR_CHECK(cudaMalloc(&d_out_org, output_sz));
    GPU_ERROR_CHECK(cudaMalloc(&d_out_mod, output_sz));
    GPU_ERROR_CHECK(cudaMemset(d_out_org, 0, output_sz));
    GPU_ERROR_CHECK(cudaMemset(d_out_mod, 0, output_sz));

    // Float3s
    float3 *pos_a, *pos_b;
    float3 *d_pos_a, *d_pos_b;
    pos_a = (float3*)malloc(float3_sz);
    pos_b = (float3*)malloc(float3_sz);
    for(size_t p = 0; p < n_float3s; ++p){
        pos_a[p] = make_float3(1,1,1);
        pos_b[p] = make_float3(0.1,0.1,0.1);
    }
    GPU_ERROR_CHECK(cudaMalloc(&d_pos_a, float3_sz));
    GPU_ERROR_CHECK(cudaMalloc(&d_pos_b, float3_sz));
    GPU_ERROR_CHECK(cudaMemcpy(d_pos_a, pos_a, float3_sz, cudaMemcpyHostToDevice));
    GPU_ERROR_CHECK(cudaMemcpy(d_pos_b, pos_b, float3_sz, cudaMemcpyHostToDevice));
    GPU_ERROR_CHECK(cudaDeviceSynchronize());

    float total_time_org = 0.0f;
    float total_time_mod = 0.0f;

    uint32_t first_idx_x = 0;
    uint32_t last_idx_x  = n_values;

    auto start = std::chrono::high_resolution_clock::now();
    org_kernel<<<nthreads,nblocks,0,0>>>(d_pos_a, d_pos_b, n_values, n_values, d_out_org, region, first_idx_x, last_idx_x);
    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    auto stop = std::chrono::high_resolution_clock::now();
    total_time_org = static_cast<float>(std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count());

    size_t max_shared_mem_bytes = 48 * 1024;  // Assuming shared memory max of 48 KB
    size_t float3_size = sizeof(float3);
    size_t max_elements_per_chunk = max_shared_mem_bytes / (2 * float3_size); // 2 arrays in shared memory

    for (uint32_t i = 0; i < n_values; i += max_elements_per_chunk) {
        uint32_t chunk_size = (i + max_elements_per_chunk < n_values) ? max_elements_per_chunk : (n_values - i);
        size_t shareMemSize = 2 * chunk_size * float3_size;

        start = std::chrono::high_resolution_clock::now();
        mod_kernel<<<nthreads,nblocks,shareMemSize,0>>>(d_pos_a, d_pos_b, n_values, n_values, d_out_mod, region, i, i + chunk_size);
        GPU_ERROR_CHECK(cudaDeviceSynchronize());
        stop = std::chrono::high_resolution_clock::now();
        total_time_mod += static_cast<float>(std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count());
    }

    // Check for fidelity
    if (eval_arrays_equal(d_out_org, d_out_mod, n_values)) {
        printf("\nFidelity achieved.\n");
        printf("\t[ORIGINAL] Time: %8.9f (us.)\n", total_time_org);
        printf("\t[MODIFIED] Time: %8.9f (us.)\n", total_time_mod);
    } else {
        printf("\nAtomic and Optimized kernels do not match.\n");
        return EXIT_FAILURE;
    }

    GPU_ERROR_CHECK(cudaPeekAtLastError());
    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    GPU_ERROR_CHECK(cudaDeviceReset());

    return EXIT_SUCCESS;
}