Speeding up kernel with atomicAdd and results do not match


Do you have any recommendations for accelerating the following CUDA kernel for larger array sizes? My initial thought was to reduce the number of atomicAdd calls by using shared memory, but that may not be a very robust approach.

The following is the original CUDA code that I am trying to accelerate:

__global__ void org_kernel(const float3 * pos_a, const float3 * pos_b, 
                           const uint32_t input_size, const uint32_t output_size, 
                           float2 * result, const int region, 
                           const uint32_t first_idx_x, const uint32_t last_idx_x)
{
    uint32_t thridx_x = threadIdx.x + blockDim.x * blockIdx.x + first_idx_x;
    uint32_t thridx_y = threadIdx.y + blockDim.y*blockIdx.y;
    uint32_t stride_x = blockDim.x * gridDim.x;
    uint32_t stride_y = blockDim.y * gridDim.y;

    float3 distance3 = make_float3(0.0f, 0.0f, 0.0f);
    float distance = 0;

    uint32_t output_start, output_end;

    for(uint32_t x = thridx_x; x < last_idx_x; x += stride_x){
        // distance calcs
        distance3.x = pos_a[x].x - pos_b[x].x;
        distance3.y = pos_a[x].y - pos_b[x].y;
        distance3.z = pos_a[x].z - pos_b[x].z;

        distance = length(distance3);
        output_start = __fdiv_rz(output_size, 2) + __fdiv_rz(distance, output_size) - region;
        output_end = output_start + region;
        
        for(uint32_t y = thridx_y; y < output_size; y += stride_y){
            float val = 1;
            if((y < output_end) && (y >= output_start)){
                atomicAdd(&result[y].x, val);
                atomicAdd(&result[y].y, val / 2);
            }                        
        }
    }
}

How large are “larger sized arrays”? And what is a typical value for output_end - output_start?
You could think about using all threads of a block to process a single x value. That way you would not need atomics on shared memory.

Something similar to this:


extern __shared__ float smemResults[];   // one float per output column, zeroed below
for(int i = threadIdx.x; i < smemSize; i += blockDim.x){
    smemResults[i] = 0.0f;
}
__syncthreads();
...
for(int x = blockIdx.x; x < numX; x += gridDim.x){
    ... // compute distance of pos_a[x] and pos_b[x], then start/end

    // all threads of the block cooperate on this one x and write
    // distinct indices, so no atomics are needed in shared memory
    for(int i = threadIdx.x; i < end - start; i += blockDim.x){
        smemResults[start + i] += 1.0f;
    }
    __syncthreads();   // ranges of successive x values may overlap
}

for(int i = threadIdx.x; i < smemSize; i += blockDim.x){
    float2 res = make_float2(smemResults[i], smemResults[i] / 2);
    //atomicAdd(&globalResults[i], res);   // float2 overload of atomicAdd only available on sm_90+
    atomicAdd(&globalResults[i].x, res.x);
    atomicAdd(&globalResults[i].y, res.y);
}
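One more note on the sketch: smemResults is dynamically sized, so its byte count is passed as the third launch parameter. Minimal usage, where numBlocks, blockSize, smemSize, and sketch_kernel are placeholders rather than names from your code:

size_t smemBytes = smemSize * sizeof(float);   // one float per output column in the partition
sketch_kernel<<<numBlocks, blockSize, smemBytes>>>(/* ... */);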

Hi @striker159,

The arrays could be much larger than what fits in shared memory. I am also stuck with these values for the dim3 launch variables:

dim3 ngrid(512,1,1);
dim3 nblocks(1,512,1);
...
mod_kernel<<<ngrid,nblocks,0,0>>>(...)

Why are you stuck with the thread dimensions?

If the arrays are larger than shared memory, you need to partition the output array. For example, instead of processing 65536 columns at once (which do not fit in shared memory), process 16 tiles of 4096 columns (which do fit), as in the sketch below.
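A minimal sketch of that partitioning, building on the block-cooperative scheme from my earlier reply. TILE is a placeholder tile width, and the plain sqrtf / integer arithmetic for start and end stands in for your __fdiv_rz math:

constexpr uint32_t TILE = 4096;   // 4096 floats = 16 KiB of shared memory

__global__ void tiled_kernel(const float3 * pos_a, const float3 * pos_b,
                             uint32_t input_size, uint32_t output_size,
                             int region, float2 * result)
{
    __shared__ float smem[TILE];

    for(uint32_t tile0 = 0; tile0 < output_size; tile0 += TILE){
        // zero this tile's accumulator
        for(uint32_t i = threadIdx.x; i < TILE; i += blockDim.x)
            smem[i] = 0.0f;
        __syncthreads();

        for(uint32_t x = blockIdx.x; x < input_size; x += gridDim.x){
            float3 d = make_float3(pos_a[x].x - pos_b[x].x,
                                   pos_a[x].y - pos_b[x].y,
                                   pos_a[x].z - pos_b[x].z);
            float distance = sqrtf(d.x * d.x + d.y * d.y + d.z * d.z);
            uint32_t start = (uint32_t)(output_size / 2 + distance / output_size) - region;
            uint32_t end = start + region;

            // clamp [start, end) to the current tile [tile0, tile0 + TILE)
            uint32_t lo = max(start, tile0);
            uint32_t hi = min(end, min(tile0 + TILE, output_size));
            for(uint32_t y = lo + threadIdx.x; y < hi; y += blockDim.x)
                smem[y - tile0] += 1.0f;   // block-cooperative, no atomics
            __syncthreads();   // ranges of successive x values may overlap
        }

        // one atomic pair per column per block instead of one per (x, y) hit
        for(uint32_t i = threadIdx.x; i < TILE && tile0 + i < output_size; i += blockDim.x){
            atomicAdd(&result[tile0 + i].x, smem[i]);
            atomicAdd(&result[tile0 + i].y, smem[i] / 2.0f);
        }
        __syncthreads();
    }
}

This assumes a 1D launch configuration, so it is meant to illustrate the tiling, not to drop into your fixed 2D launch unchanged.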

Alternatively, you could start with an implementation that uses a temporary per-block array in global memory instead of shared memory. That way you avoid the partitioning entirely; see the sketch below.
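A rough sketch of that variant as well. scratch is assumed to be a zero-initialized global buffer of gridDim.x * output_size floats; all names are illustrative, and the start/end arithmetic again stands in for your __fdiv_rz math:

__global__ void accumulate_kernel(const float3 * pos_a, const float3 * pos_b,
                                  uint32_t input_size, uint32_t output_size,
                                  int region, float * scratch)
{
    // each block owns a private output_size-wide slice of scratch
    float * myCounts = scratch + (size_t)blockIdx.x * output_size;

    for(uint32_t x = blockIdx.x; x < input_size; x += gridDim.x){
        float3 d = make_float3(pos_a[x].x - pos_b[x].x,
                               pos_a[x].y - pos_b[x].y,
                               pos_a[x].z - pos_b[x].z);
        float distance = sqrtf(d.x * d.x + d.y * d.y + d.z * d.z);
        uint32_t start = (uint32_t)(output_size / 2 + distance / output_size) - region;
        uint32_t end = min(start + (uint32_t)region, output_size);

        // private slice plus distinct index per thread: no atomics at all
        for(uint32_t y = start + threadIdx.x; y < end; y += blockDim.x)
            myCounts[y] += 1.0f;
        __syncthreads();   // ranges of successive x values may overlap
    }
}

__global__ void reduce_kernel(const float * scratch, float2 * result,
                              uint32_t output_size, uint32_t numBlocks)
{
    // grid-stride over columns; each y is touched by exactly one thread
    for(uint32_t y = threadIdx.x + blockIdx.x * blockDim.x; y < output_size;
        y += blockDim.x * gridDim.x){
        float sum = 0.0f;
        for(uint32_t b = 0; b < numBlocks; ++b)
            sum += scratch[(size_t)b * output_size + y];
        result[y].x += sum;
        result[y].y += sum / 2.0f;
    }
}

numBlocks in reduce_kernel is the gridDim.x that accumulate_kernel was launched with. The scratch buffer trades memory (gridDim.x copies of the output) for the complete absence of atomics.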

Thank you for the reply @striker159. Yes, unfortunately I am stuck with the thread dimensions.

Okay, I think I got the code working with shared memory for increasing input sizes. The performance boost is noticeable for smaller inputs, e.g., with n_values of 8092 the modified kernel is approximately 1.25x faster. However, for larger arrays the performance is the same or worse.

Does anyone have suggestions for boosting the execution speed of mod_kernel? I have even been toying with the idea of giving CUB a try. The complete code follows:

#include <cuda.h>
#include <chrono>
#include <stdio.h>
#include <string>
#include <sstream>
#include <vector>
#include <stdexcept>
#include <cstdlib>
#include <cstdint>
#include <cmath>

#define GPU_ERROR_CHECK(ans) do{gpuAssert((ans),__FILE__,__LINE__);}while(0)
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

inline __host__ __device__ float dot(float3 a, float3 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z;
}

inline __host__ __device__ float length(float3 v)
{
    return sqrtf(dot(v, v));
}

inline __device__ float2 myKernel(float val) {
    return make_float2(val, val / 2);
}

__global__ void mod_kernel(const float3 * pos_a, const float3 * pos_b, 
                           const uint32_t input_size, const uint32_t output_size, 
                           float2 * result, const int region, 
                           const uint32_t first_idx_x, const uint32_t last_idx_x)
{
    extern __shared__ float3 shared_mem[];
    float3* shared_pos_a = shared_mem;
    float3* shared_pos_b = shared_mem + blockDim.x;

    uint32_t thridx_x = threadIdx.x + blockDim.x * blockIdx.x + first_idx_x;
    uint32_t thridx_y = threadIdx.y + blockDim.y * blockIdx.y;
    uint32_t stride_x = blockDim.x * gridDim.x;
    uint32_t stride_y = blockDim.y * gridDim.y;

    float3 distance3 = make_float3(0.0f, 0.0f, 0.0f);
    float distance = 0;

    uint32_t output_start, output_end;

    for(uint32_t x = thridx_x; x < last_idx_x && x < input_size; x += stride_x){
        // Reload this iteration's elements: loading shared memory only once
        // before the loop would use stale values for x >= stride_x, which
        // happens to go unnoticed with the uniform test data below. Each
        // thread reads back only the slot it wrote, so no barrier is needed
        // here, and none would be safe: the trip count of this loop can
        // differ between the threads of a block.
        shared_pos_a[threadIdx.x] = pos_a[x];
        shared_pos_b[threadIdx.x] = pos_b[x];

        // distance calcs
        distance3.x = shared_pos_a[threadIdx.x].x - shared_pos_b[threadIdx.x].x;
        distance3.y = shared_pos_a[threadIdx.x].y - shared_pos_b[threadIdx.x].y;
        distance3.z = shared_pos_a[threadIdx.x].z - shared_pos_b[threadIdx.x].z;

        distance = length(distance3);
        output_start = __fdiv_rz(output_size, 2) + __fdiv_rz(distance, output_size) - region;
        output_end = output_start + region;
        
        for(uint32_t y = thridx_y; y < output_size; y += stride_y){
            float val = 1;
            if((y < output_end) && (y >= output_start)){
                float2 lval = myKernel(val);
                atomicAdd(&result[y].x, lval.x);
                atomicAdd(&result[y].y, lval.y);
            }                        
        }
    }
}

__global__ void org_kernel(const float3 * pos_a, const float3 * pos_b, 
                           const uint32_t input_size, const uint32_t output_size, 
                           float2 * result, const int region, 
                           const uint32_t first_idx_x, const uint32_t last_idx_x)
{
    uint32_t thridx_x = threadIdx.x + blockDim.x * blockIdx.x + first_idx_x;
    uint32_t thridx_y = threadIdx.y + blockDim.y * blockIdx.y;
    uint32_t stride_x = blockDim.x * gridDim.x;
    uint32_t stride_y = blockDim.y * gridDim.y;

    float3 distance3 = make_float3(0.0f, 0.0f, 0.0f);
    float distance = 0;

    uint32_t output_start, output_end;

    for(uint32_t x = thridx_x; x < last_idx_x; x += stride_x){
        // distance calcs
        distance3.x = pos_a[x].x - pos_b[x].x;
        distance3.y = pos_a[x].y - pos_b[x].y;
        distance3.z = pos_a[x].z - pos_b[x].z;

        distance = length(distance3);
        output_start = __fdiv_rz(output_size, 2) + __fdiv_rz(distance, output_size) - region;
        output_end = output_start + region;
        
        for(uint32_t y = thridx_y; y < output_size; y += stride_y){
            float val = 1;
            if((y < output_end) && (y >= output_start)){
                float2 lval = myKernel(val);
                atomicAdd(&result[y].x, lval.x);
                atomicAdd(&result[y].y, lval.y);
            }                        
        }
    }
}

bool eval_arrays_equal(float2 * d_atomic, float2 * d_mod, uint32_t n) {
    if (d_atomic == nullptr || d_mod == nullptr) {
        throw std::invalid_argument("Arrays are NULL.");
    }
    if (n < 1) {
        throw std::invalid_argument("Invalid array length, less than 1.");
    }

    float2 * h_atomic;
    float2 * h_mod;
    size_t sz = n * sizeof(float2);
    h_atomic = (float2*)malloc(sz);
    h_mod  = (float2*)malloc(sz);

    GPU_ERROR_CHECK(cudaMemcpy(h_atomic, d_atomic, sz, cudaMemcpyDeviceToHost));
    GPU_ERROR_CHECK(cudaMemcpy(h_mod, d_mod, sz, cudaMemcpyDeviceToHost));

    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    bool equal = true;
    for (uint32_t i = 0; i < n; ++i) {
        if (h_atomic[i].x != h_mod[i].x || h_atomic[i].y != h_mod[i].y) {
            printf("\ti (%i): atomic (%f,%f), modified (%f,%f)\n", i, h_atomic[i].x, h_atomic[i].y, h_mod[i].x, h_mod[i].y);
            equal = false;
            break;
        }
    }

    // Free the host buffers on every path, not only on success
    free(h_atomic);
    free(h_mod);

    return equal;
}

int main(int argc, char * argv[]) {
    if (argc != 2) {
        fprintf(stderr, "\nPass the number of array elements via command line as follows:\n");
        fprintf(stderr, "./xTest <num_elems>\n\n");
        return EXIT_FAILURE;
    }

    // Dimensions (note: nthreads is passed as the grid and nblocks as the
    // block dimensions in the kernel launches below)
    dim3 nthreads(512,1,1);
    dim3 nblocks(1,512,1);

    uint32_t n_values = static_cast<uint32_t>(std::stoi(argv[1]));
    uint32_t region = 3;

    uint32_t n_float3s = n_values;
    uint32_t float3_sz = n_float3s * sizeof(float3);
    uint32_t output_sz = n_values * sizeof(float2);

    // Allocate host & device side
    float2 *d_out_org;
    float2 *d_out_mod;
    GPU_ERROR_CHECK(cudaMalloc(&d_out_org, output_sz));
    GPU_ERROR_CHECK(cudaMalloc(&d_out_mod, output_sz));
    GPU_ERROR_CHECK(cudaMemset(d_out_org, 0, output_sz));
    GPU_ERROR_CHECK(cudaMemset(d_out_mod, 0, output_sz));

    // Float3s
    float3 *pos_a, *pos_b;
    float3 *d_pos_a, *d_pos_b;
    pos_a = (float3*)malloc(float3_sz);
    pos_b = (float3*)malloc(float3_sz);
    for(size_t p = 0; p < n_float3s; ++p){
        pos_a[p] = make_float3(1,1,1);
        pos_b[p] = make_float3(0.1,0.1,0.1);
    }
    GPU_ERROR_CHECK(cudaMalloc(&d_pos_a, float3_sz));
    GPU_ERROR_CHECK(cudaMalloc(&d_pos_b, float3_sz));
    GPU_ERROR_CHECK(cudaMemcpy(d_pos_a, pos_a, float3_sz, cudaMemcpyHostToDevice));
    GPU_ERROR_CHECK(cudaMemcpy(d_pos_b, pos_b, float3_sz, cudaMemcpyHostToDevice));
    GPU_ERROR_CHECK(cudaDeviceSynchronize());

    float total_time_org = 0.0f;
    float total_time_mod = 0.0f;

    uint32_t first_idx_x = 0;
    uint32_t last_idx_x  = n_values;

    auto start = std::chrono::high_resolution_clock::now();
    org_kernel<<<nthreads,nblocks,0,0>>>(d_pos_a, d_pos_b, n_values, n_values, d_out_org, region, first_idx_x, last_idx_x);
    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    auto stop = std::chrono::high_resolution_clock::now();
    total_time_org = static_cast<float>(std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count());

    // The kernel indexes only 2 * blockDim.x float3s of shared memory
    // (blockDim.x == nblocks.x == 1 here), so sizing by nthreads.x
    // over-allocates, which is wasteful but harmless.
    size_t sharedMemSize = 2 * nthreads.x * sizeof(float3);
    start = std::chrono::high_resolution_clock::now();
    mod_kernel<<<nthreads,nblocks,sharedMemSize,0>>>(d_pos_a, d_pos_b, n_values, n_values, d_out_mod, region, first_idx_x, last_idx_x);
    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    stop = std::chrono::high_resolution_clock::now();
    total_time_mod = static_cast<float>(std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count());

    // Check for fidelity
    if (eval_arrays_equal(d_out_org, d_out_mod, n_values)) {
        printf("\nFidelity achieved.\n");
        printf("\t[ORIGINAL] Time: %8.9f (us.)\n", total_time_org);
        printf("\t[MODIFIED] Time: %8.9f (us.)\n", total_time_mod);
    } else {
        printf("\nAtomic and Optimized kernels do not match.\n");
        return EXIT_FAILURE;
    }

    GPU_ERROR_CHECK(cudaPeekAtLastError());
    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    GPU_ERROR_CHECK(cudaDeviceReset());

    return EXIT_SUCCESS;
}

Use Nsight Compute to profile; it will show you where your bottleneck is and what speedup you can expect at most after optimization.
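For quick wall-clock numbers outside the profiler, CUDA events are also less noisy than std::chrono around a launch. A minimal sketch reusing the names from your main():

cudaEvent_t evStart, evStop;
GPU_ERROR_CHECK(cudaEventCreate(&evStart));
GPU_ERROR_CHECK(cudaEventCreate(&evStop));

GPU_ERROR_CHECK(cudaEventRecord(evStart));
mod_kernel<<<nthreads,nblocks,sharedMemSize,0>>>(d_pos_a, d_pos_b, n_values, n_values, d_out_mod, region, first_idx_x, last_idx_x);
GPU_ERROR_CHECK(cudaEventRecord(evStop));
GPU_ERROR_CHECK(cudaEventSynchronize(evStop));

float ms = 0.0f;
GPU_ERROR_CHECK(cudaEventElapsedTime(&ms, evStart, evStop));   // elapsed time in milliseconds

And do one warm-up launch before timing either kernel: the first launch pays one-time initialization costs, which in your current code are all charged to org_kernel.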