Using Atomic Device Intrinsics over Volatile

The SIGNAL APIs internally use a volatile read or write for a signal fetch or set, respectively.

Likewise, the TEST API uses a volatile read for all symmetric memory accesses.
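To make the question concrete, the pattern I am referring to looks roughly like the sketch below. This is illustrative only, with helper names of my own; it is not the actual NVSHMEM source:

// Illustrative sketch of the volatile-based pattern (not the actual NVSHMEM source)
__device__ uint64_t volatileSignalFetch(uint64_t* sig) {
    // plain volatile read of the signal word
    return *reinterpret_cast<volatile uint64_t*>(sig);
}

__device__ bool volatileTestEq(uint64_t* ivar, uint64_t cmp) {
    // TEST-style compare built on the same volatile read
    return *reinterpret_cast<volatile uint64_t*>(ivar) == cmp;
}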

Yet the CUDA C++ Programming Guide discourages using volatile for this use case, since (1) it does not guarantee atomicity, which NVSHMEM promises for these APIs, and (2) it performs worse than the device atomic intrinsics.

So, is there a specific reason we use volatile for these APIs?

For instance, for ATOMIC_FETCH, NVSHMEM correctly uses atomicOr(x, 0), an atomic RMW operation on x whose modification is a no-op and which is therefore functionally equivalent to a read. Why not use this, or something similar, for the SIGNAL and TEST APIs?
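Concretely, the atomic-based equivalent I have in mind would look something like this (again a sketch with my own naming, not an NVSHMEM API; I use the system-scoped variant since a signal word may be updated by a remote PE):

// Sketch of an atomic no-op-RMW fetch (my naming, not an NVSHMEM API)
__device__ uint64_t atomicSignalFetch(uint64_t* sig) {
    static_assert(sizeof(unsigned long long int) == sizeof(uint64_t), "size mismatch");
    // atomicOr with 0 modifies nothing, so this is functionally a read,
    // but it executes as a single atomic operation at system scope.
    return atomicOr_system(reinterpret_cast<unsigned long long int*>(sig), 0ULL);
}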

I ran a benchmark verifying the performance difference, and the device atomic is an order of magnitude faster:

  1. atomicOr: 16 ns
  2. volatile read: 208 ns

Here’s the code:

#include <nvshmemx.h>
#include <nvshmem.h>
#include <host/nvshmemx_api.h> // Makes CLion happy
#include <cstdint> // uint64_t
#include <cstdio>  // printf

#define CAST_TO(T, p) static_cast<T*>(static_cast<void*>(p))
#define CHECK_ERROR_EXIT(e) // typical cuda error checking, fill in the gap.

// uint64_t for signal type
__global__ void bench(uint64_t* __restrict__ p, const bool skip = true) {
    constexpr auto rounds = 64;
    double vC = 0.0, aC = 0.0; // vC -> volatile clocked, aC -> atomic clocked
    for (uint i = 0; i < rounds; ++i) {
        uint64_t start = 0, end = 0;
        asm volatile("mov.u64 %0, %%globaltimer;": "=l"(start)::);
        *CAST_TO(volatile uint64_t, p); // volatile read as done in nvshmem_*_test APIs
        asm volatile("mov.u64 %0, %%globaltimer;": "=l"(end)::);
        vC += static_cast<double>(end - start) / static_cast<double>(rounds);

        asm volatile("mov.u64 %0, %%globaltimer;": "=l"(start)::);
        static_assert(sizeof(unsigned long long int) == sizeof(uint64_t));
        atomicOr_system(CAST_TO(unsigned long long int, p), 0ULL); // suggested alternative: no-op RMW, functionally a read
        asm volatile("mov.u64 %0, %%globaltimer;": "=l"(end)::);
        aC += static_cast<double>(end - start) / static_cast<double>(rounds);
    }

    if (!skip && !threadIdx.x) {
        printf("Block %u: vC is %f, aC is %f\n", blockIdx.x, vC, aC);
    }
}

__host__ __forceinline__
void hostBench() {
    nvshmem_init();
    CHECK_ERROR_EXIT(cudaSetDevice(nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE)));
    auto* s = static_cast<uint64_t*>(nvshmem_calloc(1, sizeof(uint64_t)));
    // Warm-up launches: results are discarded since skip defaults to true.
    for (uint i = 0; i < 128; ++i) {
        bench<<<1,1>>>(s);
    }
    // Measured launch: prints the per-round averages.
    bench<<<1,1>>>(s, false);
    CHECK_ERROR_EXIT(cudaPeekAtLastError());
    CHECK_ERROR_EXIT(cudaDeviceSynchronize()); // wait for the kernels before freeing the symmetric buffer
    nvshmem_free(s);
    nvshmem_finalize();
}

int main() {
    hostBench();
}
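
For context on the numbers: they are the per-operation averages over the 64 rounds of the final (skip == false) launch; the 128 preceding launches are warm-up. I also deliberately used atomicOr_system rather than atomicOr so the atomic side of the comparison runs at system scope, which, as far as I understand, is the scope a signal word written by a remote PE would need anyway.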