The `SIGNAL` APIs internally use a volatile read or write for a signal fetch or set, respectively. Likewise, the `TEST` API uses a volatile read for all symmetric memory accesses.
Yet, the CUDA C++ Programming Guide discourages using `volatile` for this use case, since (1) it does not guarantee atomicity, which NVSHMEM promises for these APIs, and (2) it exhibits worse performance than device intrinsics.
So, is there a specific reason we use `volatile` for these APIs? For instance, for its `ATOMIC_FETCH`, NVSHMEM correctly uses `atomicOr(x, 0)`, which does an atomic RMW operation on `x` where the modification is a no-op, and is thus functionally equivalent to a read. Why don't we use this, or something similar, for the `SIGNAL` and `TEST` APIs?
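
To make the comparison concrete, here is a minimal device-side sketch of the two read styles I'm contrasting (my own illustration, not NVSHMEM's actual implementation); `sig` is assumed to point to a symmetric `uint64_t` signal word:

```cuda
// Sketch only: the two ways of reading a 64-bit signal word discussed above.
__device__ uint64_t fetch_volatile(uint64_t* sig) {
    // Volatile read: forces a load from memory, but with no atomicity guarantee.
    return *reinterpret_cast<volatile uint64_t*>(sig);
}

__device__ uint64_t fetch_atomic(uint64_t* sig) {
    // atomicOr(x, 0): an atomic RMW whose modification is a no-op,
    // so it behaves as an atomic read performed by the hardware.
    return atomicOr_system(reinterpret_cast<unsigned long long int*>(sig), 0ULL);
}
```

Both compile to a single memory operation, but only the second one carries an atomicity guarantee under the CUDA programming model.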
I ran a benchmark to verify the performance difference, and the device atomic is an order of magnitude faster:
- `atomicOr`: 16 ns
- `volatile` read: 208 ns
Here’s the code:
#include <nvshmemx.h>
#include <nvshmem.h>
#include <host/nvshmemx_api.h> // Makes CLion happy
#include <cstdio>
#include <cstdlib>

#define CAST_TO(T, p) static_cast<T*>(static_cast<void*>(p))
#define CHECK_ERROR_EXIT(e) do { /* typical CUDA error checking */ \
    const cudaError_t err_ = (e); \
    if (err_ != cudaSuccess) { \
        std::fprintf(stderr, "CUDA error: %s at %s:%d\n", cudaGetErrorString(err_), __FILE__, __LINE__); \
        std::exit(EXIT_FAILURE); \
    } \
} while (0)

// uint64_t for signal type
__global__ void bench(uint64_t* __restrict__ p, const bool skip = true) {
    constexpr auto rounds = 64;
    double vC = 0.0, aC = 0.0; // vC -> volatile read avg (ns), aC -> atomicOr avg (ns)
    for (uint i = 0; i < rounds; ++i) {
        uint64_t start = 0, end = 0;
        asm volatile("mov.u64 %0, %%globaltimer;": "=l"(start)::);
        *CAST_TO(volatile uint64_t, p); // volatile read as done in nvshmem_*_test APIs
        asm volatile("mov.u64 %0, %%globaltimer;": "=l"(end)::);
        vC += static_cast<double>(end - start) / static_cast<double>(rounds);
        asm volatile("mov.u64 %0, %%globaltimer;": "=l"(start)::);
        static_assert(sizeof(unsigned long long int) == sizeof(uint64_t));
        atomicOr_system(CAST_TO(unsigned long long int, p), 0U); // suggested alternative
        asm volatile("mov.u64 %0, %%globaltimer;": "=l"(end)::);
        aC += static_cast<double>(end - start) / static_cast<double>(rounds);
    }
    if (!skip && !threadIdx.x) {
        printf("Block %u: vC is %f, aC is %f\n", blockIdx.x, vC, aC);
    }
}

__host__ __forceinline__
void hostBench() {
    nvshmem_init();
    CHECK_ERROR_EXIT(cudaSetDevice(nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE)));
    auto* s = static_cast<uint64_t*>(nvshmem_calloc(1, sizeof(uint64_t)));
    for (uint i = 0; i < 128; ++i) { // warm-up launches, results skipped
        bench<<<1,1>>>(s);
    }
    bench<<<1,1>>>(s, false); // measured run that prints results
    CHECK_ERROR_EXIT(cudaPeekAtLastError());
    CHECK_ERROR_EXIT(cudaDeviceSynchronize()); // finish the kernels before freeing the symmetric buffer
    nvshmem_free(s);
    nvshmem_finalize();
}

int main() {
    hostBench();
}
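
In case it helps with reproducing the numbers, I build and run the benchmark roughly as follows (the paths, `-arch` value, and single-PE launch reflect my local setup, so adjust as needed):

```
nvcc -rdc=true -arch=sm_90 -I${NVSHMEM_HOME}/include bench.cu -o bench \
     -L${NVSHMEM_HOME}/lib -lnvshmem_host -lnvshmem_device -lcuda
nvshmrun -n 1 ./bench
```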