Is there proper CUDA atomicLoad function?

I’ve faced with the issue that CUDA atomic API do not have atomicLoad function.
After searching on stackoverflow, I’ve found the following implementation of CUDA atomicLoad

But looks like this function is failed to work in following example:

#include <cassert>
#include <iostream>
#include <cuda_runtime_api.h>

template <typename T>
__device__ T atomicLoad(const T* addr) {
    const volatile T* vaddr = addr;  // To bypass cache
    __threadfence();                 // for seq_cst loads. Remove for acquire semantics.
    const T value = *vaddr;
    // fence to ensure that dependent reads are correctly ordered
    __threadfence();
    return value;
}

__global__ void initAtomic(unsigned& count, const unsigned initValue) {
    count = initValue;
}

__global__ void addVerify(unsigned& count, const unsigned biasAtomicValue) {
    atomicAdd(&count, 1);
    // NOTE: When uncomment the following while loop the addVerify is stuck,
    //       it cannot read last proper value in variable count
//    while (atomicLoad(&count) != (1024 * 1024 + biasAtomicValue)) {
//        printf("count = %u\n", atomicLoad(&count));
//    }
}

int main() {
    std::cout << "Hello, CUDA atomics!" << std::endl;
    const auto atomicSize = sizeof(unsigned);

    unsigned* datomic = nullptr;
    cudaMalloc(&datomic, atomicSize);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    constexpr unsigned biasAtomicValue = 11;
    initAtomic<<<1, 1, 0, stream>>>(*datomic, biasAtomicValue);
    addVerify<<<1024, 1024, 0, stream>>>(*datomic, biasAtomicValue);
    cudaStreamSynchronize(stream);

    unsigned countHost = 0;
    cudaMemcpyAsync(&countHost, datomic, atomicSize, cudaMemcpyDeviceToHost, stream);
    assert(countHost == 1024 * 1024 + biasAtomicValue);

    cudaStreamDestroy(stream);

    return 0;
}

If you will uncomment the section with atomicLoad then application will stuck …

Maybe I missed something ? Is there a proper way to load variable modified atomically ?

P.S.: I know there exists cuda::atomic implementation, but this API is not supported by my hardware
This question is duplicate of the following gpu atomics - Is there proper CUDA atomicLoad function? - Stack Overflow

The issue here, as already indicated in your other question/answer, is that your GPU is not able to simultaneously have 1 million threads in flight, and therefore your while loop condition can never be satisfed, thus the hang. This is covered in many other questions on these forums.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.