CUDA non-default stream synchronization

Hi,

I have a question about synchronization for non-default streams. In particular, whether it can be done solely using cudaEvents (without a cudaStreamSynchronize() call, which seems to be present in all CUDA samples) or not. The documentation seems to suggest querying for event completion should be enough, but it isn’t. Both managed and pinned memory have problems.

I have the following test program:

#include <cuda_runtime.h>
#include <iostream>

// These provide 8 combinations of the test: of these,  0 1 0 and 1 1 0 are failing
#define MEMORY_MANAGED          0   
#define STREAM_NON_BLOCKING     1
#define EXTRA_SYNCHRONIZE       0

#define CUDA_CHECK(call)                                                                           \
    do {                                                                                           \
        cudaError_t cudaStatus = (call);                                                           \
        if (cudaStatus != cudaSuccess) {                                                           \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << " - "                  \
                      << cudaGetErrorString(cudaStatus) << std::endl;                              \
            cudaDeviceReset();                                                                     \
            exit(EXIT_FAILURE);                                                                    \
        }                                                                                          \
    } while (0)


int main() {
    const int W = 1024;
    const int H = 1024;
    float *h_src, *h_dst;
    float *d_dst;

    // Allocate stream
    cudaStream_t stream;
#if STREAM_NON_BLOCKING
    CUDA_CHECK( cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) );
#else
    CUDA_CHECK( cudaStreamCreate(&stream) );
#endif

    // Allocate pinned/managed host memory 
#if MEMORY_MANAGED
    CUDA_CHECK( cudaMallocManaged((void**)&h_src, W * H * sizeof(float)) );
    CUDA_CHECK( cudaMallocManaged((void**)&h_dst, W * H * sizeof(float) ));
#else
    CUDA_CHECK( cudaMallocHost((void**)&h_src, W * H * sizeof(float) ));
    CUDA_CHECK( cudaMallocHost((void**)&h_dst, W * H * sizeof(float) ));
#endif

    // Allocate device memory
    CUDA_CHECK( cudaMalloc((void**)&d_dst, W * H * sizeof(float) ));

    // Initialize data on the host
    for (int i = 0; i < W * H; i++) {
        h_src[i] = static_cast<float>(i);
    }

    // Create events for synchronization
    cudaEvent_t startEvent, stopEvent;
    CUDA_CHECK( cudaEventCreate(&startEvent) );
    CUDA_CHECK( cudaEventCreate(&stopEvent) );

    // Start the asynchronous memory copy from pinned/managed host memory to device memory
    CUDA_CHECK( cudaMemcpy2DAsync(d_dst, W * sizeof(float), h_src, W * sizeof(float), W * sizeof(float), H, cudaMemcpyHostToDevice, stream) );

    // Record event after the first async copy
    CUDA_CHECK( cudaEventRecord(startEvent) );

    // Optionally, do other work on the host or device here

    // Start the device-to-host asynchronous copy
    CUDA_CHECK( cudaMemcpy2DAsync(h_dst, W * sizeof(float), d_dst, W * sizeof(float), W * sizeof(float), H, cudaMemcpyDeviceToHost, stream) );
    CUDA_CHECK( cudaEventRecord(stopEvent) );

#if EXTRA_SYNCHRONIZE
    cudaStreamSynchronize(stream);
#endif
    // Use cudaEventQuery to check for event completion without blocking
    cudaError_t eventStatus = cudaEventQuery(stopEvent);
    while (eventStatus == cudaErrorNotReady) {
        // The event is not ready yet, do some other non-blocking work here if needed
        // std::cout << "."; 

        // Recheck the event status
        eventStatus = cudaEventQuery(stopEvent);
    }

    if (eventStatus == cudaSuccess) {
        std::cout << "Data transfer completed successfully!" << std::endl;
    } else {
        std::cout << "An error occurred while waiting for the event." << std::endl;
    }

    // Check integrity
    for (int i = 0; i < W * H; i++) {
        if (h_src[i] != h_dst[i]) {
            std::cout << "Mismatch at position " << i << "!" << std::endl;
            break;
        }
    }

    // Clean up
    CUDA_CHECK( cudaFree(d_dst) );
#if MEMORY_MANAGED
    CUDA_CHECK( cudaFree(h_src) );
    CUDA_CHECK( cudaFree(h_dst) );
#else
    CUDA_CHECK( cudaFreeHost(h_src) );
    CUDA_CHECK( cudaFreeHost(h_dst) );
#endif
    CUDA_CHECK( cudaEventDestroy(startEvent) );
    CUDA_CHECK( cudaEventDestroy(stopEvent) );
    CUDA_CHECK( cudaStreamDestroy(stream) );

    return 0;
}

There’s 8 combinations changing the flags of: 1) managed/pinned memory, 2) creating the stream with/without non-blocking, and 3) adding cudaStreamSynchronize() - according to the define’s at the top.

It fails in two combinations:

a) fails with comparison mismatch at index 0 when using pinned memory, non-blocking flag to stream creation and no cudaStreamSynchronize() In particular pinned memory being incoherent here is unexpected.

b) fails with a segmentation fault when using managed memory, non-blocking flag to stream creation and no cudaStreamSynchronize(). Reason: h_src cannot be dereferenced in the integrity check (it’s not paged in) → SEGV This may be a misunderstanding on my part of host-accessibility of managed memory.

The documentation is not entirely clear as to why this fails and why simply querying the event on the stream shouldn’t be enough.

Suffice to say that everything works fine on the default stream.

Someone who can shine a detailed light on this perhaps?

[configuration is CUDA 12.6, JetPack 6.1 on Jetson Orin AGX].

Many thanks,
Arnoud.

The events are recorded in the default stream. When checking the event you busy-wait on completion of the default stream. However, the memory copies are executed in a different stream, and you explicitly create this stream with cudaStreamNonBlocking which removes the link between default stream and the custom stream.
Thus, the event may complete independent of the work in the custom stream.

It should work if the events are recorded in the custom stream, i.e.
CUDA_CHECK( cudaEventRecord(startEvent, stream) );

1 Like

I believe striker159 has addressed your item a. For your item b, my suggestion would be to check the concurrentManagedAccess device property, before trying to access a managed allocation from host code, without an intervening cudaDeviceSynchronize, after 1 or more kernel launches. If the concurrent managed access property is false, then what you are trying to do is expected to seg fault. For discrete GPUs, this would typically be true on linux for a pascal or newer GPU, but false on maxwell or on windows. Jetson devices have a somewhat different footprint, I believe.

1 Like

Yes, striker159 addressed item a), thanks a lot! I missed the stream argument. It indeed works correctly if the events are recorded in the custom stream.

Interestingly, this works for both managed and for pinned memory (thereby also solving item b)! even though concurrentManagedAccess is 0 on the Jetson Orin when I check it before accessing the managed allocation from host code, per Robert’s suggestion. Apparently it does get paged back in if appropriately sync’ed.

By the way, we found that our actual problem was caused by us performing atomic operations (atomic fetch-and-add, atomic fetch-and-decrement, etc.) on pinned memory, and this seemed to cause coherence problems between host and device.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.