CUDA non-default stream synchronization

azw · October 28, 2024, 3:48pm

Hi,

I have a question about synchronization for non-default streams. In particular, whether it can be done solely using cudaEvents (without a cudaStreamSynchronize() call, which seems to be present in all CUDA samples) or not. The documentation seems to suggest querying for event completion should be enough, but it isn’t. Both managed and pinned memory have problems.

I have the following test program:

#include <cuda_runtime.h>
#include <iostream>

// These provide 8 combinations of the test: of these,  0 1 0 and 1 1 0 are failing
#define MEMORY_MANAGED          0   
#define STREAM_NON_BLOCKING     1
#define EXTRA_SYNCHRONIZE       0

#define CUDA_CHECK(call)                                                                           \
    do {                                                                                           \
        cudaError_t cudaStatus = (call);                                                           \
        if (cudaStatus != cudaSuccess) {                                                           \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << " - "                  \
                      << cudaGetErrorString(cudaStatus) << std::endl;                              \
            cudaDeviceReset();                                                                     \
            exit(EXIT_FAILURE);                                                                    \
        }                                                                                          \
    } while (0)


int main() {
    const int W = 1024;
    const int H = 1024;
    float *h_src, *h_dst;
    float *d_dst;

    // Allocate stream
    cudaStream_t stream;
#if STREAM_NON_BLOCKING
    CUDA_CHECK( cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) );
#else
    CUDA_CHECK( cudaStreamCreate(&stream) );
#endif

    // Allocate pinned/managed host memory 
#if MEMORY_MANAGED
    CUDA_CHECK( cudaMallocManaged((void**)&h_src, W * H * sizeof(float)) );
    CUDA_CHECK( cudaMallocManaged((void**)&h_dst, W * H * sizeof(float) ));
#else
    CUDA_CHECK( cudaMallocHost((void**)&h_src, W * H * sizeof(float) ));
    CUDA_CHECK( cudaMallocHost((void**)&h_dst, W * H * sizeof(float) ));
#endif

    // Allocate device memory
    CUDA_CHECK( cudaMalloc((void**)&d_dst, W * H * sizeof(float) ));

    // Initialize data on the host
    for (int i = 0; i < W * H; i++) {
        h_src[i] = static_cast<float>(i);
    }

    // Create events for synchronization
    cudaEvent_t startEvent, stopEvent;
    CUDA_CHECK( cudaEventCreate(&startEvent) );
    CUDA_CHECK( cudaEventCreate(&stopEvent) );

    // Start the asynchronous memory copy from pinned/managed host memory to device memory
    CUDA_CHECK( cudaMemcpy2DAsync(d_dst, W * sizeof(float), h_src, W * sizeof(float), W * sizeof(float), H, cudaMemcpyHostToDevice, stream) );

    // Record event after the first async copy
    CUDA_CHECK( cudaEventRecord(startEvent) );

    // Optionally, do other work on the host or device here

    // Start the device-to-host asynchronous copy
    CUDA_CHECK( cudaMemcpy2DAsync(h_dst, W * sizeof(float), d_dst, W * sizeof(float), W * sizeof(float), H, cudaMemcpyDeviceToHost, stream) );
    CUDA_CHECK( cudaEventRecord(stopEvent) );

#if EXTRA_SYNCHRONIZE
    cudaStreamSynchronize(stream);
#endif
    // Use cudaEventQuery to check for event completion without blocking
    cudaError_t eventStatus = cudaEventQuery(stopEvent);
    while (eventStatus == cudaErrorNotReady) {
        // The event is not ready yet, do some other non-blocking work here if needed
        // std::cout << "."; 

        // Recheck the event status
        eventStatus = cudaEventQuery(stopEvent);
    }

    if (eventStatus == cudaSuccess) {
        std::cout << "Data transfer completed successfully!" << std::endl;
    } else {
        std::cout << "An error occurred while waiting for the event." << std::endl;
    }

    // Check integrity
    for (int i = 0; i < W * H; i++) {
        if (h_src[i] != h_dst[i]) {
            std::cout << "Mismatch at position " << i << "!" << std::endl;
            break;
        }
    }

    // Clean up
    CUDA_CHECK( cudaFree(d_dst) );
#if MEMORY_MANAGED
    CUDA_CHECK( cudaFree(h_src) );
    CUDA_CHECK( cudaFree(h_dst) );
#else
    CUDA_CHECK( cudaFreeHost(h_src) );
    CUDA_CHECK( cudaFreeHost(h_dst) );
#endif
    CUDA_CHECK( cudaEventDestroy(startEvent) );
    CUDA_CHECK( cudaEventDestroy(stopEvent) );
    CUDA_CHECK( cudaStreamDestroy(stream) );

    return 0;
}

There’s 8 combinations changing the flags of: 1) managed/pinned memory, 2) creating the stream with/without non-blocking, and 3) adding cudaStreamSynchronize() - according to the define’s at the top.

It fails in two combinations:

a) fails with comparison mismatch at index 0 when using pinned memory, non-blocking flag to stream creation and no cudaStreamSynchronize() In particular pinned memory being incoherent here is unexpected.

b) fails with a segmentation fault when using managed memory, non-blocking flag to stream creation and no cudaStreamSynchronize(). Reason: h_src cannot be dereferenced in the integrity check (it’s not paged in) → SEGV This may be a misunderstanding on my part of host-accessibility of managed memory.

The documentation is not entirely clear as to why this fails and why simply querying the event on the stream shouldn’t be enough.

Suffice to say that everything works fine on the default stream.

Someone who can shine a detailed light on this perhaps?

[configuration is CUDA 12.6, JetPack 6.1 on Jetson Orin AGX].

Many thanks,
Arnoud.

striker159 · October 28, 2024, 5:01pm

azw:


    CUDA_CHECK( cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) );

    CUDA_CHECK( cudaMemcpy2DAsync(d_dst, W * sizeof(float), h_src, W * sizeof(float), W * sizeof(float), H, cudaMemcpyHostToDevice, stream) );

    CUDA_CHECK( cudaEventRecord(startEvent) );

    CUDA_CHECK( cudaMemcpy2DAsync(h_dst, W * sizeof(float), d_dst, W * sizeof(float), W * sizeof(float), H, cudaMemcpyDeviceToHost, stream) );

    CUDA_CHECK( cudaEventRecord(stopEvent) );

The events are recorded in the default stream. When checking the event you busy-wait on completion of the default stream. However, the memory copies are executed in a different stream, and you explicitly create this stream with cudaStreamNonBlocking which removes the link between default stream and the custom stream.
Thus, the event may complete independent of the work in the custom stream.

It should work if the events are recorded in the custom stream, i.e.
CUDA_CHECK( cudaEventRecord(startEvent, stream) );

Robert_Crovella · October 28, 2024, 6:24pm

I believe striker159 has addressed your item a. For your item b, my suggestion would be to check the concurrentManagedAccess device property, before trying to access a managed allocation from host code, without an intervening cudaDeviceSynchronize, after 1 or more kernel launches. If the concurrent managed access property is false, then what you are trying to do is expected to seg fault. For discrete GPUs, this would typically be true on linux for a pascal or newer GPU, but false on maxwell or on windows. Jetson devices have a somewhat different footprint, I believe.

azw · October 30, 2024, 11:24am

Yes, striker159 addressed item a), thanks a lot! I missed the stream argument. It indeed works correctly if the events are recorded in the custom stream.

Interestingly, this works for both managed and for pinned memory (thereby also solving item b)! even though concurrentManagedAccess is 0 on the Jetson Orin when I check it before accessing the managed allocation from host code, per Robert’s suggestion. Apparently it does get paged back in if appropriately sync’ed.

By the way, we found that our actual problem was caused by us performing atomic operations (atomic fetch-and-add, atomic fetch-and-decrement, etc.) on pinned memory, and this seemed to cause coherence problems between host and device.

system · November 13, 2024, 11:25am

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
Fail to sync the cudaMemcpyAsync using the cudaEvent in two streams CUDA Programming and Performance	4	243	April 1, 2024
cudaMemcpyAsync, unexpected behaviour while using cudaStreamNonBlocking? CUDA Programming and Performance	6	2068	May 29, 2018
CUDA streams and error handling CUDA Programming and Performance	11	2388	December 14, 2023
Streams Problem CUDA Programming and Performance	2	4658	December 7, 2008
cudaDeviceSynchronize needed between kernel launch and cudaMemcpy ? CUDA Programming and Performance	15	16266	September 29, 2017
Problem regarding data transfer overlap between multiple asynchronous streams CUDA Programming and Performance	8	799	September 11, 2016
performance problem CUDA Programming and Performance	2	609	July 16, 2018
Problems with streams CUDA Programming and Performance	4	1077	September 30, 2010
Cannot get any stream parallelism. CUDA Programming and Performance	13	1280	December 31, 2019
QUIT CUDA? Kernel and pinned memory gives strange results CUDA Programming and Performance	6	6717	September 22, 2011

CUDA non-default stream synchronization

Related topics