Hi,
I have a question about synchronization for non-default streams. In particular, whether it can be done solely using cudaEvents (without a cudaStreamSynchronize()
call, which seems to be present in all CUDA samples) or not. The documentation seems to suggest querying for event completion should be enough, but it isn’t. Both managed and pinned memory have problems.
I have the following test program:
#include <cuda_runtime.h>
#include <iostream>
// These provide 8 combinations of the test: of these, 0 1 0 and 1 1 0 are failing
#define MEMORY_MANAGED 0
#define STREAM_NON_BLOCKING 1
#define EXTRA_SYNCHRONIZE 0
#define CUDA_CHECK(call) \
do { \
cudaError_t cudaStatus = (call); \
if (cudaStatus != cudaSuccess) { \
std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << " - " \
<< cudaGetErrorString(cudaStatus) << std::endl; \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} while (0)
int main() {
const int W = 1024;
const int H = 1024;
float *h_src, *h_dst;
float *d_dst;
// Allocate stream
cudaStream_t stream;
#if STREAM_NON_BLOCKING
CUDA_CHECK( cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) );
#else
CUDA_CHECK( cudaStreamCreate(&stream) );
#endif
// Allocate pinned/managed host memory
#if MEMORY_MANAGED
CUDA_CHECK( cudaMallocManaged((void**)&h_src, W * H * sizeof(float)) );
CUDA_CHECK( cudaMallocManaged((void**)&h_dst, W * H * sizeof(float) ));
#else
CUDA_CHECK( cudaMallocHost((void**)&h_src, W * H * sizeof(float) ));
CUDA_CHECK( cudaMallocHost((void**)&h_dst, W * H * sizeof(float) ));
#endif
// Allocate device memory
CUDA_CHECK( cudaMalloc((void**)&d_dst, W * H * sizeof(float) ));
// Initialize data on the host
for (int i = 0; i < W * H; i++) {
h_src[i] = static_cast<float>(i);
}
// Create events for synchronization
cudaEvent_t startEvent, stopEvent;
CUDA_CHECK( cudaEventCreate(&startEvent) );
CUDA_CHECK( cudaEventCreate(&stopEvent) );
// Start the asynchronous memory copy from pinned/managed host memory to device memory
CUDA_CHECK( cudaMemcpy2DAsync(d_dst, W * sizeof(float), h_src, W * sizeof(float), W * sizeof(float), H, cudaMemcpyHostToDevice, stream) );
// Record event after the first async copy
CUDA_CHECK( cudaEventRecord(startEvent) );
// Optionally, do other work on the host or device here
// Start the device-to-host asynchronous copy
CUDA_CHECK( cudaMemcpy2DAsync(h_dst, W * sizeof(float), d_dst, W * sizeof(float), W * sizeof(float), H, cudaMemcpyDeviceToHost, stream) );
CUDA_CHECK( cudaEventRecord(stopEvent) );
#if EXTRA_SYNCHRONIZE
cudaStreamSynchronize(stream);
#endif
// Use cudaEventQuery to check for event completion without blocking
cudaError_t eventStatus = cudaEventQuery(stopEvent);
while (eventStatus == cudaErrorNotReady) {
// The event is not ready yet, do some other non-blocking work here if needed
// std::cout << ".";
// Recheck the event status
eventStatus = cudaEventQuery(stopEvent);
}
if (eventStatus == cudaSuccess) {
std::cout << "Data transfer completed successfully!" << std::endl;
} else {
std::cout << "An error occurred while waiting for the event." << std::endl;
}
// Check integrity
for (int i = 0; i < W * H; i++) {
if (h_src[i] != h_dst[i]) {
std::cout << "Mismatch at position " << i << "!" << std::endl;
break;
}
}
// Clean up
CUDA_CHECK( cudaFree(d_dst) );
#if MEMORY_MANAGED
CUDA_CHECK( cudaFree(h_src) );
CUDA_CHECK( cudaFree(h_dst) );
#else
CUDA_CHECK( cudaFreeHost(h_src) );
CUDA_CHECK( cudaFreeHost(h_dst) );
#endif
CUDA_CHECK( cudaEventDestroy(startEvent) );
CUDA_CHECK( cudaEventDestroy(stopEvent) );
CUDA_CHECK( cudaStreamDestroy(stream) );
return 0;
}
There’s 8 combinations changing the flags of: 1) managed/pinned memory, 2) creating the stream with/without non-blocking, and 3) adding cudaStreamSynchronize() - according to the define’s at the top.
It fails in two combinations:
a) fails with comparison mismatch at index 0 when using pinned memory, non-blocking flag to stream creation and no cudaStreamSynchronize()
In particular pinned memory being incoherent here is unexpected.
b) fails with a segmentation fault when using managed memory, non-blocking flag to stream creation and no cudaStreamSynchronize()
. Reason: h_src
cannot be dereferenced in the integrity check (it’s not paged in) → SEGV This may be a misunderstanding on my part of host-accessibility of managed memory.
The documentation is not entirely clear as to why this fails and why simply querying the event on the stream shouldn’t be enough.
Suffice to say that everything works fine on the default stream.
Someone who can shine a detailed light on this perhaps?
[configuration is CUDA 12.6, JetPack 6.1 on Jetson Orin AGX].
Many thanks,
Arnoud.