Hello, I’m trying to create a CUDA Graph that involves multiple streams with dependencies. Specifically:
- Stream 1 launches a sequence of kernels.
- Other streams also execute kernels but must wait for each corresponding kernel in Stream 1 to complete before proceeding.
I can achieve this without CUDA Graphs, and with multiple graphs per stream, but I’m wondering if I can have just one graph per stream, in hopes of minimising overhead.
Questions:
- Can a single CUDA Graph contain multiple streams, with kernels in one stream waiting on another?
- If not, can I use multiple graphs (one per stream) and enforce synchronization between them?
- Why do I get
EndCapture error: capturing stream has unjoined work
in my following attempt at a minimal example
#include <cstdio>
#include <cuda_runtime.h>
// A trivial kernel that just prints its identifier
__global__ void minimalKernel(int streamId) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx == 0) {
printf("streamId %d\n", streamId);
}
}
int main() {
cudaError_t err;
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
fprintf(stderr, "Pre-capture sync error: %s\n", cudaGetErrorString(err));
return 1;
}
cudaStream_t s1, s2;
err = cudaStreamCreate(&s1);
if (err != cudaSuccess) {
fprintf(stderr, "Stream create error s1: %s\n", cudaGetErrorString(err));
return 1;
}
err = cudaStreamCreate(&s2);
if (err != cudaSuccess) {
fprintf(stderr, "Stream create error s2: %s\n", cudaGetErrorString(err));
return 1;
}
err = cudaStreamBeginCapture(s1, cudaStreamCaptureModeGlobal);
if (err != cudaSuccess) {
fprintf(stderr, "BeginCapture error on s1: %s\n", cudaGetErrorString(err));
return 1;
}
minimalKernel<<<1, 1, 0, s1>>>(1);
minimalKernel<<<1, 1, 0, s2>>>(2);
cudaEvent_t evt;
err = cudaEventCreateWithFlags(&evt, cudaEventDisableTiming);
if (err != cudaSuccess) {
fprintf(stderr, "Event create error: %s\n", cudaGetErrorString(err));
return 1;
}
err = cudaEventRecord(evt, s1);
if (err != cudaSuccess) {
fprintf(stderr, "Event record error: %s\n", cudaGetErrorString(err));
return 1;
}
err = cudaStreamWaitEvent(s2, evt, 0);
if (err != cudaSuccess) {
fprintf(stderr, "StreamWaitEvent error: %s\n", cudaGetErrorString(err));
return 1;
}
minimalKernel<<<1, 1, 0, s2>>>(3);
cudaGraph_t graph = nullptr;
err = cudaStreamEndCapture(s1, &graph);
if (err != cudaSuccess) {
fprintf(stderr, "EndCapture error: %s\n", cudaGetErrorString(err));
return 1;
}
cudaGraphExec_t exec;
err = cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0);
if (err != cudaSuccess) {
fprintf(stderr, "cudaGraphInstantiate error: %s\n", cudaGetErrorString(err));
return 1;
}
err = cudaGraphLaunch(exec, s1);
if (err != cudaSuccess) {
fprintf(stderr, "cudaGraphLaunch error: %s\n", cudaGetErrorString(err));
return 1;
}
err = cudaStreamSynchronize(s1);
if (err != cudaSuccess) {
fprintf(stderr, "Stream sync error s1: %s\n", cudaGetErrorString(err));
return 1;
}
err = cudaStreamSynchronize(s2);
if (err != cudaSuccess) {
fprintf(stderr, "Stream sync error s2: %s\n", cudaGetErrorString(err));
return 1;
}
// Cleanup
cudaEventDestroy(evt);
cudaGraphDestroy(graph);
cudaGraphExecDestroy(exec);
cudaStreamDestroy(s1);
cudaStreamDestroy(s2);
fprintf(stderr, "Multi-stream capture succeeded!\n");
return 0;
}