Multiple launches of a single cudaGraphExec_t on the device create a deadlock

Hi, I’m testing the CUDA 12 device graph launch feature and I ran into an issue with a fairly simple case.

When executing the following code, the call to cudaStreamSynchronize after the loop never returns. This happens when the number of iterations is 2 or more. The number of times empty_kernel is actually called is not constant; some tests showed values between 0 and 3.

When I add a sleep of 100 ms or more inside the for loop, the executable works fine. This seems to indicate a concurrent access to the exec graph, which is forbidden. However, I would have thought that the CUDA stream would guarantee sequential execution.
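
For reference, here is a minimal sketch of the loop with the sleep workaround (I use std::this_thread::sleep_for purely as an illustration; any delay of roughly 100 ms or more between launches behaved the same in my tests):

#include <chrono>
#include <thread>

// same loop as in the full repro below, but with a delay between launches
for (int i = 0; i < 10; ++i) {
    gpuErrchk(cudaGraphLaunch(graph_exec, stream));
    std::this_thread::sleep_for(std::chrono::milliseconds(100)); // 100 ms or more avoids the hang
}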

#include <cstdio>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      exit(code);
   }
}

__global__ void empty_kernel() { }

// single-node launcher graph body: tail-launches the device graph so that it
// executes after the current (launching) graph completes
__global__ void launch_kernel(cudaGraphExec_t exec) {
    cudaGraphLaunch(exec, cudaStreamGraphTailLaunch);
}

int main() {
    gpuErrchk(cudaSetDevice(0));

    cudaStream_t stream;
    gpuErrchk(cudaStreamCreate(&stream));

    cudaGraphExec_t device_graph_exec;

    // create pipeline graph and upload it on device
    {
        gpuErrchk(cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal));

        empty_kernel<<<1, 1, 0, stream>>>();

        cudaGraph_t graph;
        gpuErrchk(cudaStreamEndCapture(stream, &graph));

        // instantiate the exec graph for device launch, then upload it to the device
        gpuErrchk(cudaGraphInstantiate(&device_graph_exec, graph, cudaGraphInstantiateFlagDeviceLaunch));
        gpuErrchk(cudaGraphUpload(device_graph_exec, stream));

        gpuErrchk(cudaGraphDestroy(graph));
    }

    cudaGraphExec_t graph_exec;

    // create launcher graph
    {
        gpuErrchk(cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal));

        launch_kernel<<<1, 1, 0, stream>>>(device_graph_exec);

        cudaGraph_t graph;
        gpuErrchk(cudaStreamEndCapture(stream, &graph));

        // instantiate graph exec
        gpuErrchk(cudaGraphInstantiate(&graph_exec, graph, 0));

        gpuErrchk(cudaGraphDestroy(graph));
    }

    // launch the graph on the stream 10 times
    for (int i = 0; i < 10; ++i) {
        gpuErrchk(cudaGraphLaunch(graph_exec, stream));
        // note: calling cudaGraphLaunch(device_graph_exec, stream) directly from the
        // host works when the exec is instantiated with flags = 0 instead

        // no hang when the following line is uncommented
        // gpuErrchk(cudaStreamSynchronize(stream));
    }
    gpuErrchk(cudaStreamSynchronize(stream));

    // clean up
    gpuErrchk(cudaGraphExecDestroy(device_graph_exec));
    gpuErrchk(cudaGraphExecDestroy(graph_exec));

    gpuErrchk(cudaStreamDestroy(stream));
}

The source file was compiled with the following command, targeting a Tesla V100:

nvcc --generate-code=arch=compute_70,code=[compute_70,sm_70] main.cu -o multi_launch

I’m using CUDA 12.1 on AlmaLinux 8.4 with NVIDIA driver version 530.30.02.

I did some tests and the issue persists with CUDA 12.2.

Thanks for sharing this issue. We have fixed it in our driver. The next standalone driver releases of R535 and R525 will contain the fix.
You can also wait for the next full CUDA Toolkit release after CUDA 12.2.