Multiple launches of a single cudaGraphExec_t on the device create a deadlock

Hi, I’m testing the CUDA 12 device graph launch feature and I ran into an issue with a fairly simple case.

When executing the following code, the call to cudaStreamSynchronize after the loop never returns. This happens when the number of iterations is 2 or more. The number of times empty_kernel is actually called is not constant; some tests showed values between 0 and 3.

When I add a sleep of 100 ms or more inside the for loop, the executable works fine. This seems to indicate a concurrent access to the exec graph, which is forbidden. However, I would have thought that the CUDA stream would guarantee sequential execution.
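
For reference, here is a minimal sketch of the loop with the sleep workaround (I use std::this_thread::sleep_for purely as an illustration; any delay of roughly 100 ms or more between launches behaved the same in my tests):

#include <chrono>
#include <thread>

// same loop as in the full repro below, but with a delay between launches
for (int i = 0; i < 10; ++i) {
    gpuErrchk(cudaGraphLaunch(graph_exec, stream));
    std::this_thread::sleep_for(std::chrono::milliseconds(100)); // 100 ms or more avoids the hang
}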

#include <cstdio>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      exit(code);
   }
}

__global__ void empty_kernel() { }

// single-node launcher graph body: tail-launches the device graph so that it
// executes after the current (launching) graph completes
__global__ void launch_kernel(cudaGraphExec_t exec) {
    cudaGraphLaunch(exec, cudaStreamGraphTailLaunch);
}

int main() {
    gpuErrchk(cudaSetDevice(0));

    cudaStream_t stream;
    gpuErrchk(cudaStreamCreate(&stream));

    cudaGraphExec_t device_graph_exec;

    // create pipeline graph and upload it on device
    {
        gpuErrchk(cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal));

        empty_kernel<<<1, 1, 0, stream>>>();

        cudaGraph_t graph;
        gpuErrchk(cudaStreamEndCapture(stream, &graph));

        // instantiate the exec graph for device launch, then upload it to the device
        gpuErrchk(cudaGraphInstantiate(&device_graph_exec, graph, cudaGraphInstantiateFlagDeviceLaunch));
        gpuErrchk(cudaGraphUpload(device_graph_exec, stream));

        gpuErrchk(cudaGraphDestroy(graph));
    }

    cudaGraphExec_t graph_exec;

    // create launcher graph
    {
        gpuErrchk(cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal));

        launch_kernel<<<1, 1, 0, stream>>>(device_graph_exec);

        cudaGraph_t graph;
        gpuErrchk(cudaStreamEndCapture(stream, &graph));

        // instantiate graph exec
        gpuErrchk(cudaGraphInstantiate(&graph_exec, graph, 0));

        gpuErrchk(cudaGraphDestroy(graph));
    }

    // launch the graph on the stream 10 times
    for (int i = 0; i < 10; ++i) {
        gpuErrchk(cudaGraphLaunch(graph_exec, stream));
        // note: calling cudaGraphLaunch(device_graph_exec, stream) directly from the
        // host works when the exec is instantiated with flags = 0 instead

        // no hang when the following line is uncommented
        // gpuErrchk(cudaStreamSynchronize(stream));
    }
    gpuErrchk(cudaStreamSynchronize(stream));

    // clean up
    gpuErrchk(cudaGraphExecDestroy(device_graph_exec));
    gpuErrchk(cudaGraphExecDestroy(graph_exec));

    gpuErrchk(cudaStreamDestroy(stream));
}

The source file was compiled with the following command, targeting a Tesla V100:

nvcc --generate-code=arch=compute_70,code=[compute_70,sm_70] main.cu -o multi_launch

I’m using CUDA 12.1 on AlmaLinux 8.4 with NVIDIA driver version 530.30.02.

I did some tests and the issue persists with CUDA 12.2.

Thanks for sharing this issue. We have fixed it in our driver. The next standalone driver releases of R535 and R525 will contain the fix.
You can also wait for the next full CUDA Toolkit release after CUDA 12.2.