Crash when using cuda graph

Hi,

I encountered some errors while using cuda graph in my program.

Here are my environment settings:

From nvidia-smi:

NVIDIA-SMI 525.89.02
Driver Version: 525.89.02
CUDA Version: 12.0

From nvcc:

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0

Below is a demo that includes three functions:

  • code1 will throws a Segmentation fault when executing cudaGraphInstantiate.
  • code2 processes successfully. I am testing to see if it’s safe when I destroy the graph before destroying the instance.
  • code3 reports an “Invalid read” error as indicated by valgrind.
#include <cuda_runtime_api.h>

#include <cstdio>
#include <iostream>

#define CUDA_CHECK(call)                                                     \
  do {                                                                       \
    cudaError_t err = call;                                                  \
    if (err != cudaSuccess) {                                                \
      fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", __FILE__, \
              __LINE__, err, cudaGetErrorString(err), #call);                \
      exit(EXIT_FAILURE);                                                    \
    }                                                                        \
  } while (0)

#define CHECK_NOTNULLPTR(ptr)                                                  \
  do {                                                                         \
    if (ptr == nullptr) {                                                      \
      fprintf(stderr, "nullptr get %s:%d -> %s \n", __FILE__, __LINE__, #ptr); \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  } while (0)

#define CHECK_ISNULLPTR(ptr)                                             \
  do {                                                                   \
    if (ptr != nullptr) {                                                \
      fprintf(stderr, "need nullptr %s:%d -> %s \n", __FILE__, __LINE__, \
              #ptr);                                                     \
      exit(EXIT_FAILURE);                                                \
    }                                                                    \
  } while (0)

// err
void code1() {
  cudaGraph_t graph = nullptr;
  CUDA_CHECK(cudaGraphCreate(&graph, 0));
  CHECK_NOTNULLPTR(graph);
  cudaGraph_t sub_graph = nullptr;
  CUDA_CHECK(cudaGraphCreate(&sub_graph, 0));
  CHECK_NOTNULLPTR(sub_graph);
  cudaGraphNode_t sub_graph_node = nullptr;
  CUDA_CHECK(cudaGraphAddChildGraphNode(&sub_graph_node, graph, nullptr, 0,
                                        sub_graph));
  CHECK_NOTNULLPTR(sub_graph_node);
  cudaGraphExec_t instance;
  cudaGraphNode_t err_node = nullptr;
  char buff[100] = {0};
  // Segmentation fault (below)
  CUDA_CHECK(cudaGraphInstantiate(&instance, graph, &err_node, buff, 100));
  CHECK_ISNULLPTR(err_node);
  CHECK_NOTNULLPTR(instance);
  CUDA_CHECK(cudaGraphDestroy(graph));
  CUDA_CHECK(cudaGraphExecDestroy(instance));
}

// pass
void code2() {
  cudaStream_t stream = nullptr;
  CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  CHECK_NOTNULLPTR(stream);
  void* ptr = nullptr;
  CUDA_CHECK(cudaMalloc(&ptr, 1024));
  CHECK_NOTNULLPTR(ptr);
  cudaGraph_t graph = nullptr;
  CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed));
  CUDA_CHECK(cudaMemsetAsync(ptr, 0, 1024, stream));
  CUDA_CHECK(cudaStreamEndCapture(stream, &graph));
  CHECK_NOTNULLPTR(graph);
  cudaGraphExec_t instance;
  cudaGraphNode_t err_node = nullptr;
  char buff[100] = {0};
  CUDA_CHECK(cudaGraphInstantiate(&instance, graph, &err_node, buff, 100));
  CHECK_ISNULLPTR(err_node);
  CHECK_NOTNULLPTR(instance);
  CUDA_CHECK(cudaGraphDestroy(graph));
  CUDA_CHECK(cudaGraphExecDestroy(instance));
  CUDA_CHECK(cudaFree(ptr));
}

// err
void code3() {
  cudaStream_t stream = nullptr;
  CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  CHECK_NOTNULLPTR(stream);
  void* ptr = nullptr;
  CUDA_CHECK(cudaMalloc(&ptr, 1024));
  cudaGraph_t graph = nullptr;
  CUDA_CHECK(cudaGraphCreate(&graph, 0));
  CHECK_NOTNULLPTR(graph);
  cudaGraph_t sub_graph = nullptr;
  CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed));
  CUDA_CHECK(cudaMemsetAsync(ptr, 0, 1024, stream));
  CUDA_CHECK(cudaStreamEndCapture(stream, &sub_graph));
  CHECK_NOTNULLPTR(sub_graph);

  cudaGraphNode_t sub_graph_node = nullptr;
  CUDA_CHECK(cudaGraphAddChildGraphNode(&sub_graph_node, graph, nullptr, 0,
                                        sub_graph));
  CHECK_NOTNULLPTR(sub_graph_node);
  cudaGraphExec_t instance;
  cudaGraphNode_t err_node = nullptr;
  char buff[100] = {0};
  CUDA_CHECK(cudaGraphInstantiate(&instance, graph, &err_node, buff, 100));
  CHECK_ISNULLPTR(err_node);
  CHECK_NOTNULLPTR(instance);
  CUDA_CHECK(cudaGraphDestroy(graph));
  // Invalid read of size 8 (below)
  CUDA_CHECK(cudaGraphExecDestroy(instance));
}

int main() {
  code1();
  // code2();
  // code3();
  return 0;
}

The valgrind log for processing code3 is:

==1742267== Memcheck, a memory error detector
==1742267== Copyright (C) 2002-2022, and GNU GPL'd, by Julian Seward et al.
==1742267== Using Valgrind-3.21.0 and LibVEX; rerun with -h for copyright info
==1742267== Command: ./a.out
==1742267== 
==1742267== Warning: noted but unhandled ioctl 0x30000001 with no direction hints.
==1742267==    This could cause spurious value errors to appear.
==1742267==    See README_MISSING_SYSCALL_OR_IOCTL for guidance on writing a proper wrapper.
==1742267== Warning: set address range perms: large range [0x200000000, 0x300200000) (noaccess)
==1742267== Warning: set address range perms: large range [0x7e67000, 0x27e66000) (noaccess)
==1742267== Invalid read of size 8
==1742267==    at 0x63B2F3E: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267==    by 0x63B2339: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267==    by 0x6469926: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267==    by 0x120A7D: __cudart967 (in /home/leon/simulation_pkg_ljq/a.out)
==1742267==    by 0x15CCE7: cudaGraphExecDestroy (in /home/leon/simulation_pkg_ljq/a.out)
==1742267==    by 0x11070A: code3() (in /home/leon/simulation_pkg_ljq/a.out)
==1742267==    by 0x110796: main (in /home/leon/simulation_pkg_ljq/a.out)
==1742267==  Address 0xc2f98a8 is 1,656 bytes inside a block of size 1,944 free'd
==1742267==    at 0x40382BB: free (vg_replace_malloc.c:974)
==1742267==    by 0x63B2339: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267==    by 0x63B349B: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267==    by 0x63B2339: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267==    by 0x6469BD4: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267==    by 0x120ADD: __cudart789 (in /home/leon/simulation_pkg_ljq/a.out)
==1742267==    by 0x15CE57: cudaGraphDestroy (in /home/leon/simulation_pkg_ljq/a.out)
==1742267==    by 0x110692: code3() (in /home/leon/simulation_pkg_ljq/a.out)
==1742267==    by 0x110796: main (in /home/leon/simulation_pkg_ljq/a.out)
==1742267==  Block was alloc'd at
==1742267==    at 0x403A853: calloc (vg_replace_malloc.c:1554)
==1742267==    by 0x63C336F: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267==    by 0x63C469D: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267==    by 0x63C581D: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267==    by 0x63C5BDC: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267==    by 0x6630D70: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267==    by 0x6462742: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267==    by 0x11EBBD: __cudart1263 (in /home/leon/simulation_pkg_ljq/a.out)
==1742267==    by 0x1587F3: cudaGraphAddChildGraphNode (in /home/leon/simulation_pkg_ljq/a.out)
==1742267==    by 0x11049F: code3() (in /home/leon/simulation_pkg_ljq/a.out)
==1742267==    by 0x110796: main (in /home/leon/simulation_pkg_ljq/a.out)
==1742267== 
==1742267== 
==1742267== HEAP SUMMARY:
==1742267==     in use at exit: 10,590,680 bytes in 10,839 blocks
==1742267==   total heap usage: 13,286 allocs, 2,447 frees, 36,750,923 bytes allocated
==1742267== 
==1742267== For a detailed leak analysis, rerun with: --leak-check=full
==1742267== 
==1742267== For lists of detected and suppressed errors, rerun with: -s
==1742267== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 0 from 0)

Let me know if you need further information or if there are other areas that require clarity.

Your code1 doesn’t make any sense to me. You are creating a graph without any nodes. If you wish to argue that your empty subgraph node is a contraindication, I don’t agree with you. If you prefer, I will state that you are creating an empty subgraph (no nodes in it) and then attempt to instantiate it. That doesn’t make any sense, and I don’t know of any suggestions in the CUDA docs that indicate that is OK. As soon as you add an actual node (code2) that issue seems to be resolved.

For the valgrind issue on code3, my suggestion would be to retest on the latest CUDA version, and if the issue still presents, then file a bug. It’s possible that there may be some API misuse in your example, but after a quick look I didn’t spot it.

Hi,

Thank you for your insightful response, Robert!

We have conducted further tests on CUDA 11.8, and I’m pleased to report that all tests were executed without any issues. I agree with your assessment regarding the code1 scenario; it indeed seems inconsequential and could potentially lead to undefined behavior due to the creation of a graph without any nodes.

Regarding code3, it appears that the issue may have been an internal bug in the older version, which seems to have been addressed in the latest release. I observed that invoking cudaGraphExecDestroy before cudaGraphDestroy did not lead to any errors or unsafe behavior in this old version.

Best Regards.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.