Hi,
I encountered some errors while using cuda graph in my program.
Here are my environment settings:
From nvidia-smi
:
NVIDIA-SMI 525.89.02
Driver Version: 525.89.02
CUDA Version: 12.0
From nvcc
:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0
Below is a demo that includes three functions:
code1
will throws aSegmentation fault
when executingcudaGraphInstantiate
.code2
processes successfully. I am testing to see if it’s safe when I destroy the graph before destroying the instance.code3
reports an “Invalid read” error as indicated byvalgrind
.
#include <cuda_runtime_api.h>
#include <cstdio>
#include <iostream>
#define CUDA_CHECK(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", __FILE__, \
__LINE__, err, cudaGetErrorString(err), #call); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define CHECK_NOTNULLPTR(ptr) \
do { \
if (ptr == nullptr) { \
fprintf(stderr, "nullptr get %s:%d -> %s \n", __FILE__, __LINE__, #ptr); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define CHECK_ISNULLPTR(ptr) \
do { \
if (ptr != nullptr) { \
fprintf(stderr, "need nullptr %s:%d -> %s \n", __FILE__, __LINE__, \
#ptr); \
exit(EXIT_FAILURE); \
} \
} while (0)
// err
void code1() {
cudaGraph_t graph = nullptr;
CUDA_CHECK(cudaGraphCreate(&graph, 0));
CHECK_NOTNULLPTR(graph);
cudaGraph_t sub_graph = nullptr;
CUDA_CHECK(cudaGraphCreate(&sub_graph, 0));
CHECK_NOTNULLPTR(sub_graph);
cudaGraphNode_t sub_graph_node = nullptr;
CUDA_CHECK(cudaGraphAddChildGraphNode(&sub_graph_node, graph, nullptr, 0,
sub_graph));
CHECK_NOTNULLPTR(sub_graph_node);
cudaGraphExec_t instance;
cudaGraphNode_t err_node = nullptr;
char buff[100] = {0};
// Segmentation fault (below)
CUDA_CHECK(cudaGraphInstantiate(&instance, graph, &err_node, buff, 100));
CHECK_ISNULLPTR(err_node);
CHECK_NOTNULLPTR(instance);
CUDA_CHECK(cudaGraphDestroy(graph));
CUDA_CHECK(cudaGraphExecDestroy(instance));
}
// pass
void code2() {
cudaStream_t stream = nullptr;
CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
CHECK_NOTNULLPTR(stream);
void* ptr = nullptr;
CUDA_CHECK(cudaMalloc(&ptr, 1024));
CHECK_NOTNULLPTR(ptr);
cudaGraph_t graph = nullptr;
CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed));
CUDA_CHECK(cudaMemsetAsync(ptr, 0, 1024, stream));
CUDA_CHECK(cudaStreamEndCapture(stream, &graph));
CHECK_NOTNULLPTR(graph);
cudaGraphExec_t instance;
cudaGraphNode_t err_node = nullptr;
char buff[100] = {0};
CUDA_CHECK(cudaGraphInstantiate(&instance, graph, &err_node, buff, 100));
CHECK_ISNULLPTR(err_node);
CHECK_NOTNULLPTR(instance);
CUDA_CHECK(cudaGraphDestroy(graph));
CUDA_CHECK(cudaGraphExecDestroy(instance));
CUDA_CHECK(cudaFree(ptr));
}
// err
void code3() {
cudaStream_t stream = nullptr;
CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
CHECK_NOTNULLPTR(stream);
void* ptr = nullptr;
CUDA_CHECK(cudaMalloc(&ptr, 1024));
cudaGraph_t graph = nullptr;
CUDA_CHECK(cudaGraphCreate(&graph, 0));
CHECK_NOTNULLPTR(graph);
cudaGraph_t sub_graph = nullptr;
CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed));
CUDA_CHECK(cudaMemsetAsync(ptr, 0, 1024, stream));
CUDA_CHECK(cudaStreamEndCapture(stream, &sub_graph));
CHECK_NOTNULLPTR(sub_graph);
cudaGraphNode_t sub_graph_node = nullptr;
CUDA_CHECK(cudaGraphAddChildGraphNode(&sub_graph_node, graph, nullptr, 0,
sub_graph));
CHECK_NOTNULLPTR(sub_graph_node);
cudaGraphExec_t instance;
cudaGraphNode_t err_node = nullptr;
char buff[100] = {0};
CUDA_CHECK(cudaGraphInstantiate(&instance, graph, &err_node, buff, 100));
CHECK_ISNULLPTR(err_node);
CHECK_NOTNULLPTR(instance);
CUDA_CHECK(cudaGraphDestroy(graph));
// Invalid read of size 8 (below)
CUDA_CHECK(cudaGraphExecDestroy(instance));
}
int main() {
code1();
// code2();
// code3();
return 0;
}
The valgrind
log for processing code3
is:
==1742267== Memcheck, a memory error detector
==1742267== Copyright (C) 2002-2022, and GNU GPL'd, by Julian Seward et al.
==1742267== Using Valgrind-3.21.0 and LibVEX; rerun with -h for copyright info
==1742267== Command: ./a.out
==1742267==
==1742267== Warning: noted but unhandled ioctl 0x30000001 with no direction hints.
==1742267== This could cause spurious value errors to appear.
==1742267== See README_MISSING_SYSCALL_OR_IOCTL for guidance on writing a proper wrapper.
==1742267== Warning: set address range perms: large range [0x200000000, 0x300200000) (noaccess)
==1742267== Warning: set address range perms: large range [0x7e67000, 0x27e66000) (noaccess)
==1742267== Invalid read of size 8
==1742267== at 0x63B2F3E: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267== by 0x63B2339: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267== by 0x6469926: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267== by 0x120A7D: __cudart967 (in /home/leon/simulation_pkg_ljq/a.out)
==1742267== by 0x15CCE7: cudaGraphExecDestroy (in /home/leon/simulation_pkg_ljq/a.out)
==1742267== by 0x11070A: code3() (in /home/leon/simulation_pkg_ljq/a.out)
==1742267== by 0x110796: main (in /home/leon/simulation_pkg_ljq/a.out)
==1742267== Address 0xc2f98a8 is 1,656 bytes inside a block of size 1,944 free'd
==1742267== at 0x40382BB: free (vg_replace_malloc.c:974)
==1742267== by 0x63B2339: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267== by 0x63B349B: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267== by 0x63B2339: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267== by 0x6469BD4: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267== by 0x120ADD: __cudart789 (in /home/leon/simulation_pkg_ljq/a.out)
==1742267== by 0x15CE57: cudaGraphDestroy (in /home/leon/simulation_pkg_ljq/a.out)
==1742267== by 0x110692: code3() (in /home/leon/simulation_pkg_ljq/a.out)
==1742267== by 0x110796: main (in /home/leon/simulation_pkg_ljq/a.out)
==1742267== Block was alloc'd at
==1742267== at 0x403A853: calloc (vg_replace_malloc.c:1554)
==1742267== by 0x63C336F: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267== by 0x63C469D: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267== by 0x63C581D: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267== by 0x63C5BDC: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267== by 0x6630D70: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267== by 0x6462742: ??? (in /usr/lib/x86_64-linux-gnu/libcuda.so.525.89.02)
==1742267== by 0x11EBBD: __cudart1263 (in /home/leon/simulation_pkg_ljq/a.out)
==1742267== by 0x1587F3: cudaGraphAddChildGraphNode (in /home/leon/simulation_pkg_ljq/a.out)
==1742267== by 0x11049F: code3() (in /home/leon/simulation_pkg_ljq/a.out)
==1742267== by 0x110796: main (in /home/leon/simulation_pkg_ljq/a.out)
==1742267==
==1742267==
==1742267== HEAP SUMMARY:
==1742267== in use at exit: 10,590,680 bytes in 10,839 blocks
==1742267== total heap usage: 13,286 allocs, 2,447 frees, 36,750,923 bytes allocated
==1742267==
==1742267== For a detailed leak analysis, rerun with: --leak-check=full
==1742267==
==1742267== For lists of detected and suppressed errors, rerun with: -s
==1742267== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 0 from 0)
Let me know if you need further information or if there are other areas that require clarity.