Hi folks.
I am using cuda-gdb to attach to a PyTorch training job, but as soon as I attach, the process raises a segmentation fault. How can I handle this? Thanks.
The environment is:
name | value |
---|---|
gpu | NVIDIA A100-SXM4-80GB |
host driver | 470.82.01 |
container cuda version | 12.1 |
container cuda compat version | cuda-compat-12-1-530.30.02-1 |
torch | 2.1.0+cu12.1 |
cuda-gdb version:
NVIDIA (R) CUDA Debugger
CUDA Toolkit 12.1 release
Portions Copyright (C) 2007-2023 NVIDIA Corporation
GNU gdb (GDB) 12.1
Call stack from the core dump of the attached process:
#0 0x00007f361679b86a in ?? () from /usr/lib64/libcuda.so.1
#1 0x00007f361682829a in ?? () from /usr/lib64/libcuda.so.1
#2 0x00007f361682a27a in ?? () from /usr/lib64/libcuda.so.1
#3 0x00007f3616730e2d in ?? () from /usr/lib64/libcuda.so.1
#4 0x00007f36168db135 in ?? () from /usr/lib64/libcuda.so.1
#5 0x00007f362148bc79 in __cudart1043 () from /opt/conda/lib/python3.8/site-packages/torch/lib/libc10_cuda.so
#6 0x00007f36214c6275 in cudaDeviceSynchronize () from /opt/conda/lib/python3.8/site-packages/torch/lib/libc10_cuda.so
#7 0x00007f362147b1e4 in c10::cuda::device_synchronize() () from /opt/conda/lib/python3.8/site-packages/torch/lib/libc10_cuda.so
Disassembly around the faulting instruction in frame 0 (followed by the call target and the register state):
0x7f361679b84a: and $0x24,%al
0x7f361679b84c: add $0x8b492774,%eax
0x7f361679b851: rex.R and $0x48,%al
0x7f361679b854: mov 0x43a0(%rax),%ecx
0x7f361679b85a: test %ecx,%ecx
0x7f361679b85c: je 0x7f361679bad0
0x7f361679b862: pause
0x7f361679b864: mov %rbx,%rsi
0x7f361679b867: mov %rbp,%rdi
=> 0x7f361679b86a: callq 0x7f3616af3090
0x7f361679b86f: mov %eax,%r14d
0x7f361679b872: test %eax,%eax
0x7f361679b874: je 0x7f361679b848
0x7f361679b876: mov 0x1c(%rsp),%eax
0x7f361679b87a: movl $0x8,0x28(%rsp)
0x7f361679b882: mov %eax,0x2c(%rsp)
(gdb) x/10i 0x7f3616af3090
0x7f3616af3090: push %r15
0x7f3616af3092: mov %rsi,%r15
0x7f3616af3095: push %r14
0x7f3616af3097: push %r13
0x7f3616af3099: push %r12
0x7f3616af309b: push %rbp
0x7f3616af309c: mov %rdi,%rbp
0x7f3616af309f: push %rbx
0x7f3616af30a0: sub $0x28,%rsp
0x7f3616af30a4: mov (%rsi),%rax
(gdb) i r
rax 0x1bed76c2 468547266
rbx 0x7ffcc0a613a0 140723540595616
rcx 0x7ffcc0b13b12 140723541326610
rdx 0x0 0
rsi 0x7ffcc0a613a0 140723540595616
rdi 0x7ffcc0a61314 140723540595476
rbp 0x7ffcc0a61314 0x7ffcc0a61314
rsp 0x7ffcc0a612f0 0x7ffcc0a612f0
r8 0x7b46420 129262624
r9 0x100000000 4294967296
r10 0xffffffff00000000 -4294967296
r11 0x293 659
r12 0x77f6190 125788560
r13 0x1 1
r14 0x0 0
r15 0x7ffcc0a61320 140723540595488
rip 0x7f361679b86a 0x7f361679b86a
eflags 0x10246 [ PF ZF IF RF ]
cs 0x33 51
ss 0x2b 43
ds 0x0 0
es 0x0 0
fs 0x0 0
gs 0x0 0
k0 0x3f8000003f800000 4575657222473777152
k1 0x3f8000003f800000 4575657222473777152
k2 0x0 0
k3 0x0 0
k4 0x0 0
k5 0x0 0
k6 0x0 0
k7 0x0 0