Cuda-gdb hangs indefinitely

A100, RHEL 8.2, Cuda 11.6
We are experiencing indefinite hangs using cuda-gdb on certain binaries, but not all. These binaries run fine outside the debugger. The hang seems to occur early. The machine has just been set up. We haven’t seen these issues on our previous RHEL 8 machine w/A100, Cuda 11.2. Any ideas appreciated.

An example program I threw together:

do_things.h

#pragma once

// Adds 1 to each of the first `numels` floats of `arr`, one element per thread.
// The global index is built from the x dimension of the launch only; threads
// whose index is >= numels leave the array untouched.
__global__ void do_things(int numels, float* arr)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the tail of the grid: the launch may have more threads than elements.
    if (idx >= numels)
    {
        return;
    }

    arr[idx] += 1.0f;
}

fun1.h

#pragma once

#include "do_things.h"
#include <thrust/reduce.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

void someFunction1()  // Repro case 1: managed buffer + do_things kernel launch, then prints x[0].
{
    float *x;
    int n = 2;  // number of floats in the managed buffer
    cudaMallocManaged(&x, n*sizeof(float));  // the cuda-gdb backtraces quoted in this thread are stopped inside this call
    // NOTE(review): no CUDA return code in this function is checked.
    for (int i = 0; i < n; i++)  // host-side initialisation of the managed buffer
    {
        x[i] = i*0.00000001;  // int * double literal, narrowed to float on the store
    }
    // Move the buffer to device 0 and wait for the prefetch to finish.
    cudaMemPrefetchAsync(x, n*sizeof(float), 0);
    cudaDeviceSynchronize();
    // One block of 32 threads; do_things only touches indices < n.
    do_things<<<1, 32>>>(n, x);
    // Wait for the kernel before the host reads x[0] below.
    cudaDeviceSynchronize();

    std::cout << "I did things! " << x[0] << std::endl;

    cudaFree(x);
}

// Repro case 2: fills a small managed buffer on the host, prefetches it to
// device 0, runs thrust::reduce over it with the thrust::device policy (the
// reduction result is discarded), then prints the first element.
// Return codes of the CUDA calls are not checked.
void someFunction2()
{
    const int count = 2;
    const size_t numBytes = count * sizeof(float);

    float *data;
    cudaMallocManaged(&data, numBytes);

    // Host-side initialisation: data[k] = k * 1e-8, computed in double and
    // narrowed to float on the store.
    int k = 0;
    while (k < count)
    {
        data[k] = k * 0.00000001;
        ++k;
    }

    // Migrate the buffer to device 0 and wait for the migration.
    cudaMemPrefetchAsync(data, numBytes, 0);
    cudaDeviceSynchronize();

    // Device-side reduction over [data, data + count); the sum is not used.
    float *const last = data + count;
    thrust::reduce(thrust::device, data, last);

    // Make sure device work is finished before the host touches the buffer.
    cudaDeviceSynchronize();

    std::cout << "I did things! " << data[0] << std::endl;

    cudaFree(data);
}

// Repro case 3: multiplies two N x N float matrices held in managed memory
// with cublasSgemm (c = 1.0 * a * b + 0.0 * c) and prints c[0].
//
// Differences from the naive version of this repro:
//   * the buffer size is computed from sizeof(float), the actual element type;
//   * a and b are zero-filled so the GEMM does not read indeterminate memory
//     (c is not read by the GEMM because beta is 0);
//   * the whole N*N buffer is prefetched, not just the first N elements;
//   * the device is synchronized BEFORE the host reads c[0], because
//     cublasSgemm returns before the computation has finished;
//   * the cuBLAS handle and the three managed buffers are released.
void someFunction3()
{
    const int N = 1 << 10;
    const size_t bytes = static_cast<size_t>(N) * N * sizeof(float);

    float *a, *b, *c;
    cudaMallocManaged(&a, bytes);
    cudaMallocManaged(&b, bytes);
    cudaMallocManaged(&c, bytes);

    // An all-zero byte pattern is a valid 0.0f, so a byte-wise memset is enough.
    cudaMemset(a, 0, bytes);
    cudaMemset(b, 0, bytes);

    cudaMemPrefetchAsync(a, bytes, 0);
    cudaMemPrefetchAsync(b, bytes, 0);
    cudaMemPrefetchAsync(c, bytes, 0);
    cudaDeviceSynchronize();

    cublasHandle_t handle;
    if (cublasCreate(&handle) == CUBLAS_STATUS_SUCCESS)
    {
        const float alpha = 1.0f;
        const float beta = 0.0f;
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, a, N, b, N, &beta, c, N);

        // The GEMM is asynchronous with respect to the host: wait for it
        // before touching the managed result buffer from the CPU.
        cudaDeviceSynchronize();

        std::cout << "I did things! " << c[0] << std::endl;

        cublasDestroy(handle);
    }
    else
    {
        // Without a valid handle the GEMM cannot be issued; report and fall
        // through to the cleanup below.
        std::cerr << "cublasCreate failed" << std::endl;
    }

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
}

testing123.cu

#include <iostream>
#include "fun1.h"

int main(int argc, char *argv[])  // argc and argv are not used
{
    // someFunction1();  // managed memory + do_things kernel
    // someFunction2();  // managed memory + thrust::reduce
    // someFunction3();  // managed memory + cublasSgemm
    // Uncomment exactly one of the calls above per run to reproduce a single case.
    return EXIT_SUCCESS;
}

When running only one of someFunction1, 2, or 3 (with the others commented out), cuda-gdb freezes consistently on someFunction2 and 3, and intermittently on someFunction1.

compiled with:

nvcc -g -G -O0 testing123.cu -o deleteme -lcublas

cuda-gdb output:

>> cuda-gdb deleteme 
NVIDIA (R) CUDA Debugger
11.6 release
Portions Copyright (C) 2007-2022 NVIDIA Corporation
GNU gdb (GDB) 10.2
Copyright (C) 2021 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-pc-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<https://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
    <http://www.gnu.org/software/gdb/documentation/>.

For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from deleteme...
(cuda-gdb) r
Starting program: /local/ARBE/1151480/NE_benchmarking/testCudaAPI/deleteme 
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
[Detaching after fork from child process 437261]
[New Thread 0x7fffd0a9b000 (LWP 437273)]
[New Thread 0x7fffbffff000 (LWP 437274)]
^C
Thread 1 "deleteme" received signal SIGINT, Interrupt.
0x00007fffee23c957 in sendmsg () from /lib64/libpthread.so.0
(cuda-gdb) b fun1.h:10
Breakpoint 1 at 0x404c7a: file fun1.h, line 11.
(cuda-gdb) r
The program being debugged has been started already.
Start it from the beginning? (y or n) y
Starting program: /local/ARBE/1151480/NE_benchmarking/testCudaAPI/deleteme 
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time

Breakpoint 1, someFunction1 () at fun1.h:11
11	    int n = 2;
(cuda-gdb) s
12	    cudaMallocManaged(&x, n*sizeof(float));
(cuda-gdb) s
cudaMallocManaged<float> (devPtr=0x7fffffffe068, size=8, flags=1) at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h:508
508	  return ::cudaMallocManaged((void**)(void*)devPtr, size, flags);
(cuda-gdb) s
[Detaching after fork from child process 437412]
[New Thread 0x7fffd0a9b000 (LWP 437424)]
[New Thread 0x7fffbffff000 (LWP 437425)]

it never gets past that point

system info:
nvidia-smi

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA A100-PCI...  Off  | 00000000:17:00.0 Off |                    0 |
| N/A   32C    P0    45W / 250W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-PCI...  Off  | 00000000:65:00.0 Off |                    0 |
| N/A   31C    P0    41W / 250W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   2  NVIDIA A100-PCI...  Off  | 00000000:CA:00.0 Off |                    0 |
| N/A   32C    P0    45W / 250W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   3  NVIDIA A100-PCI...  Off  | 00000000:E3:00.0 Off |                    0 |
| N/A   31C    P0    42W / 250W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

compute-sanitizer

========= COMPUTE-SANITIZER
I did things! 1
========= ERROR SUMMARY: 0 errors

bump
I am having the same issue

Hi @user150689 , @cg1733831 Thank you for your report! The CUDA debugger team is investigating the issue. While we are looking at the problem, could you please try the following:

  • Based on your log, the debugger hangs on the host inside the cudaMallocManaged function. Could you try using the next command to step over the cudaMallocManaged call on line 12?
  • Did you observe similar issues (debugger hanging) when stepping into other functions?
  • I see that you are using CUDA 11.6 - would you be able to try CUDA 11.7?

It never gets past [New Thread … for me to step any further

I have not observed similar issues when stepping into other functions. It seems to only happen when I use CUDA/Thrust API calls

It would take a long time to get 11.7 installed since I don’t have root and need to get others involved

FYI cg1733831 and I are on the same team.

I am having trouble reproducing this internally on an A100. Can you try setting the following env var and running on your end again to see if the hanging behavior resolves itself:

export CUDA_VISIBLE_DEVICES=0

No luck, same result

looks like it got cut off in the original post. idk if this helps shed any more light on the issue.

[Detaching after fork from child process 196809]
[New Thread 0x7fffd0a9b000 (LWP 196815)]
[New Thread 0x7fffbffff000 (LWP 196816)]
I did things! 1
[New Thread 0x7fffbe904000 (LWP 196817)]
^C
Thread 1 "thisIsATest" received signal SIGINT, Interrupt.
0x00007fffee23c957 in sendmsg () from /lib64/libpthread.so.0
(cuda-gdb) quit

particularly the " 0x00007fffee23c957 in sendmsg () from /lib64/libpthread.so.0 " line

Hi @cg1733831 we are still looking at the issue (having trouble reproducing it locally). Could you try a few more things:

  • Collect backtrace when thread is stopped in 0x00007fffee23c957 in sendmsg () from /lib64/libpthread.so.0
    • Before doing (cuda-gdb) quit please do (cuda-gdb) bt
  • Can you reproduce the same issue using standard gdb 10.2?

cuda 10.2: NOTE: this is a different server with a Tesla T4 and RHEL7, but it is the only installation of 10.2 I have available to me (so probably not much use)

[New Thread 0x7fffe909d700 (LWP 325025)]
^C
Thread 1 "testing123" received signal SIGINT, Interrupt.
0x00007ffff370fc6d in sendmsg () from /lib64/libpthread.so.0
(cuda-gdb) bt
#0  0x00007ffff370fc6d in sendmsg () from /lib64/libpthread.so.0
#1  0x00007fffef9f6f40 in cudbgApiDetach () from /lib64/libcuda.so.1
#2  0x00007fffef9f7332 in cudbgApiDetach () from /lib64/libcuda.so.1
#3  0x00007fffef9ef36a in cudbgReportDriverInternalError () from /lib64/libcuda.so.1
#4  0x00007fffef9eff50 in cudbgReportDriverInternalError () from /lib64/libcuda.so.1
#5  0x00007fffef9f34e7 in cudbgReportDriverInternalError () from /lib64/libcuda.so.1
#6  0x00007fffef9f3609 in cudbgReportDriverInternalError () from /lib64/libcuda.so.1
#7  0x00007fffefaa48be in cuEGLApiInit () from /lib64/libcuda.so.1
#8  0x00007fffef9dcaf8 in cuMemGetAttribute_v2 () from /lib64/libcuda.so.1
#9  0x00007fffef9dd08c in cuMemGetAttribute_v2 () from /lib64/libcuda.so.1
#10 0x000000000042d6fa in cudart::contextState::loadCubin(bool*, cudart::globalModule*) ()
#11 0x0000000000420820 in cudart::globalModule::loadIntoContext(cudart::contextState*) ()
#12 0x000000000042ca4a in cudart::contextState::applyChanges() ()
#13 0x000000000043074f in cudart::contextStateManager::initRuntimeContextState_nonreentrant(cudart::contextState**) ()
#14 0x0000000000430eba in cudart::contextStateManager::getRuntimeContextState(cudart::contextState**, bool) ()
#15 0x00000000004244ec in cudart::doLazyInitContextState() ()
#16 0x0000000000410450 in cudart::cudaApiMallocManaged(void**, unsigned long, unsigned int) ()
#17 0x0000000000446b70 in cudaMallocManaged ()
#18 0x0000000000405610 in cudaMallocManaged<float> (devPtr=0x7fffffffe168, size=8, flags=1)
    at /usr/local/cuda-10.2/bin/../targets/x86_64-linux/include/cuda_runtime.h:506
#19 0x00000000004045bf in someFunction1 () at fun1.h:12
#20 0x00000000004049a1 in main (argc=1, argv=0x7fffffffe2b8) at testing123.cu:7

backtrace for CUDA 11.6

[New Thread 0x7fffbffff000 (LWP 350774)]
^C
Thread 1 "testing123" received signal SIGINT, Interrupt.
0x00007fffee23c957 in sendmsg () from /lib64/libpthread.so.0
(cuda-gdb) bt
#0  0x00007fffee23c957 in sendmsg () from /lib64/libpthread.so.0
#1  0x00007fffd6e98ea0 in ?? () from /lib64/libcuda.so.1
#2  0x00007fffd6e9d9ff in ?? () from /lib64/libcuda.so.1
#3  0x00007fffd6e9e764 in ?? () from /lib64/libcuda.so.1
#4  0x00007fffd6e9e976 in ?? () from /lib64/libcuda.so.1
#5  0x00007fffd6c955eb in ?? () from /lib64/libcuda.so.1
#6  0x00007fffd6c95bad in ?? () from /lib64/libcuda.so.1
#7  0x00007fffd6d34efa in ?? () from /lib64/libcuda.so.1
#8  0x00007fffd6d3560b in ?? () from /lib64/libcuda.so.1
#9  0x00000000004365f4 in __cudart570 ()
#10 0x000000000042606e in __cudart615 ()
#11 0x000000000043cd84 in __cudart544 ()
#12 0x00000000004414ea in __cudart789 ()
#13 0x0000000000441764 in __cudart779 ()
#14 0x0000000000433b9f in __cudart953 ()
#15 0x0000000000416fad in __cudart843 ()
#16 0x0000000000456050 in cudaMallocManaged ()
#17 0x0000000000405fce in cudaMallocManaged<float> (devPtr=0x7fffffffdea8, size=8, flags=1) at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h:508
#18 0x0000000000404ca2 in someFunction1 () at <filePath>/fun1.h:12
#19 0x0000000000405058 in main (argc=1, argv=0x7fffffffdfd8) at <filePath>/testing123.cu:7

Could you also show the output from info threads, select each thread using the thread command, and then show a backtrace from each thread?

When you encounter the hang, how long are you waiting before giving up? I would like to rule out the case that this is a performance anomaly versus a hang.

Also please verify that you do not have an LD_PRELOAD set in your environment.

The longest I have waited is about 15 minutes, but I usually lose patience after about 2-3 minutes.

> echo $LD_PRELOAD 
LD_PRELOAD: Undefined variable.
[New Thread 0x7fffbffff000 (LWP 495286)]
^C
Thread 1 "testing123" received signal SIGINT, Interrupt.
0x00007fffee23c957 in sendmsg () from /lib64/libpthread.so.0
(cuda-gdb) thread 0
Invalid thread ID: 0
(cuda-gdb) bt    
#0  0x00007fffee23c957 in sendmsg () from /lib64/libpthread.so.0
#1  0x00007fffd6e98ea0 in ?? () from /lib64/libcuda.so.1
#2  0x00007fffd6e9d9ff in ?? () from /lib64/libcuda.so.1
#3  0x00007fffd6e9e764 in ?? () from /lib64/libcuda.so.1
#4  0x00007fffd6e9e976 in ?? () from /lib64/libcuda.so.1
#5  0x00007fffd6c955eb in ?? () from /lib64/libcuda.so.1
#6  0x00007fffd6c95bad in ?? () from /lib64/libcuda.so.1
#7  0x00007fffd6d34efa in ?? () from /lib64/libcuda.so.1
#8  0x00007fffd6d3560b in ?? () from /lib64/libcuda.so.1
#9  0x00000000004365f4 in __cudart570 ()
#10 0x000000000042606e in __cudart615 ()
#11 0x000000000043cd84 in __cudart544 ()
#12 0x00000000004414ea in __cudart789 ()
#13 0x0000000000441764 in __cudart779 ()
#14 0x0000000000433b9f in __cudart953 ()
#15 0x0000000000416fad in __cudart843 ()
#16 0x0000000000456050 in cudaMallocManaged ()
#17 0x0000000000405fce in cudaMallocManaged<float> (devPtr=0x7fffffffdea8, size=8, flags=1) at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h:508
#18 0x0000000000404ca2 in someFunction1 () at /ARBE/1151480/NE_benchmarking/testCudaAPI/fun1.h:12
#19 0x0000000000405058 in main (argc=1, argv=0x7fffffffdfd8) at /ARBE/1151480/NE_benchmarking/testCudaAPI/testing123.cu:7
(cuda-gdb) thread 1
[Switching to thread 1 (Thread 0x7ffff7fe5000 (LWP 495249))]
#0  0x00007fffee23c957 in sendmsg () from /lib64/libpthread.so.0
(cuda-gdb) bt
#0  0x00007fffee23c957 in sendmsg () from /lib64/libpthread.so.0
#1  0x00007fffd6e98ea0 in ?? () from /lib64/libcuda.so.1
#2  0x00007fffd6e9d9ff in ?? () from /lib64/libcuda.so.1
#3  0x00007fffd6e9e764 in ?? () from /lib64/libcuda.so.1
#4  0x00007fffd6e9e976 in ?? () from /lib64/libcuda.so.1
#5  0x00007fffd6c955eb in ?? () from /lib64/libcuda.so.1
#6  0x00007fffd6c95bad in ?? () from /lib64/libcuda.so.1
#7  0x00007fffd6d34efa in ?? () from /lib64/libcuda.so.1
#8  0x00007fffd6d3560b in ?? () from /lib64/libcuda.so.1
#9  0x00000000004365f4 in __cudart570 ()
#10 0x000000000042606e in __cudart615 ()
#11 0x000000000043cd84 in __cudart544 ()
#12 0x00000000004414ea in __cudart789 ()
#13 0x0000000000441764 in __cudart779 ()
#14 0x0000000000433b9f in __cudart953 ()
#15 0x0000000000416fad in __cudart843 ()
#16 0x0000000000456050 in cudaMallocManaged ()
#17 0x0000000000405fce in cudaMallocManaged<float> (devPtr=0x7fffffffdea8, size=8, flags=1) at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h:508
#18 0x0000000000404ca2 in someFunction1 () at /ARBE/1151480/NE_benchmarking/testCudaAPI/fun1.h:12
#19 0x0000000000405058 in main (argc=1, argv=0x7fffffffdfd8) at /ARBE/1151480/NE_benchmarking/testCudaAPI/testing123.cu:7
(cuda-gdb) thread 2
[Switching to thread 2 (Thread 0x7fffd0a9b000 (LWP 495285))]
#0  0x00007fffed423a71 in poll () from /lib64/libc.so.6
(cuda-gdb) bt
#0  0x00007fffed423a71 in poll () from /lib64/libc.so.6
#1  0x00007fffd6d033c1 in ?? () from /lib64/libcuda.so.1
#2  0x00007fffd6d0ed3a in ?? () from /lib64/libcuda.so.1
#3  0x00007fffd6cfdf16 in ?? () from /lib64/libcuda.so.1
#4  0x00007fffee23217a in start_thread () from /lib64/libpthread.so.0
#5  0x00007fffed42edf3 in clone () from /lib64/libc.so.6
(cuda-gdb) thread 3
[Switching to thread 3 (Thread 0x7fffbffff000 (LWP 495286))]
#0  0x00007fffed423a71 in poll () from /lib64/libc.so.6
(cuda-gdb) bt
#0  0x00007fffed423a71 in poll () from /lib64/libc.so.6
#1  0x00007fffd6d033c1 in ?? () from /lib64/libcuda.so.1
#2  0x00007fffd6d0ed3a in ?? () from /lib64/libcuda.so.1
#3  0x00007fffd6cfdf16 in ?? () from /lib64/libcuda.so.1
#4  0x00007fffee23217a in start_thread () from /lib64/libpthread.so.0
#5  0x00007fffed42edf3 in clone () from /lib64/libc.so.6
(cuda-gdb) thread 4
Unknown thread 4.

and it repeats “Unknown thread” for the rest of the threads