A100, RHEL 8.2, Cuda 11.6
We are experiencing indefinite hangs when running certain binaries (but not all) under cuda-gdb. These binaries run fine outside the debugger. The hang seems to occur early in execution. The machine has just been set up. We haven’t seen these issues on our previous RHEL 8 machine with an A100 and CUDA 11.2. Any ideas would be appreciated.
An example program I threw together:
do_things.h
#pragma once
// Adds one to each of the first `numels` entries of `arr`.
// Each thread derives a flat index from its x thread/block coordinates and
// touches at most one element; threads indexed at or past `numels` do nothing.
__global__ void do_things(int numels, float* arr)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the tail: the launch may contain more threads than elements.
    if (idx >= numels)
    {
        return;
    }

    arr[idx] += 1.0f;
}
fun1.h
#pragma once
#include "do_things.h"
#include <iostream>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>
// Fills a two-element managed buffer on the host, runs do_things on it with a
// single 32-thread block, and prints the first element.
//
// Every CUDA runtime call and the kernel launch are checked. On a failure the
// error is written to std::cerr, the result line is skipped, and the buffer is
// still released before returning.
void someFunction1()
{
    const int n = 2;
    const size_t bytes = n * sizeof(float);
    float *x = nullptr;

    // Logs a failed runtime call and reports whether it succeeded.
    auto succeeded = [](cudaError_t status, const char *what) -> bool {
        if (status == cudaSuccess)
        {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        return false;
    };

    if (!succeeded(cudaMallocManaged(&x, bytes), "cudaMallocManaged"))
    {
        return;
    }

    for (int i = 0; i < n; i++)
    {
        x[i] = i * 0.00000001f;
    }

    // The prefetch is only a migration hint, so a failure is reported but the
    // run continues. The pending error is cleared so that it is not mistaken
    // for a launch error further down.
    if (!succeeded(cudaMemPrefetchAsync(x, bytes, 0), "cudaMemPrefetchAsync (continuing)"))
    {
        (void)cudaGetLastError();
    }

    bool ok = succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize (after prefetch)");
    if (ok)
    {
        do_things<<<1, 32>>>(n, x);
        // A launch returns nothing: configuration errors show up through
        // cudaGetLastError, execution errors at the next synchronizing call.
        ok = succeeded(cudaGetLastError(), "do_things launch")
          && succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize (after do_things)");
    }

    if (ok)
    {
        std::cout << "I did things! " << x[0] << std::endl;
    }

    succeeded(cudaFree(x), "cudaFree");
}
// Fills a two-element managed buffer on the host, runs a Thrust device
// reduction over it, and prints the first element of the buffer.
//
// Every CUDA runtime call is checked. On a failure the error is written to
// std::cerr, the result line is skipped, and the buffer is still released
// before returning.
void someFunction2()
{
    const int n = 2;
    const size_t bytes = n * sizeof(float);
    float *x = nullptr;

    // Logs a failed runtime call and reports whether it succeeded.
    auto succeeded = [](cudaError_t status, const char *what) -> bool {
        if (status == cudaSuccess)
        {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        return false;
    };

    if (!succeeded(cudaMallocManaged(&x, bytes), "cudaMallocManaged"))
    {
        return;
    }

    for (int i = 0; i < n; i++)
    {
        x[i] = i * 0.00000001f;
    }

    // The prefetch is only a migration hint, so a failure is reported but the
    // run continues; the pending error is cleared afterwards.
    if (!succeeded(cudaMemPrefetchAsync(x, bytes, 0), "cudaMemPrefetchAsync (continuing)"))
    {
        (void)cudaGetLastError();
    }

    bool ok = succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize (after prefetch)");
    if (ok)
    {
        // The sum is not part of the output below, so it is explicitly discarded.
        (void)thrust::reduce(thrust::device, x, x + n);
        ok = succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize (after thrust::reduce)");
    }

    if (ok)
    {
        std::cout << "I did things! " << x[0] << std::endl;
    }

    succeeded(cudaFree(x), "cudaFree");
}
// Multiplies two N x N float matrices held in managed memory with cublasSgemm
// (N = 1024) and prints the first element of the product.
//
// Changes relative to the earlier version of this function:
//   - buffers are sized with sizeof(float), matching their element type;
//   - both inputs are zero-filled, so the product no longer depends on
//     uninitialized memory;
//   - the whole of each buffer is prefetched rather than only its first N elements;
//   - the device is synchronized BEFORE c[0] is read on the host, because
//     cublasSgemm may return before the result has been written;
//   - the cuBLAS handle and all three buffers are released;
//   - every CUDA and cuBLAS status is checked and reported on std::cerr.
void someFunction3()
{
    const int N = 1 << 10;
    const size_t bytes = static_cast<size_t>(N) * N * sizeof(float);
    float *a = nullptr;
    float *b = nullptr;
    float *c = nullptr;
    cublasHandle_t handle;
    bool handleCreated = false;

    // Logs a failed CUDA runtime call and reports whether it succeeded.
    auto cudaOk = [](cudaError_t status, const char *what) -> bool {
        if (status == cudaSuccess)
        {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        return false;
    };

    // Logs a failed cuBLAS call (numeric status code) and reports whether it succeeded.
    auto cublasOk = [](cublasStatus_t status, const char *what) -> bool {
        if (status == CUBLAS_STATUS_SUCCESS)
        {
            return true;
        }
        std::cerr << what << " failed with cuBLAS status " << static_cast<int>(status) << std::endl;
        return false;
    };

    bool ok = cudaOk(cudaMallocManaged(&a, bytes), "cudaMallocManaged(a)")
           && cudaOk(cudaMallocManaged(&b, bytes), "cudaMallocManaged(b)")
           && cudaOk(cudaMallocManaged(&c, bytes), "cudaMallocManaged(c)");

    // Give the inputs a defined value. A byte-wise zero fill is a valid 0.0f.
    // c needs no fill: beta is 0, so it is a pure output of the multiply.
    ok = ok
      && cudaOk(cudaMemset(a, 0, bytes), "cudaMemset(a)")
      && cudaOk(cudaMemset(b, 0, bytes), "cudaMemset(b)");

    if (ok)
    {
        // The prefetch is only a migration hint, so a failure is reported but
        // the run continues; the pending error is cleared afterwards.
        float *const buffers[] = {a, b, c};
        for (float *buffer : buffers)
        {
            if (!cudaOk(cudaMemPrefetchAsync(buffer, bytes, 0), "cudaMemPrefetchAsync (continuing)"))
            {
                (void)cudaGetLastError();
            }
        }
        ok = cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize (after prefetch)");
    }

    if (ok)
    {
        handleCreated = cublasOk(cublasCreate(&handle), "cublasCreate");
        ok = handleCreated;
    }

    if (ok)
    {
        const float alpha = 1.0f;
        const float beta = 0.0f;
        ok = cublasOk(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                                  &alpha, a, N, b, N, &beta, c, N),
                      "cublasSgemm")
          // Wait for the multiply to finish before the host touches c.
          && cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize (after cublasSgemm)");
    }

    if (ok)
    {
        std::cout << "I did things! " << c[0] << std::endl;
    }

    if (handleCreated)
    {
        cublasOk(cublasDestroy(handle), "cublasDestroy");
    }

    // cudaFree performs no operation on a null pointer, so buffers whose
    // allocation never happened are safe to pass here.
    cudaOk(cudaFree(a), "cudaFree(a)");
    cudaOk(cudaFree(b), "cudaFree(b)");
    cudaOk(cudaFree(c), "cudaFree(c)");
}
testing123.cu
#include <iostream>
#include "fun1.h"
// Entry point of the reproducer. Enable one of the calls below to choose
// which scenario is executed; with all of them disabled the program simply
// exits with a success status.
int main(int argc, char *argv[])
{
    // Command-line arguments are not consulted.
    (void)argc;
    (void)argv;

    // someFunction1();
    // someFunction2();
    // someFunction3();

    return EXIT_SUCCESS;
}
When only one of someFunction1, someFunction2, or someFunction3 is enabled at a time (with the other two commented out), cuda-gdb freezes consistently on someFunction2 and someFunction3, and intermittently on someFunction1.
compiled with:
nvcc -g -G -O0 testing123.cu -o deleteme -lcublas
cuda-gdb output:
>> cuda-gdb deleteme
NVIDIA (R) CUDA Debugger
11.6 release
Portions Copyright (C) 2007-2022 NVIDIA Corporation
GNU gdb (GDB) 10.2
Copyright (C) 2021 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-pc-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<https://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from deleteme...
(cuda-gdb) r
Starting program: /local/ARBE/1151480/NE_benchmarking/testCudaAPI/deleteme
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
[Detaching after fork from child process 437261]
[New Thread 0x7fffd0a9b000 (LWP 437273)]
[New Thread 0x7fffbffff000 (LWP 437274)]
^C
Thread 1 "deleteme" received signal SIGINT, Interrupt.
0x00007fffee23c957 in sendmsg () from /lib64/libpthread.so.0
(cuda-gdb) b fun1.h:10
Breakpoint 1 at 0x404c7a: file fun1.h, line 11.
(cuda-gdb) r
The program being debugged has been started already.
Start it from the beginning? (y or n) y
Starting program: /local/ARBE/1151480/NE_benchmarking/testCudaAPI/deleteme
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
Breakpoint 1, someFunction1 () at fun1.h:11
11 int n = 2;
(cuda-gdb) s
12 cudaMallocManaged(&x, n*sizeof(float));
(cuda-gdb) s
cudaMallocManaged<float> (devPtr=0x7fffffffe068, size=8, flags=1) at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime.h:508
508 return ::cudaMallocManaged((void**)(void*)devPtr, size, flags);
(cuda-gdb) s
[Detaching after fork from child process 437412]
[New Thread 0x7fffd0a9b000 (LWP 437424)]
[New Thread 0x7fffbffff000 (LWP 437425)]
It never gets past that point.
system info:
nvidia-smi
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 CUDA Version: 11.6 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA A100-PCI... Off | 00000000:17:00.0 Off | 0 |
| N/A 32C P0 45W / 250W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 1 NVIDIA A100-PCI... Off | 00000000:65:00.0 Off | 0 |
| N/A 31C P0 41W / 250W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 2 NVIDIA A100-PCI... Off | 00000000:CA:00.0 Off | 0 |
| N/A 32C P0 45W / 250W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 3 NVIDIA A100-PCI... Off | 00000000:E3:00.0 Off | 0 |
| N/A 31C P0 42W / 250W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
compute-sanitizer
========= COMPUTE-SANITIZER
I did things! 1
========= ERROR SUMMARY: 0 errors