Environment configuration
~$ cat /etc/os-release
NAME="CentOS Linux"
VERSION="8 (Core)"
ID="centos"
ID_LIKE="rhel fedora"
VERSION_ID="8"
PLATFORM_ID="platform:el8"
PRETTY_NAME="CentOS Linux 8 (Core)"
ANSI_COLOR="0;31"
CPE_NAME="cpe:/o:centos:centos:8"
HOME_URL="https://www.centos.org/"
BUG_REPORT_URL="https://bugs.centos.org/"
CENTOS_MANTISBT_PROJECT="CentOS-8"
CENTOS_MANTISBT_PROJECT_VERSION="8"
REDHAT_SUPPORT_PRODUCT="centos"
REDHAT_SUPPORT_PRODUCT_VERSION="8"
~$ uname -a
Linux my-h203e-server 4.18.0-193.el8.x86_64 #1 SMP Tue May 27 18:02:44 CST 2025 x86_64 x86_64 x86_64 GNU/Linux
~$ nvidia-smi -q -u
==============NVSMI LOG==============
Timestamp : Tue Jun 24 17:21:16 2025
Driver Version : 570.148.08
CUDA Version : 12.8
HIC Info : N/A
Attached Units : 0
~$ docker version
Client: Docker Engine - Community
 Version:           20.10.24
 API version:       1.41
 Go version:        go1.19.7
 Git commit:        297e128
 Built:             Tue Apr 4 18:20:23 2023
 OS/Arch:           linux/amd64
 Context:           default
 Experimental:      true

Server: Docker Engine - Community
 Engine:
  Version:          20.10.24
  API version:      1.41 (minimum version 1.12)
  Go version:       go1.19.7
  Git commit:       5d6db84
  Built:            Tue Apr 4 18:18:29 2023
  OS/Arch:          linux/amd64
  Experimental:     false
 containerd:
  Version:          1.6.25
  GitCommit:        d8f198a4ed8892c764191ef7b3b06d8a2eeb5c7f
 nvidia:
  Version:          1.1.10
  GitCommit:        v1.1.10-0-g18a0cb0
 docker-init:
  Version:          0.19.0
  GitCommit:        de40ad0
- NVIDIA container runtime version
~$ rpm -qa | grep nvidia-container
libnvidia-container-tools-1.17.8-1.x86_64
nvidia-container-toolkit-1.17.8-1.x86_64
libnvidia-container1-1.17.8-1.x86_64
nvidia-container-toolkit-base-1.17.8-1.x86_64
- NVIDIA container runtime config
~$ cat /etc/nvidia-container-runtime/config.toml
disable-require = false
#swarm-resource = "DOCKER_RESOURCE_GPU"
#accept-nvidia-visible-devices-envvar-when-unprivileged = true
#accept-nvidia-visible-devices-as-volume-mounts = false
[nvidia-container-cli]
#root = "/run/nvidia/driver"
#path = "/usr/bin/nvidia-container-cli"
environment = []
debug = "/var/log/nvidia-container-toolkit.log"
#ldcache = "/etc/ld.so.cache"
load-kmods = true
#no-cgroups = false
#user = "root:video"
ldconfig = "@/sbin/ldconfig"
[nvidia-container-runtime]
debug = "/var/log/nvidia-container-runtime.log"
Reproduction steps
- Start the PyTorch container and observe the garbled symbolic link files
~$ docker run --rm -it --entrypoint=/bin/bash --privileged --network=host --ipc=host --gpus=all nvcr.io/nvidia/pytorch:25.05-py3
root@my-h203e-server:/workspace# cudaCheck
CUDA Driver OK
root@my-h203e-server:/workspace# nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Wed_Apr__9_19:24:57_PDT_2025
Cuda compilation tools, release 12.9, V12.9.41
Build cuda_12.9.r12.9/compiler.35813241_0
root@my-h203e-server:/workspace# ldconfig
/sbin/ldconfig.real: /opt/amazon/aws-ofi-nccl/lib/ is not a symbolic link
/sbin/ldconfig.real: Can't link /usr/local/lib/llllllllbEEEES8_NS_4guts8typelist8typelistIJSA_SA_SA_SA_SA_SA_llllllllbEEEEELb0EJLm0ELm1ELm2ELm3ELm4ELm5ELm6ELm7ELm8ELm9ELm10ELm11ELm12ELm13ELm14EEJSA_SA_SA_SA_SA_SA_llllllllbEEENSt5decayINSG_21infer_function_traitsIT_E4type11return_typeEE4typeEPNS_14OperatorKernelENS_14DispatchKeySetEPSt6vectorINS_6IValueESaISX_EESt16integer_sequenceImJXspT1_EEEPNSI_IJDpT2_EEE to libtorchvision.so.1.0
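The second message suggests ldconfig read a garbled SONAME string from libtorchvision.so.1.0 before trying to create the soname link. A hedged way to check what is actually embedded in that file (assuming readelf from binutils is available in the image):
root@my-h203e-server:/workspace# readelf -d /usr/local/lib/libtorchvision.so.1.0 | grep -i soname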
root@my-h203e-server:/workspace# ll /opt/hpcx/ncclnet_plugin/lib/
total 684
lrwxrwxrwx 1 root root 22 Jun 24 09:26 ''$'\004\304\t' -> libnccl-tuner.so.0.0.0*
drwxr-xr-x 1 root root 26 Jun 24 09:26 ./
drwxr-xr-x 1 root root 17 May 6 16:43 ../
lrwxrwxrwx 1 root root 20 Jun 24 09:26 e -> libnccl-net.so.0.0.0*
lrwxrwxrwx 1 root root 14 May 6 16:43 libnccl-net-ncclnet.so -> libnccl-net.so*
lrwxrwxrwx 1 root root 20 May 6 16:43 libnccl-net-ncclnet.so.1 -> libnccl-net.so.0.0.0*
lrwxrwxrwx 1 root root 20 May 6 16:43 libnccl-net.so -> libnccl-net.so.0.0.0*
lrwxrwxrwx 1 root root 20 May 6 16:43 libnccl-net.so.0 -> libnccl-net.so.0.0.0*
-rwxr-xr-x 1 root root 651376 May 6 16:43 libnccl-net.so.0.0.0*
lrwxrwxrwx 1 root root 16 May 6 16:43 libnccl-tuner-ncclnet.so -> libnccl-tuner.so*
lrwxrwxrwx 1 root root 22 May 6 16:43 libnccl-tuner-ncclnet.so.1 -> libnccl-tuner.so.0.0.0*
lrwxrwxrwx 1 root root 22 May 6 16:43 libnccl-tuner.so -> libnccl-tuner.so.0.0.0*
lrwxrwxrwx 1 root root 22 May 6 16:43 libnccl-tuner.so.0 -> libnccl-tuner.so.0.0.0*
-rwxr-xr-x 1 root root 42520 May 6 16:43 libnccl-tuner.so.0.0.0*
root@my-h203e-server:/workspace# ldconfig
/sbin/ldconfig.real: /opt/amazon/aws-ofi-nccl/lib/ is not a symbolic link
/sbin/ldconfig.real: Can't link /usr/local/lib/llllllllbEEEES8_NS_4guts8typelist8typelistIJSA_SA_SA_SA_SA_SA_llllllllbEEEEELb0EJLm0ELm1ELm2ELm3ELm4ELm5ELm6ELm7ELm8ELm9ELm10ELm11ELm12ELm13ELm14EEJSA_SA_SA_SA_SA_SA_llllllllbEEENSt5decayINSG_21infer_function_traitsIT_E4type11return_typeEE4typeEPNS_14OperatorKernelENS_14DispatchKeySetEPSt6vectorINS_6IValueESaISX_EESt16integer_sequenceImJXspT1_EEEPNSI_IJDpT2_EEE to libtorchvision.so.1.0
lrwxrwxrwx 1 root root 22 Jun 24 09:26 ''$'\004\304\t' -> libnccl-tuner.so.0.0.0*
lrwxrwxrwx 1 root root 20 Jun 24 09:26 e -> libnccl-net.so.0.0.0*
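To capture all of the odd link names in one pass for the report (a hedged sketch, assuming GNU find and coreutils inside the container; only directories already seen above are scanned, and cat -A makes the non-printing bytes visible):
root@my-h203e-server:/workspace# find /opt/hpcx/ncclnet_plugin/lib /opt/amazon/aws-ofi-nccl/lib /usr/local/lib -maxdepth 1 -type l -printf '%p -> %l\n' | cat -A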
Thank you for the report! We just received one other report of something similar. So far the common factor appears to be a comparatively old host distro paired with the newer image (we have not seen a repro on newer host distros). We're using the details you provided to try to reproduce the issue and dig deeper.
Best,
Cliff Woolley
NVIDIA Deep Learning Frameworks Engineering