We have made some progress. Thanks for the help.
[root@node70 ~]# nvidia-smi
Tue Nov 1 14:06:12 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA GeForce ...  Off  | 00000000:03:00.0 Off |                  N/A |
| 32%   66C    P0    53W / 215W |      0MiB /  8192MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:04:00.0 Off |                  N/A |
| 44%   72C    P0    54W / 215W |      0MiB /  8192MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
[root@node70 ~]# ls /dev/nv*
/dev/nvidia0 /dev/nvidiactl /dev/nvidia-uvm /dev/nvram
/dev/nvidia1 /dev/nvidia-modeset /dev/nvidia-uvm-tools
/dev/nvidia-caps:
nvidia-cap1 nvidia-cap2
[root@node70 ~]# lsmod | grep nvid
nvidia_drm 61440 0
nvidia_modeset 1138688 1 nvidia_drm
nvidia_uvm 1236992 0
nvidia 54571008 2 nvidia_uvm,nvidia_modeset
drm_kms_helper 266240 5 drm_vram_helper,ast,nvidia_drm
drm 585728 8 drm_kms_helper,drm_vram_helper,ast,nvidia,drm_ttm_helper,nvidia_drm,ttm
[root@node70 ~]# find /lib/modules | grep nvid
/lib/modules/4.18.0-372.26.1.el8_6.x86_64/extra/drivers/video/nvidia
/lib/modules/4.18.0-372.26.1.el8_6.x86_64/extra/drivers/video/nvidia/nvidia-peermem.ko
/lib/modules/4.18.0-372.26.1.el8_6.x86_64/extra/drivers/video/nvidia/nvidia-drm.ko
/lib/modules/4.18.0-372.26.1.el8_6.x86_64/extra/drivers/video/nvidia/nvidia-modeset.ko
/lib/modules/4.18.0-372.26.1.el8_6.x86_64/extra/drivers/video/nvidia/nvidia-uvm.ko
/lib/modules/4.18.0-372.26.1.el8_6.x86_64/extra/drivers/video/nvidia/nvidia.ko
[root@node70 ~]# uname -r
4.18.0-372.26.1.el8_6.x86_64
[root@node70 ~]# !!
egrep -i 'dkms|nvid|cuda' /var/log/messages
Nov 1 14:04:12 node70 kernel: nvidia: loading out-of-tree module taints kernel.
Nov 1 14:04:12 node70 kernel: nvidia: module license 'NVIDIA' taints kernel.
Nov 1 14:04:12 node70 kernel: nvidia: module verification failed: signature and/or required key missing - tainting kernel
Nov 1 14:04:12 node70 kernel: nvidia-nvlink: Nvlink Core is being initialized, major device number 238
Nov 1 14:04:12 node70 kernel: nvidia 0000:03:00.0: enabling device (0100 -> 0103)
Nov 1 14:04:12 node70 kernel: nvidia 0000:03:00.0: vgaarb: changed VGA decodes: olddecodes=io+mem,decodes=none:owns=none
Nov 1 14:04:12 node70 systemd-udevd[1769]: Process '/usr/bin/bash -c '/usr/bin/mknod -Z -m 666 /dev/nvidiactl c $(grep nvidia-frontend /proc/devices | cut -d \  -f 1) 255'' failed with exit code 1.
Nov 1 14:04:12 node70 systemd-udevd[1769]: Process '/usr/bin/bash -c 'for i in $(cat /proc/driver/nvidia/gpus/*/information | grep Minor | cut -d \  -f 4); do /usr/bin/mknod -Z -m 666 /dev/nvidia${i} c $(grep nvidia-frontend /proc/devices | cut -d \  -f 1) ${i}; done'' failed with exit code 1.
Nov 1 14:04:12 node70 kernel: input: HDA NVidia HDMI/DP,pcm=3 as /devices/pci0000:00/0000:00:02.2/0000:03:00.1/sound/card1/input9
Nov 1 14:04:12 node70 kernel: input: HDA NVidia HDMI/DP,pcm=7 as /devices/pci0000:00/0000:00:02.2/0000:03:00.1/sound/card1/input10
Nov 1 14:04:12 node70 kernel: input: HDA NVidia HDMI/DP,pcm=8 as /devices/pci0000:00/0000:00:02.2/0000:03:00.1/sound/card1/input11
Nov 1 14:04:12 node70 kernel: input: HDA NVidia HDMI/DP,pcm=3 as /devices/pci0000:00/0000:00:03.0/0000:04:00.1/sound/card2/input16
Nov 1 14:04:12 node70 kernel: input: HDA NVidia HDMI/DP,pcm=9 as /devices/pci0000:00/0000:00:02.2/0000:03:00.1/sound/card1/input12
Nov 1 14:04:12 node70 kernel: input: HDA NVidia HDMI/DP,pcm=10 as /devices/pci0000:00/0000:00:02.2/0000:03:00.1/sound/card1/input13
Nov 1 14:04:12 node70 kernel: input: HDA NVidia HDMI/DP,pcm=7 as /devices/pci0000:00/0000:00:03.0/0000:04:00.1/sound/card2/input17
Nov 1 14:04:12 node70 kernel: input: HDA NVidia HDMI/DP,pcm=8 as /devices/pci0000:00/0000:00:03.0/0000:04:00.1/sound/card2/input18
Nov 1 14:04:12 node70 kernel: input: HDA NVidia HDMI/DP,pcm=11 as /devices/pci0000:00/0000:00:02.2/0000:03:00.1/sound/card1/input14
Nov 1 14:04:12 node70 kernel: input: HDA NVidia HDMI/DP,pcm=12 as /devices/pci0000:00/0000:00:02.2/0000:03:00.1/sound/card1/input15
Nov 1 14:04:12 node70 kernel: input: HDA NVidia HDMI/DP,pcm=9 as /devices/pci0000:00/0000:00:03.0/0000:04:00.1/sound/card2/input19
Nov 1 14:04:12 node70 kernel: input: HDA NVidia HDMI/DP,pcm=10 as /devices/pci0000:00/0000:00:03.0/0000:04:00.1/sound/card2/input20
Nov 1 14:04:12 node70 kernel: input: HDA NVidia HDMI/DP,pcm=11 as /devices/pci0000:00/0000:00:03.0/0000:04:00.1/sound/card2/input21
Nov 1 14:04:12 node70 kernel: input: HDA NVidia HDMI/DP,pcm=12 as /devices/pci0000:00/0000:00:03.0/0000:04:00.1/sound/card2/input22
Nov 1 14:04:13 node70 kernel: nvidia 0000:04:00.0: enabling device (0100 -> 0103)
Nov 1 14:04:13 node70 kernel: nvidia 0000:04:00.0: vgaarb: changed VGA decodes: olddecodes=io+mem,decodes=none:owns=none
Nov 1 14:04:13 node70 kernel: NVRM: loading NVIDIA UNIX x86_64 Kernel Module 520.61.05 Thu Sep 29 05:30:25 UTC 2022
Nov 1 14:04:13 node70 kernel: nvidia-uvm: Loaded the UVM driver, major device number 236.
Nov 1 14:04:13 node70 kernel: nvidia-modeset: Loading NVIDIA Kernel Mode Setting Driver for UNIX platforms 520.61.05 Thu Sep 29 05:29:37 UTC 2022
Nov 1 14:04:13 node70 kernel: [drm] [nvidia-drm] [GPU ID 0x00000300] Loading driver
Nov 1 14:04:13 node70 kernel: [drm] Initialized nvidia-drm 0.0.0 20160202 for 0000:03:00.0 on minor 1
Nov 1 14:04:13 node70 kernel: [drm] [nvidia-drm] [GPU ID 0x00000400] Loading driver
Nov 1 14:04:13 node70 kernel: [drm] Initialized nvidia-drm 0.0.0 20160202 for 0000:04:00.0 on minor 2
[root@node70 ~]#