Host Hardware:
AMD EPYC 9554
Scenario:
- Linux Kernel 5.10 host with Tesla L40
- Single CentOS 7 VM
- Attempting to install the NVIDIA driver on the VM, with the GPU passed through from the host
On VM:
sh NVIDIA-Linux-x86_64-525.147.05.run
The installer fails with the error: Unable to load the kernel module ‘nvidia.ko’.
/var/log/nvidia-install.log shows the following:
Note that I have tried the same version of the NVIDIA driver on the host, and it works well there.
My server is Dell R7625.
I have read several topics saying that “Above 4G Decoding” should be enabled, but there is no such option in the BIOS.
Could anyone help me fix this?
HOST config:
[root@gpu0008 ~]# uname -a
Linux gpu0008.f.pri.idc2f.shanghai.tjdxspl 5.10.0-60.67.0.104.ule3.x86_64 #1 SMP Wed Aug 16 15:31:25 CST 2023 x86_64 x86_64 x86_64 GNU/Linux
[root@gpu0008 ~]# rpm -qa |grep kernel
kernel-headers-5.10.0-60.67.0.104.ule3.x86_64
kernel-devel-5.10.0-60.67.0.104.ule3.x86_64
kernel-tools-5.10.0-60.67.0.104.ule3.x86_64
kernel-5.10.0-60.67.0.104.ule3.x86_64
[root@gpu0008 ~]# lscpu | grep Virtualization
Virtualization: AMD-V
[root@gpu0008 ~]# lspci -nnk | grep -i nvidia
21:00.0 3D controller [0302]: NVIDIA Corporation Device [10de:26b5] (rev a1)
Subsystem: NVIDIA Corporation Device [10de:169d]
e1:00.0 3D controller [0302]: NVIDIA Corporation Device [10de:26b5] (rev a1)
Subsystem: NVIDIA Corporation Device [10de:169d]
[root@gpu0008 ~]# cat /etc/modprobe.d/vfio.conf
options vfio-pci ids=10de:26b5
[root@localhost /]# dmesg | grep -E "DMAR: IOMMU"
[root@gpu0008 ~]# dmesg | grep -i iommu
[ 0.000000] Command line: BOOT_IMAGE=/vmlinuz-5.10.0-60.67.0.104.ule3.x86_64 root=/dev/mapper/wocloud_vg-lv_root ro crashkernel=512M rd.lvm.lv=wocloud_vg/lv_root rd.driver.pre=vfio-pci pci=realloc amd_iommu=on iommu=pt rhgb quiet selinux=0
[ 0.016960] Kernel command line: BOOT_IMAGE=/vmlinuz-5.10.0-60.67.0.104.ule3.x86_64 root=/dev/mapper/wocloud_vg-lv_root ro crashkernel=512M rd.lvm.lv=wocloud_vg/lv_root rd.driver.pre=vfio-pci pci=realloc amd_iommu=on iommu=pt rhgb quiet selinux=0
[ 1.189708] iommu: Default domain type: Passthrough (set via kernel command line)
[ 1.432572] pci 0000:60:00.2: AMD-Vi: IOMMU performance counters supported
[ 1.432601] pci 0000:40:00.2: AMD-Vi: IOMMU performance counters supported
[ 1.432611] pci 0000:00:00.2: AMD-Vi: IOMMU performance counters supported
[ 1.432621] pci 0000:20:00.2: AMD-Vi: IOMMU performance counters supported
[ 1.432653] pci 0000:e0:00.2: AMD-Vi: IOMMU performance counters supported
[ 1.432666] pci 0000:c0:00.2: AMD-Vi: IOMMU performance counters supported
[ 1.432678] pci 0000:80:00.2: AMD-Vi: IOMMU performance counters supported
[ 1.432691] pci 0000:a0:00.2: AMD-Vi: IOMMU performance counters supported
[ 1.432733] pci 0000:00:00.3: Adding to iommu group 0
[ 1.432748] pci 0000:00:01.0: Adding to iommu group 1
[ 1.432757] pci 0000:00:01.1: Adding to iommu group 2
[ 1.432771] pci 0000:00:02.0: Adding to iommu group 3
[ 1.432784] pci 0000:00:03.0: Adding to iommu group 4
[ 1.432797] pci 0000:00:04.0: Adding to iommu group 5
[ 1.432816] pci 0000:00:05.0: Adding to iommu group 6
[ 1.432825] pci 0000:00:05.1: Adding to iommu group 7
[ 1.432843] pci 0000:00:07.0: Adding to iommu group 8
[ 1.432851] pci 0000:00:07.1: Adding to iommu group 8
[ 1.432870] pci 0000:00:14.0: Adding to iommu group 9
[ 1.432877] pci 0000:00:14.3: Adding to iommu group 9
[ 1.432927] pci 0000:00:18.0: Adding to iommu group 10
[ 1.432936] pci 0000:00:18.1: Adding to iommu group 10
[ 1.432946] pci 0000:00:18.2: Adding to iommu group 10
[ 1.432955] pci 0000:00:18.3: Adding to iommu group 10
[ 1.432964] pci 0000:00:18.4: Adding to iommu group 10
[ 1.432977] pci 0000:00:18.5: Adding to iommu group 10
[ 1.432986] pci 0000:00:18.6: Adding to iommu group 10
[ 1.432996] pci 0000:00:18.7: Adding to iommu group 10
[ 1.433047] pci 0000:00:19.0: Adding to iommu group 11
[ 1.433057] pci 0000:00:19.1: Adding to iommu group 11
[ 1.433065] pci 0000:00:19.2: Adding to iommu group 11
[ 1.433074] pci 0000:00:19.3: Adding to iommu group 11
[ 1.433082] pci 0000:00:19.4: Adding to iommu group 11
[ 1.433090] pci 0000:00:19.5: Adding to iommu group 11
[ 1.433099] pci 0000:00:19.6: Adding to iommu group 11
[ 1.433108] pci 0000:00:19.7: Adding to iommu group 11
[ 1.433118] pci 0000:01:00.0: Adding to iommu group 12
[ 1.433129] pci 0000:01:00.1: Adding to iommu group 13
[ 1.433143] pci 0000:02:00.0: Adding to iommu group 14
[ 1.433147] pci 0000:03:00.0: Adding to iommu group 8
[ 1.433151] pci 0000:03:00.1: Adding to iommu group 8
[ 1.433155] pci 0000:03:00.4: Adding to iommu group 8
[ 1.433158] pci 0000:03:00.5: Adding to iommu group 8
[ 1.433183] pci 0000:20:00.3: Adding to iommu group 15
[ 1.433199] pci 0000:20:01.0: Adding to iommu group 16
[ 1.433210] pci 0000:20:01.1: Adding to iommu group 17
[ 1.433225] pci 0000:20:02.0: Adding to iommu group 18
[ 1.433241] pci 0000:20:03.0: Adding to iommu group 19
[ 1.433259] pci 0000:20:04.0: Adding to iommu group 20
[ 1.433275] pci 0000:20:05.0: Adding to iommu group 21
[ 1.433296] pci 0000:20:07.0: Adding to iommu group 22
[ 1.433307] pci 0000:20:07.1: Adding to iommu group 22
[ 1.433318] pci 0000:21:00.0: Adding to iommu group 23
[ 1.433322] pci 0000:22:00.0: Adding to iommu group 22
[ 1.433325] pci 0000:22:00.1: Adding to iommu group 22
[ 1.433350] pci 0000:40:00.3: Adding to iommu group 24
[ 1.433364] pci 0000:40:01.0: Adding to iommu group 25
[ 1.433387] pci 0000:40:02.0: Adding to iommu group 26
[ 1.433401] pci 0000:40:03.0: Adding to iommu group 27
[ 1.433418] pci 0000:40:04.0: Adding to iommu group 28
[ 1.433432] pci 0000:40:05.0: Adding to iommu group 29
[ 1.433451] pci 0000:40:07.0: Adding to iommu group 30
[ 1.433461] pci 0000:40:07.1: Adding to iommu group 30
[ 1.433464] pci 0000:41:00.0: Adding to iommu group 30
[ 1.433466] pci 0000:41:00.1: Adding to iommu group 30
[ 1.433490] pci 0000:60:00.3: Adding to iommu group 31
[ 1.433504] pci 0000:60:01.0: Adding to iommu group 32
[ 1.433521] pci 0000:60:01.1: Adding to iommu group 33
[ 1.433535] pci 0000:60:02.0: Adding to iommu group 34
[ 1.433549] pci 0000:60:03.0: Adding to iommu group 35
[ 1.433562] pci 0000:60:04.0: Adding to iommu group 36
[ 1.433578] pci 0000:60:05.0: Adding to iommu group 37
[ 1.433588] pci 0000:60:05.2: Adding to iommu group 38
[ 1.433597] pci 0000:60:05.3: Adding to iommu group 39
[ 1.433621] pci 0000:60:07.0: Adding to iommu group 40
[ 1.433636] pci 0000:60:07.1: Adding to iommu group 40
[ 1.433647] pci 0000:60:07.2: Adding to iommu group 40
[ 1.433657] pci 0000:61:00.0: Adding to iommu group 41
[ 1.433660] pci 0000:62:00.0: Adding to iommu group 41
[ 1.433681] pci 0000:63:00.0: Adding to iommu group 42
[ 1.433692] pci 0000:63:00.1: Adding to iommu group 42
[ 1.433702] pci 0000:64:00.0: Adding to iommu group 43
[ 1.433712] pci 0000:64:00.1: Adding to iommu group 44
[ 1.433715] pci 0000:65:00.0: Adding to iommu group 40
[ 1.433722] pci 0000:65:00.1: Adding to iommu group 40
[ 1.433725] pci 0000:65:00.4: Adding to iommu group 40
[ 1.433727] pci 0000:66:00.0: Adding to iommu group 40
[ 1.433756] pci 0000:80:00.3: Adding to iommu group 45
[ 1.433770] pci 0000:80:01.0: Adding to iommu group 46
[ 1.433784] pci 0000:80:02.0: Adding to iommu group 47
[ 1.433797] pci 0000:80:03.0: Adding to iommu group 48
[ 1.433815] pci 0000:80:04.0: Adding to iommu group 49
[ 1.433829] pci 0000:80:05.0: Adding to iommu group 50
[ 1.433848] pci 0000:80:07.0: Adding to iommu group 51
[ 1.433860] pci 0000:80:07.1: Adding to iommu group 51
[ 1.433862] pci 0000:81:00.0: Adding to iommu group 51
[ 1.433868] pci 0000:81:00.1: Adding to iommu group 51
[ 1.433870] pci 0000:81:00.5: Adding to iommu group 51
[ 1.433895] pci 0000:a0:00.3: Adding to iommu group 52
[ 1.433909] pci 0000:a0:01.0: Adding to iommu group 53
[ 1.433919] pci 0000:a0:01.1: Adding to iommu group 54
[ 1.433933] pci 0000:a0:02.0: Adding to iommu group 55
[ 1.433946] pci 0000:a0:03.0: Adding to iommu group 56
[ 1.433960] pci 0000:a0:04.0: Adding to iommu group 57
[ 1.433973] pci 0000:a0:05.0: Adding to iommu group 58
[ 1.433996] pci 0000:a0:07.0: Adding to iommu group 59
[ 1.434008] pci 0000:a0:07.1: Adding to iommu group 59
[ 1.434022] pci 0000:a1:00.0: Adding to iommu group 60
[ 1.434031] pci 0000:a1:00.1: Adding to iommu group 61
[ 1.434039] pci 0000:a2:00.0: Adding to iommu group 59
[ 1.434042] pci 0000:a2:00.1: Adding to iommu group 59
[ 1.434066] pci 0000:c0:00.3: Adding to iommu group 62
[ 1.434079] pci 0000:c0:01.0: Adding to iommu group 63
[ 1.434093] pci 0000:c0:02.0: Adding to iommu group 64
[ 1.434110] pci 0000:c0:03.0: Adding to iommu group 65
[ 1.434123] pci 0000:c0:04.0: Adding to iommu group 66
[ 1.434137] pci 0000:c0:05.0: Adding to iommu group 67
[ 1.434155] pci 0000:c0:07.0: Adding to iommu group 68
[ 1.434167] pci 0000:c0:07.1: Adding to iommu group 68
[ 1.434170] pci 0000:c1:00.0: Adding to iommu group 68
[ 1.434176] pci 0000:c1:00.1: Adding to iommu group 68
[ 1.434200] pci 0000:e0:00.3: Adding to iommu group 69
[ 1.434214] pci 0000:e0:01.0: Adding to iommu group 70
[ 1.434224] pci 0000:e0:01.1: Adding to iommu group 71
[ 1.434241] pci 0000:e0:02.0: Adding to iommu group 72
[ 1.434255] pci 0000:e0:03.0: Adding to iommu group 73
[ 1.434269] pci 0000:e0:04.0: Adding to iommu group 74
[ 1.434283] pci 0000:e0:05.0: Adding to iommu group 75
[ 1.434301] pci 0000:e0:07.0: Adding to iommu group 76
[ 1.434314] pci 0000:e0:07.1: Adding to iommu group 76
[ 1.434324] pci 0000:e1:00.0: Adding to iommu group 77
[ 1.434327] pci 0000:e2:00.0: Adding to iommu group 76
[ 1.434330] pci 0000:e2:00.1: Adding to iommu group 76
[ 1.435116] pci 0000:60:00.2: AMD-Vi: Found IOMMU cap 0x40
[ 1.435120] pci 0000:40:00.2: AMD-Vi: Found IOMMU cap 0x40
[ 1.435122] pci 0000:00:00.2: AMD-Vi: Found IOMMU cap 0x40
[ 1.435124] pci 0000:20:00.2: AMD-Vi: Found IOMMU cap 0x40
[ 1.435126] pci 0000:e0:00.2: AMD-Vi: Found IOMMU cap 0x40
[ 1.435128] pci 0000:c0:00.2: AMD-Vi: Found IOMMU cap 0x40
[ 1.435130] pci 0000:80:00.2: AMD-Vi: Found IOMMU cap 0x40
[ 1.435132] pci 0000:a0:00.2: AMD-Vi: Found IOMMU cap 0x40
[ 1.451740] perf/amd_iommu: Detected AMD IOMMU #0 (2 banks, 4 counters/bank).
[ 1.451759] perf/amd_iommu: Detected AMD IOMMU #1 (2 banks, 4 counters/bank).
[ 1.451775] perf/amd_iommu: Detected AMD IOMMU #2 (2 banks, 4 counters/bank).
[ 1.451794] perf/amd_iommu: Detected AMD IOMMU #3 (2 banks, 4 counters/bank).
[ 1.451814] perf/amd_iommu: Detected AMD IOMMU #4 (2 banks, 4 counters/bank).
[ 1.451833] perf/amd_iommu: Detected AMD IOMMU #5 (2 banks, 4 counters/bank).
[ 1.451853] perf/amd_iommu: Detected AMD IOMMU #6 (2 banks, 4 counters/bank).
[ 1.451872] perf/amd_iommu: Detected AMD IOMMU #7 (2 banks, 4 counters/bank).
[root@gpu0008 ~]# grep pci /etc/modules-load.d/*
vfio_pci
[root@gpu0008 ~]# dmesg | grep -E vfio_pci
[ 1.861671] vfio_pci: add [10de:26b5[ffffffff:ffffffff]] class 0x000000/00000000
[ 436.327539] vfio_pci: vfio_pci_dma_fault_init: Get DOMAIN_ATTR_NESTING failed: -19.
[ 436.327540] vfio_pci: vfio_pci_dma_fault_response_init: Get DOMAIN_ATTR_NESTING failed: -19.
[ 954.367725] vfio_pci: vfio_pci_dma_fault_init: Get DOMAIN_ATTR_NESTING failed: -19.
[ 954.367726] vfio_pci: vfio_pci_dma_fault_response_init: Get DOMAIN_ATTR_NESTING failed: -19.
[root@gpu0008 ~]# virsh version
Compiled against library: libvirt 6.2.0
Using library: libvirt 6.2.0
Using API: QEMU 6.2.0
Running hypervisor: QEMU 3.0.0
[root@gpu0008 ~]# lsmod | grep kvm
kvm_amd 122880 19
ccp 118784 1 kvm_amd
kvm 978944 1 kvm_amd
irqbypass 16384 6 vfio_pci,kvm
VM config:
[root@localhost ~]# rpm -qa | grep kernel
kernel-devel-5.10.0-60.67.0.96.ule3.x86_64
kernel-tools-5.10.0-60.67.0.96.ule3.x86_64
kernel-headers-5.10.0-60.67.0.96.ule3.x86_64
kernel-5.10.0-60.67.0.96.ule3.x86_64
[root@vm1cent ~]# lspci -nnk | grep NVIDIA
00:07.0 3D controller [0302]: NVIDIA Corporation Device [10de:26b5] (rev a1)
Subsystem: NVIDIA Corporation Device [10de:169d]
[root@vm1cent ~]# grep -i nvidia /etc/modprobe.d/* /lib/modprobe.d/*
/lib/modprobe.d/dist-blacklist.conf:blacklist nvidiafb
echo 1 > /sys/bus/pci/devices/0000:00:08.0/remove
echo 1 > /sys/bus/pci/rescan
returns
[root@vm1cent ~]# echo 1 > /sys/bus/pci/rescan
dmesg


