Host Hardware:
supermicro x8dtl-if,dual intel 3.33GHz,96GiB RAM
HOST_lshw_short.log (12.0 KB)
Scenario:
- CentOS 7 host with Tesla P40
- Single CentOS 7 VM
- Attempting GPU-passthrough CUDA install on VM
On VM:
First attempt: sh NVIDIA-Linux-x86_64-440.33.01.run
This fails with the error: Unable to load the kernel module 'nvidia.ko'.
rpm -i cuda-repo-rhel7-10-2-local-10.2.89-440.33.01-1.0-1.x86_64.rpm
yum clean all
yum -y install nvidia-driver-latest-dkms
After the DKMS install and a reboot, I started seeing this in dmesg (it was not present before):
[ 6.741814] NVRM: This PCI I/O region assigned to your NVIDIA device is invalid:
[ 6.741814] NVRM: BAR1 is 0M @ 0x0 (PCI:0000:00:08.0)
[ 6.745474] NVRM: The system BIOS may have misconfigured your GPU.
[ 6.747561] nvidia: probe of 0000:00:08.0 failed with error -1
[ 6.750235] NVRM: The NVIDIA probe routine failed for 1 device(s).
[ 6.752146] NVRM: None of the NVIDIA devices were initialized.
[ 6.754248] nvidia-nvlink: Unregistered the Nvlink Core, major device number 242
2nd try at sh NVIDIA-Linux-x86_64-440.33.01.run
Install passes previous ‘nvidia.ko’ error point
Building kernel ---- 100%
Would you like to register the kernel module sources with DKMS ---- yes
WARNING: nvidia-installer was forced to guess the X library path… ---- OK
Install NVIDIA’s 32-bit compatibility libraries? ---- yes
The install reaches "Installing DKMS kernel module", gets up to 89%, then hangs and fails, returning the same error:
This PCI I/O region assigned to your NVIDIA device is invalid: etc…
VM-nvidia-bug-report.log (134.6 KB)
VM-nvidia-installer.log (8.5 KB)
HOST config:
[root@localhost /]# uname -a
Linux localhost.localdomain 3.10.0-1062.18.1.el7.x86_64 #1 SMP Tue Mar 17 23:49:17 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux
[root@localhost /]# rpm -qa | grep kernel
kernel-tools-3.10.0-1062.18.1.el7.x86_64
kernel-3.10.0-1062.el7.x86_64
kernel-3.10.0-1062.18.1.el7.x86_64
kernel-tools-libs-3.10.0-1062.18.1.el7.x86_64
abrt-addon-kerneloops-2.1.11-55.el7.centos.x86_64
[root@localhost /]# lscpu | grep Virtualization
Virtualization: VT-x
[root@localhost /]# lspci -nnk | grep NVIDIA
03:00.0 3D controller [0302]: NVIDIA Corporation GP102GL [Tesla P40] [10de:1b38] (rev a1)
Subsystem: NVIDIA Corporation Device [10de:11d9]
[root@localhost /]# cat /etc/modprobe.d/vfio.conf
options vfio-pci ids=10de:1b38
[root@localhost /]# dmesg | grep -E "DMAR: IOMMU"
[ 0.000000] DMAR: IOMMU enabled
[root@localhost /]# grep pci /etc/modules-load.d/*
vfio-pci
[root@localhost /]# dmesg | grep -E vfio_pci
[ 3.534008] vfio_pci: add [10de:1b38[ffff:ffff]] class 0x000000/00000000
[root@localhost /]# systemctl status libvirtd | grep active
Active: active (running) since Fri 2020-03-27 05:10:26 CDT; 1h 10min ago
[root@localhost /]# virsh version
Compiled against library: libvirt 4.5.0
Using library: libvirt 4.5.0
Using API: QEMU 4.5.0
Running hypervisor: QEMU 2.12.0
[root@localhost /]# lsmod | grep kvm
kvm_intel 188688 0
kvm 636921 1 kvm_intel
irqbypass 13503 2 kvm,vfio_pci
VM config:
[root@vm1cent ~]# hostnamectl
Static hostname: vm1cent
Icon name: computer-vm
Chassis: vm
Machine ID: 725498d49ec2499491dc7b886f9707bc
Boot ID: dba31291ea954d5c9971ad2900776a12
Virtualization: kvm
Operating System: CentOS Linux 7 (Core)
CPE OS Name: cpe:/o:centos:centos:7
Kernel: Linux 3.10.0-1062.18.1.el7.x86_64
Architecture: x86-64
[root@localhost ~]# rpm -qa | grep kernel
kernel-tools-libs-3.10.0-1062.18.1.el7.x86_64
kernel-headers-3.10.0-1062.18.1.el7.x86_64
kernel-devel-3.10.0-1062.18.1.el7.x86_64
kernel-3.10.0-1062.18.1.el7.x86_64
kernel-tools-3.10.0-1062.18.1.el7.x86_64
kernel-3.10.0-957.el7.x86_64
[root@vm1cent ~]# dkms status
nvidia, 440.33.01, 3.10.0-1062.18.1.el7.x86_64, x86_64: installed
[root@vm1cent ~]# lspci -nnk | grep NVIDIA
00:08.0 3D controller [0302]: NVIDIA Corporation GP102GL [Tesla P40] [10de:1b38] (rev a1)
Subsystem: NVIDIA Corporation Device [10de:11d9]
[root@vm1cent ~]# grep -i nvidia /etc/modprobe.d/* /lib/modprobe.d/*
/lib/modprobe.d/dist-blacklist.conf:#blacklist nvidiafb <I commented this line out with a leading #>
/lib/modprobe.d/nvidia.conf:#options nvidia-drm modeset=1
/lib/modprobe.d/nvidia-uvm.conf:# Make a soft dependency for nvidia-uvm as adding the module loading to
/lib/modprobe.d/nvidia-uvm.conf:# /usr/lib/modules-load.d/nvidia-uvm.conf for systemd consumption, makes the
/lib/modprobe.d/nvidia-uvm.conf:softdep nvidia post: nvidia-uvm
[root@vm1cent ~]# LC_ALL=C lscpu | grep Virtualization
Virtualization: VT-x
Virtualization type: full
[root@vm1cent ~]# cat /etc/default/grub
GRUB_TIMEOUT=5
GRUB_DISTRIBUTOR="$(sed 's, release .*$,,g' /etc/system-release)"
GRUB_DEFAULT=saved
GRUB_DISABLE_SUBMENU=true
GRUB_TERMINAL_OUTPUT="console"
GRUB_CMDLINE_LINUX="crashkernel=auto rd.lvm.lv=centos/root rd.lvm.lv=centos/swap nomodeset console=ttyS0 rd.driver.blacklist=nouveau nouveau.modeset=0 modprobe.blacklist=nouveau nvidia-drm.modeset=1"
GRUB_DISABLE_RECOVERY="true"
[root@vm1cent ~]# cat /etc/modprobe.d/blacklist.conf
blacklist nouveau
options nouveau modeset=0
echo 1 > /sys/bus/pci/devices/0000:00:08.0/remove
echo 1 > /sys/bus/pci/rescan
returns
[root@vm1cent ~]# echo 1 > /sys/bus/pci/rescan
[ 343.698407] pci 0000:00:08.0: 32.000 Gb/s available PCIe bandwidth, limited by 5 GT/s x8 link at 0000:00:08.0 (capable of 126.016 Gb/s with 8 GT/s x16 link)
[ 343.708168] pci 0000:00:08.0: BAR 0: assigned [mem 0xc0000000-0xc0ffffff]
[root@vm1cent ~]# [ 344.726488] nvidia-nvlink: Nvlink Core is being initialized, major device number 242
[ 344.777592] NVRM: This PCI I/O region assigned to your NVIDIA device is invalid:
[ 344.777592] NVRM: BAR1 is 0M @ 0x0 (PCI:0000:00:08.0)
[ 344.781789] NVRM: The system BIOS may have misconfigured your GPU.
[ 344.783720] nvidia: probe of 0000:00:08.0 failed with error -1
[ 344.785557] NVRM: The NVIDIA probe routine failed for 1 device(s).
[ 344.787536] NVRM: None of the NVIDIA devices were initialized.
[ 344.789624] nvidia-nvlink: Unregistered the Nvlink Core, major device number 242