Kernel NULL pointer dereference when using the 580.142 driver

I just started SDDM then the driver crashed.
GPU: GTX 1060
Driver: 580.142

This bug is 100% reproducible. Rolling back to the 580.126.18 driver is the only way to make my PC usable.

[ 2164.202115] BUG: kernel NULL pointer dereference, address: 0000000000000028
[ 2164.202122] #PF: supervisor read access in kernel mode
[ 2164.202124] #PF: error_code(0x0000) - not-present page
[ 2164.202126] PGD 8000000157eca067 P4D 8000000157eca067 PUD 0 
[ 2164.202131] Oops: Oops: 0000 [#1] SMP PTI
[ 2164.202135] CPU: 5 UID: 962 PID: 168275 Comm: kwin_wayland Tainted: P S         OE       6.19.6-arch1-1 #1 PREEMPT(full)  a70f585a3574c37bff18875a6cf7bd8652b4cbca
[ 2164.202140] Tainted: [P]=PROPRIETARY_MODULE, [S]=CPU_OUT_OF_SPEC, [O]=OOT_MODULE, [E]=UNSIGNED_MODULE
[ 2164.202142] Hardware name: Micro-Star International Co., Ltd. GT70 2OC/2OD/MS-1763, BIOS E1763IMS.11D 04/17/2015
[ 2164.202144] RIP: 0010:_nv000582kms+0x4/0x10 [nvidia_modeset]
[ 2164.202188] Code: 87 50 01 00 00 20 2a 2d c1 48 c7 87 38 01 00 00 70 28 2d c1 48 c7 87 40 01 00 00 40 2c 2d c1 c3 66 0f 1f 44 00 00 f3 0f 1e fa <0f> b6 47 28 c3 0f 1f 80 00 00 00 00 89 c9 48 8d 04 49 48 c1 e0 04
[ 2164.202191] RSP: 0018:ffffd28280d47918 EFLAGS: 00010206
[ 2164.202193] RAX: ffffffffc12d3060 RBX: ffff89f72c60a100 RCX: 0000040000000000
[ 2164.202196] RDX: 0000000000000000 RSI: ffff89f7670af478 RDI: 0000000000000000
[ 2164.202197] RBP: ffffd28280d47948 R08: ffff89f4f06539e8 R09: 0000000000000000
[ 2164.202199] R10: ffff89f7670af4a0 R11: ffffd28280d47888 R12: 0000000000000000
[ 2164.202201] R13: ffff89f72c60a108 R14: 0000000000000000 R15: 0000000000000000
[ 2164.202203] FS:  00007f44f6bc8b80(0000) GS:ffff89f824c04000(0000) knlGS:0000000000000000
[ 2164.202205] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2164.202207] CR2: 0000000000000028 CR3: 0000000107c70001 CR4: 00000000001726f0
[ 2164.202210] Call Trace:
[ 2164.202213]  <TASK>
[ 2164.202215]  nv_drm_framebuffer_create+0x20c/0x450 [nvidia_drm 27a5a7be672e8c5f48b0015257b43f2ed0c4d9aa]
[ 2164.202230]  drm_internal_framebuffer_create+0x426/0x5a0
[ 2164.202235]  ? __pfx_drm_mode_addfb2_ioctl+0x10/0x10
[ 2164.202238]  drm_mode_addfb2+0x45/0x110
[ 2164.202241]  ? drm_dev_enter+0x1d/0x60
[ 2164.202245]  drm_ioctl_kernel+0xae/0x100
[ 2164.202250]  drm_ioctl+0x29b/0x520
[ 2164.202254]  ? __pfx_drm_mode_addfb2_ioctl+0x10/0x10
[ 2164.202258]  __x64_sys_ioctl+0x97/0xe0
[ 2164.202263]  do_syscall_64+0x81/0x610
[ 2164.202268]  ? drm_ioctl_kernel+0xae/0x100
[ 2164.202272]  ? __check_object_size+0x44/0x230
[ 2164.202276]  ? _copy_to_user+0x31/0x40
[ 2164.202279]  ? drm_ioctl+0x2d0/0x520
[ 2164.202283]  ? __pfx_drm_prime_fd_to_handle_ioctl+0x10/0x10
[ 2164.202287]  ? __x64_sys_ioctl+0xb1/0xe0
[ 2164.202291]  ? do_syscall_64+0x81/0x610
[ 2164.202294]  ? __task_pid_nr_ns+0x5f/0xc0
[ 2164.202299]  ? __do_sys_getpid+0x1d/0x30
[ 2164.202302]  ? do_syscall_64+0x81/0x610
[ 2164.202306]  ? __pfx_i915_gem_set_tiling_ioctl+0x10/0x10 [i915 d2a6b38cb26fc335cd6038231e519f742003ff93]
[ 2164.202483]  ? __x64_sys_ioctl+0xb1/0xe0
[ 2164.202487]  ? do_syscall_64+0x81/0x610
[ 2164.202491]  ? exc_page_fault+0x7e/0x1a0
[ 2164.202494]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 2164.202498] RIP: 0033:0x7f44fd72504d
[ 2164.202514] Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 00 00 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1a 48 8b 45 c8 64 48 2b 04 25 28 00 00 00
[ 2164.202516] RSP: 002b:00007ffd6629af30 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[ 2164.202518] RAX: ffffffffffffffda RBX: 00007ffd6629b114 RCX: 00007f44fd72504d
[ 2164.202520] RDX: 00007ffd6629afc0 RSI: 00000000c06864b8 RDI: 0000000000000016
[ 2164.202521] RBP: 00007ffd6629af80 R08: 00007ffd6629b170 R09: 0000561362543e68
[ 2164.202522] R10: 00007f4500ac02c3 R11: 0000000000000246 R12: 00007ffd6629afc0
[ 2164.202523] R13: 00000000c06864b8 R14: 0000000000000016 R15: 0000561362543e30
[ 2164.202525]  </TASK>
[ 2164.202526] Modules linked in: nvidia_uvm(POE) nvidia_drm(POE) nvidia_modeset(POE) nvidia(POE) rfcomm snd_seq_dummy snd_hrtimer snd_seq snd_seq_device uhid cmac algif_hash algif_skcipher af_alg bnep intel_rapl_msr intel_rapl_common x86_pkg_temp_thermal intel_powerclamp coretemp dm_zero iwlmvm snd_hda_codec_alc662 kvm_intel snd_hda_codec_realtek_lib snd_hda_codec_nvhdmi snd_hda_codec_generic snd_hda_codec_hdmi kvm mac80211 snd_hda_intel btusb ptp at24 irqbypass btmtk pps_core snd_hda_codec libarc4 rapl btrtl vfat intel_cstate btbcm snd_hda_core ntfs3 fat mei_hdcp mei_pxp iwlwifi spi_nor iTCO_wdt i2c_i801 snd_intel_dspcfg btintel intel_uncore msi_wmi i2c_smbus intel_oc_wdt intel_pmc_bxt snd_intel_sdw_acpi psmouse pcspkr i2c_mux mtd sparse_keymap iTCO_vendor_support snd_hwdep cfg80211 bluetooth snd_pcm alx snd_timer mei_me mdio rfkill snd mei soundcore drm_ttm_helper mousedev joydev mac_hid ntsync i2c_dev ec_sys crypto_user nfnetlink dm_crypt encrypted_keys trusted asn1_encoder tee dm_mod raid0 md_mod rtsx_pci_sdmmc
[ 2164.202581]  spi_intel_platform mmc_core spi_intel ghash_clmulni_intel aesni_intel sr_mod serio_raw cdrom hid_gt683r rtsx_pci lpc_ich i915 i2c_algo_bit mxm_wmi drm_buddy video wmi ttm intel_gtt drm_display_helper cec
[ 2164.202592] Unloaded tainted modules: nvidia(POE):1 nvidia_uvm(POE):1 nvidia_modeset(POE):1 nvidia_drm(POE):1 [last unloaded: nvidia(POE)]
[ 2164.202598] CR2: 0000000000000028
[ 2164.202609] ---[ end trace 0000000000000000 ]---
[ 2164.202612] RIP: 0010:_nv000582kms+0x4/0x10 [nvidia_modeset]
[ 2164.202651] Code: 87 50 01 00 00 20 2a 2d c1 48 c7 87 38 01 00 00 70 28 2d c1 48 c7 87 40 01 00 00 40 2c 2d c1 c3 66 0f 1f 44 00 00 f3 0f 1e fa <0f> b6 47 28 c3 0f 1f 80 00 00 00 00 89 c9 48 8d 04 49 48 c1 e0 04
[ 2164.202653] RSP: 0018:ffffd28280d47918 EFLAGS: 00010206
[ 2164.202656] RAX: ffffffffc12d3060 RBX: ffff89f72c60a100 RCX: 0000040000000000
[ 2164.202658] RDX: 0000000000000000 RSI: ffff89f7670af478 RDI: 0000000000000000
[ 2164.202660] RBP: ffffd28280d47948 R08: ffff89f4f06539e8 R09: 0000000000000000
[ 2164.202662] R10: ffff89f7670af4a0 R11: ffffd28280d47888 R12: 0000000000000000
[ 2164.202663] R13: ffff89f72c60a108 R14: 0000000000000000 R15: 0000000000000000
[ 2164.202665] FS:  00007f44f6bc8b80(0000) GS:ffff89f824c04000(0000) knlGS:0000000000000000
[ 2164.202667] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2164.202669] CR2: 0000000000000028 CR3: 0000000107c70001 CR4: 00000000001726f0

nvidia-bug-report.log.gz (580.8 KB)

Bug ID 5983006 has been filed for this issue. I will check and get back to you.

I can see a few traces of a driver mismatch. Can you clean all traces of the older driver 580.126.18 and reinstall the latest driver again? I will wait for your response.

I unloaded the old driver (580.126.18), installed the new driver (580.142) with the package manager then loaded the new driver. This is why you see traces of the old driver, but even in that case, unloading a driver shouldn’t corrupt the kernel state.
I tested again, booting the PC with only the new driver, and the kernel crashes with the same error as soon as SDDM starts. SDDM uses kwin_wayland as its compositor.
nvidia-bug-report.log.gz (568.3 KB)

I reverted nvidia-drm-fb.c by applying the following patch and the driver didn’t crash anymore.

diff --git a/nvidia-drm/nvidia-drm-fb.c b/nvidia-drm/nvidia-drm-fb.c
index 05506435..d6f38ee8 100644
--- a/nvidia-drm/nvidia-drm-fb.c
+++ b/nvidia-drm/nvidia-drm-fb.c
@@ -141,7 +141,6 @@ static int nv_drm_framebuffer_init(struct drm_device *dev,
     struct NvKmsKapiCreateSurfaceParams params = { };
     struct nv_drm_gem_object *nv_gem;
     struct drm_framebuffer *fb = &nv_fb->base;
-    bool non_scanout_mem_backed = false;
     uint32_t i;
     int ret;
 
@@ -163,10 +162,6 @@ static int nv_drm_framebuffer_init(struct drm_device *dev,
             params.planes[i].memory = nv_gem->pMemory;
             params.planes[i].offset = fb->offsets[i];
             params.planes[i].pitch = fb->pitches[i];
-
-            if (!nvKms->isVidmem(nv_gem->pMemory) && nv_dev->hasVideoMemory) {
-                non_scanout_mem_backed = true;
-            }
         }
     }
     params.height = fb->height;
@@ -233,15 +228,10 @@ static int nv_drm_framebuffer_init(struct drm_device *dev,
 
     /* Create NvKmsKapiSurface */
 
-    if (non_scanout_mem_backed) {
-        /* Do not register drm_framebuffer against nvkms */
-        nv_fb->pSurface = NULL;
-    } else {
-        nv_fb->pSurface = nvKms->createSurface(nv_dev->pDevice, &params);
-        if (nv_fb->pSurface == NULL) {
-            NV_DRM_DEV_DEBUG_DRIVER(nv_dev, "Failed to create NvKmsKapiSurface");
-            goto fail;
-        }
+    nv_fb->pSurface = nvKms->createSurface(nv_dev->pDevice, &params);
+    if (nv_fb->pSurface == NULL) {
+        NV_DRM_DEV_DEBUG_DRIVER(nv_dev, "Failed to create NvKmsKapiSurface");
+        goto fail;
     }
 
     return 0;

I have not looked closely at the code but, if I were to take a guess, when non_scanout_mem_backed is true, the function returns 0 rather than -EINVAL (without goto fail), and a user of this function may be expecting that pSurface is not NULL if it returned 0.

Thanks, we have successfully reproduced this issue locally here at NVIDIA Labs; our engineers are taking a look.

This revert wasn’t applied in nvidia-580.159.03, is that correct?

I haven’t tried it myself, but one of our users has confirmed that this issue still happens, and the file has had no changes since 580.142. So, it is not fixed yet.

Both 595.58.03 and 595.71.05 are also affected by the issue.