Hi,
I just wanted to set up our new graphics workstation and ran into this problem: whenever I connect my Canon 4K500ST projector to the card, I get frequent driver crashes. Due to the high resolution of the projector (4096x2400), it connects via 4 DVI cables with a resolution of 1024x2400 each. As the 4090 only has 4 outlets, I connect an LCD display to the HDMI port, 2 DP2DVI adapters, and one DP2HDMI MST splitter (DeLock 87769) with two HDMI2DVI adapters. When I start Xorg with only an xterm and use xrandr to change the output from HDMI to the projector, I often get a kernel driver crash. Although the system continues to run, I can only restart the driver with a reboot, as unloading/reloading the driver is not possible. As I thought it had to do with the 5 connected displays and only 4 possible outlets at a time, I disconnected the LCD and had only the projector connected. But in this configuration the driver crashes immediately. Running “nvidia-bug-report.sh” after the crash fails, as it hangs while executing “cat /proc/driver/nvidia/gpus/0000:01:00.0/power”. I attach a bug report from a freshly booted system and one from after the crash, generated with “sudo nvidia-bug-report.sh --safe-mode --extra-system-data”. The following messages are visible in the journal:
[ 255.406537] BUG: unable to handle page fault for address: 00000000000016c8
[ 255.406548] #PF: supervisor read access in kernel mode
[ 255.406552] #PF: error_code(0x0000) - not-present page
[ 255.406556] PGD 0 P4D 0
[ 255.406561] Oops: 0000 [#1] SMP NOPTI
[ 255.406567] CPU: 10 PID: 590 Comm: nvidia-modeset/ Tainted: P OE 5.15.0-88-generic #98-Ubuntu
[ 255.406572] Hardware name: Dell Inc. Precision 3660/0PRR48, BIOS 2.6.1 06/14/2023
[ 255.406575] RIP: 0010:_nv018585rm+0x18d/0x310 [nvidia]
[ 255.406973] Code: ff e8 e7 16 fc ff 48 89 c7 48 c7 c6 20 49 68 c4 e8 a8 30 63 00 48 8b 4d 00 44 8b 91 cc 16 00 00 41 83 fa 02 0f 84 33 01 00 00 <44> 8b 80 c8 16 00 00 41 ba 01 00 00 00 44 0f b6 4d 1d 41 8d 54 24
[ 255.406974] RSP: 0018:ffffa4b1c0d87b90 EFLAGS: 00010293
[ 255.406975] RAX: 0000000000000000 RBX: ffff9490b0417008 RCX: ffff9490b04b4008
[ 255.406975] RDX: 0000000000000009 RSI: 0000000000272cdd RDI: 0000000000000000
[ 255.406976] RBP: ffff9490930e5b80 R08: 0000000000000000 R09: 0000000000000002
[ 255.406977] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000003
[ 255.406977] R13: 0000000000000000 R14: ffff9490b04b4010 R15: ffffa4b1c2271008
[ 255.406978] FS: 0000000000000000(0000) GS:ffff949fcf680000(0000) knlGS:0000000000000000
[ 255.406979] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 255.406979] CR2: 00000000000016c8 CR3: 000000012a7fa000 CR4: 0000000000750ee0
[ 255.406980] PKRU: 55555554
[ 255.406980] Call Trace:
[ 255.406981] <TASK>
[ 255.406983] ? show_trace_log_lvl+0x1d6/0x2ea
[ 255.406986] ? show_trace_log_lvl+0x1d6/0x2ea
[ 255.406987] ? show_regs.part.0+0x23/0x29
[ 255.406988] ? __die_body.cold+0x8/0xd
[ 255.406989] ? __die+0x2b/0x37
[ 255.406991] ? page_fault_oops+0x13b/0x170
[ 255.406993] ? do_user_addr_fault+0x321/0x670
[ 255.406994] ? exc_page_fault+0x77/0x170
[ 255.406996] ? asm_exc_page_fault+0x27/0x30
[ 255.406997] ? _nv018585rm+0x18d/0x310 [nvidia]
[ 255.407176] ? _nv018477rm+0x4b/0xf0 [nvidia]
[ 255.407356] ? _nv020951rm+0xd9e/0x1748 [nvidia]
[ 255.407531] ? _nv043246rm+0x1a9/0x1b0 [nvidia]
[ 255.407699] ? _nv020668rm+0xd9/0x160 [nvidia]
[ 255.407868] ? _nv045201rm+0x1f1/0x300 [nvidia]
[ 255.408038] ? _nv013229rm+0x335/0x630 [nvidia]
[ 255.408166] ? _nv043390rm+0x69/0xd0 [nvidia]
[ 255.408290] ? _nv011754rm+0x86/0xa0 [nvidia]
[ 255.408412] ? _nv000578rm+0x5e/0x70 [nvidia]
[ 255.408534] ? rm_kernel_rmapi_op+0x127/0x213 [nvidia]
[ 255.408669] ? nvidia_modeset_rm_ops_alloc_stack+0x3c/0x60 [nvidia]
[ 255.408780] ? nvkms_call_rm+0x4d/0x80 [nvidia_modeset]
[ 255.408792] ? _nv002699kms+0x42/0x50 [nvidia_modeset]
[ 255.408804] ? _nv001280kms+0x1a8/0x350 [nvidia_modeset]
[ 255.408821] ? _nv001556kms+0xce/0xe0 [nvidia_modeset]
[ 255.408834] ? _nv001656kms+0x30/0x80 [nvidia_modeset]
[ 255.408847] ? nvkms_kthread_q_callback+0xc3/0x160 [nvidia_modeset]
[ 255.408855] ? _main_loop+0x89/0x140 [nvidia_modeset]
[ 255.408863] ? nvkms_sema_up+0x20/0x20 [nvidia_modeset]
[ 255.408870] ? kthread+0x127/0x150
[ 255.408873] ? set_kthread_struct+0x50/0x50
[ 255.408874] ? ret_from_fork+0x1f/0x30
[ 255.408876] </TASK>
[ 255.408876] Modules linked in: snd_ctl_led snd_hda_codec_realtek snd_hda_codec_generic rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs lockd grace fscache netfs sunrpc binfmt_misc nls_iso8859_1 intel_rapl_msr snd_sof_pci_intel_tgl intel_rapl_common i915 snd_sof_intel_hda_common soundwire_intel soundwire_generic_allocation soundwire_cadence snd_sof_intel_hda snd_sof_pci snd_sof_xtensa_dsp snd_sof snd_soc_hdac_hda snd_hda_ext_core snd_soc_acpi_intel_match snd_soc_acpi soundwire_bus snd_hda_codec_hdmi x86_pkg_temp_thermal intel_powerclamp snd_soc_core coretemp snd_compress ttm ac97_bus snd_pcm_dmaengine i2c_algo_bit snd_hda_intel snd_intel_dspcfg snd_intel_sdw_acpi kvm_intel snd_hda_codec dell_wmi snd_hda_core ledtrig_audio snd_hwdep dell_smbios mei_hdcp snd_pcm dell_wmi_sysman dcdbas kvm joydev input_leds firmware_attributes_class dell_wmi_descriptor wmi_bmof snd_timer mei_me snd mei soundcore mac_hid int3403_thermal int340x_thermal_zone intel_hid nvidia_uvm(POE) acpi_pad acpi_tad sparse_keymap
[ 255.408900] sch_fq_codel dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua msr efi_pstore ip_tables x_tables autofs4 btrfs blake2b_generic zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear hid_generic usbhid hid nvidia_drm(POE) nvidia_modeset(POE) nvidia(POE) drm_kms_helper syscopyarea sysfillrect sysimgblt crct10dif_pclmul crc32_pclmul fb_sys_fops ghash_clmulni_intel cec rtsx_pci_sdmmc ucsi_acpi intel_lpss_pci aesni_intel rc_core intel_lpss nvme crypto_simd ahci typec_ucsi i2c_i801 xhci_pci cryptd drm e1000e nvme_core i2c_smbus libahci rtsx_pci idma64 typec xhci_pci_renesas wmi video pinctrl_alderlake
[ 255.408921] CR2: 00000000000016c8
[ 255.408923] ---[ end trace 7159c33c549d8c4d ]---
[ 256.000371] RIP: 0010:_nv018585rm+0x18d/0x310 [nvidia]
[ 256.000612] Code: ff e8 e7 16 fc ff 48 89 c7 48 c7 c6 20 49 68 c4 e8 a8 30 63 00 48 8b 4d 00 44 8b 91 cc 16 00 00 41 83 fa 02 0f 84 33 01 00 00 <44> 8b 80 c8 16 00 00 41 ba 01 00 00 00 44 0f b6 4d 1d 41 8d 54 24
[ 256.000612] RSP: 0018:ffffa4b1c0d87b90 EFLAGS: 00010293
[ 256.000614] RAX: 0000000000000000 RBX: ffff9490b0417008 RCX: ffff9490b04b4008
[ 256.000615] RDX: 0000000000000009 RSI: 0000000000272cdd RDI: 0000000000000000
[ 256.000615] RBP: ffff9490930e5b80 R08: 0000000000000000 R09: 0000000000000002
[ 256.000616] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000003
[ 256.000616] R13: 0000000000000000 R14: ffff9490b04b4010 R15: ffffa4b1c2271008
[ 256.000617] FS: 0000000000000000(0000) GS:ffff949fcf680000(0000) knlGS:0000000000000000
[ 256.000618] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 256.000618] CR2: 00000000000016c8 CR3: 000000012a7fa000 CR4: 0000000000750ee0
[ 256.000619] PKRU: 55555554
Any help is much appreciated!
nvidia-bug-report-after-reboot.log.gz (374.2 KB)
nvidia-bug-report-after-crash.log.gz (131.4 KB)
Regards,
HellG