Crash with cuda and nvidia 450

Hi

I have been running deep learning workloads using CUDA with the NVIDIA 450 driver. I use 10 CPU cores and I get a hard crash. I am attaching the nvidia-bug-report log.

nvidia-bug-report_log.txt (2.5 MB)

I get this error message:
[17841.563190] CPU: 6 PID: 121561 Comm: kworker/u40:0 Tainted: P B W OEL 5.7.7-200.fc32.x86_64 #1
[17841.563190] Hardware name: Micro-Star International Co., Ltd. MS-7A93/X299 SLI PLUS (MS-7A93), BIOS 1.H0 05/29/2020
[17841.563196] Workqueue: writeback wb_workfn (flush-259:0)
[17841.563201] RIP: 0010:xas_find_marked+0x1c8/0x2f0
[17841.563203] Code: b6 42 01 83 c0 01 88 47 12 48 8b 52 08 48 89 57 18 48 85 d2 74 b0 49 39 f0 0f 82 d9 00 00 00 3c 40 74 dc 0f b6 c0 4e 8d 2c 1a <83> c0 05 48 8d 04 c2 48 8b 00 48 89 c1 83 e1 03 48 83 f9 02 75 08
[17841.563203] RSP: 0018:ffffaf0888413890 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13
[17841.563204] RAX: 0000000000000013 RBX: ffffffffffffffff RCX: 0000000000000014
[17841.563204] RDX: ffff9bab85e9cb68 RSI: 0000000000002d93 RDI: ffffaf08884138e0
[17841.563205] RBP: ffffffffffffffc0 R08: ffffffffffffffff R09: 0000000000000000
[17841.563205] R10: 0000000000000000 R11: 0000000000000228 R12: 0000000000002d93
[17841.563205] R13: ffff9bab85e9cd90 R14: 0000000000000228 R15: ffffffffffffffff
[17841.563206] FS: 0000000000000000(0000) GS:ffff9bb35fd80000(0000) knlGS:0000000000000000
[17841.563206] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[17841.563207] CR2: 00000ff7e4576008 CR3: 000000093e80a005 CR4: 00000000003606e0
[17841.563208] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[17841.563208] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[17841.563208] Call Trace:
[17841.563215] find_get_pages_range_tag+0x1ce/0x300
[17841.563219] ? __switch_to_asm+0x40/0x70
[17841.563220] ? __switch_to_asm+0x34/0x70
[17841.563221] ? __switch_to_asm+0x40/0x70
[17841.563224] pagevec_lookup_range_tag+0x24/0x30
[17841.563227] mpage_prepare_extent_to_map+0x9b/0x260
[17841.563228] ? __kprobes_text_end+0x181958/0x181958
[17841.563231] ? __switch_to+0x80/0x420
[17841.563232] ? __switch_to_asm+0x34/0x70
[17841.563233] ? __schedule+0x279/0x770
[17841.563234] ? _cond_resched+0x16/0x40
[17841.563235] ? kmem_cache_alloc+0x168/0x220
[17841.563237] ext4_writepages+0x206/0xfe0
[17841.563239] ? xas_load+0x9/0x80
[17841.563240] ? find_get_entry+0xac/0x190
[17841.563241] ? __find_get_block+0xb6/0x2f0
[17841.563243] ? cpumask_next_and+0x1a/0x20
[17841.563246] ? update_sd_lb_stats.constprop.0+0x814/0x8b0
[17841.563248] do_writepages+0x28/0xa0
[17841.563250] __writeback_single_inode+0x3d/0x340
[17841.563251] writeback_sb_inodes+0x202/0x4e0
[17841.563253] __writeback_inodes_wb+0x4c/0xf0
[17841.563254] wb_writeback+0x22e/0x2c0
[17841.563255] ? 0xffffffffba000000
[17841.563256] wb_workfn+0x295/0x460
[17841.563259] process_one_work+0x1b4/0x380
[17841.563260] worker_thread+0x53/0x3e0
[17841.563261] ? process_one_work+0x380/0x380
[17841.563262] kthread+0x115/0x140
[17841.563263] ? __kthread_bind_mask+0x60/0x60
[17841.563264] ret_from_fork+0x1f/0x40
[17853.831673] rcu: INFO: rcu_sched self-detected stall on CPU
[17853.831677] rcu: 6-…: (240003 ticks this GP) idle=afa/1/0x4000000000000002 softirq=1252143/1252143 fqs=56457
[17853.831678] (t=240004 jiffies g=4563109 q=148502)
[17853.831678] NMI backtrace for cpu 6
[17853.831680] CPU: 6 PID: 121561 Comm: kworker/u40:0 Tainted: P B W OEL 5.7.7-200.fc32.x86_64 #1
[17853.831680] Hardware name: Micro-Star International Co., Ltd. MS-7A93/X299 SLI PLUS (MS-7A93), BIOS 1.H0 05/29/2020
[17853.831686] Workqueue: writeback wb_workfn (flush-259:0)
[17853.831687] Call Trace:
[17853.831688]
[17853.831693] dump_stack+0x64/0x88
[17853.831695] nmi_cpu_backtrace.cold+0x14/0x53
[17853.831698] ? lapic_can_unplug_cpu.cold+0x3e/0x3e
[17853.831699] nmi_trigger_cpumask_backtrace+0xd7/0xde
[17853.831701] rcu_dump_cpu_stacks+0xa5/0xd3
[17853.831703] rcu_sched_clock_irq.cold+0x1cb/0x3bf
[17853.831706] ? trigger_load_balance+0x5a/0x210
[17853.831708] update_process_times+0x5b/0x90
[17853.831710] tick_sched_handle+0x22/0x60
[17853.831711] tick_sched_timer+0x38/0x80
[17853.831712] ? tick_sched_do_timer+0x70/0x70
[17853.831713] __hrtimer_run_queues+0x118/0x280
[17853.831714] hrtimer_interrupt+0x10e/0x280
[17853.831717] smp_apic_timer_interrupt+0x6e/0x130
[17853.831718] apic_timer_interrupt+0xf/0x20
[17853.831719]
[17853.831722] RIP: 0010:xas_find_marked+0x1d2/0x2f0
[17853.831722] Code: 8b 52 08 48 89 57 18 48 85 d2 74 b0 49 39 f0 0f 82 d9 00 00 00 3c 40 74 dc 0f b6 c0 4e 8d 2c 1a 83 c0 05 48 8d 04 c2 48 8b 00 <48> 89 c1 83 e1 03 48 83 f9 02 75 08 48 3d fd 00 00 00 76 7e 0f b6
[17853.831723] RSP: 0018:ffffaf0888413890 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff13
[17853.831724] RAX: ffffd996eb57dd00 RBX: ffffffffffffffff RCX: 0000000000000014
[17853.831724] RDX: ffff9bab85e9cb68 RSI: 0000000000002d93 RDI: ffffaf08884138e0
[17853.831725] RBP: ffffffffffffffc0 R08: ffffffffffffffff R09: 0000000000000000
[17853.831725] R10: 0000000000000000 R11: 0000000000000228 R12: 0000000000002d93
[17853.831726] R13: ffff9bab85e9cd90 R14: 0000000000000228 R15: ffffffffffffffff
[17853.831730] find_get_pages_range_tag+0x1ce/0x300
[17853.831733] ? __switch_to_asm+0x40/0x70
[17853.831734] ? __switch_to_asm+0x34/0x70
[17853.831735] ? __switch_to_asm+0x40/0x70
[17853.831738] pagevec_lookup_range_tag+0x24/0x30
[17853.831741] mpage_prepare_extent_to_map+0x9b/0x260
[17853.831742] ? __kprobes_text_end+0x181958/0x181958
[17853.831744] ? __switch_to+0x80/0x420
[17853.831745] ? __switch_to_asm+0x34/0x70
[17853.831746] ? __schedule+0x279/0x770
[17853.831746] ? _cond_resched+0x16/0x40
[17853.831748] ? kmem_cache_alloc+0x168/0x220
[17853.831750] ext4_writepages+0x206/0xfe0
[17853.831751] ? xas_load+0x9/0x80
[17853.831753] ? find_get_entry+0xac/0x190
[17853.831754] ? __find_get_block+0xb6/0x2f0
[17853.831756] ? cpumask_next_and+0x1a/0x20
[17853.831757] ? update_sd_lb_stats.constprop.0+0x814/0x8b0
[17853.831759] do_writepages+0x28/0xa0
[17853.831761] __writeback_single_inode+0x3d/0x340
[17853.831762] writeback_sb_inodes+0x202/0x4e0
[17853.831764] __writeback_inodes_wb+0x4c/0xf0
[17853.831765] wb_writeback+0x22e/0x2c0