I am getting a persistent GPU error on boot, followed by intermittent errors after boot (specifically when loading GPU-intensive apps such as the Hello AI World examples). I am running L4T 32.4.2. Any ideas what could be causing this? The kernel log output is below:
[ 398.409779] nvgpu: 57000000.gpu nvgpu_set_error_notifier_locked:137 [ERR] error notifier set to 8 for ch 502
[ 398.419964] nvgpu: 57000000.gpu gk20a_fifo_handle_sched_error:2531 [ERR] fifo sched ctxsw timeout error: engine=0, tsg=6, ms=3100
[ 398.432169] ---- mlocks ----
[ 398.436556] ---- syncpts ----
[ 398.439532] id 1 (disp0_a) min 4 max 4 refs 1 (previous client : )
[ 398.445763] id 2 (disp0_b) min 3 max 3 refs 1 (previous client : )
[ 398.451960] id 3 (disp0_c) min 3 max 3 refs 1 (previous client : )
[ 398.458160] id 7 (gm20b_507) min 27914 max 27914 refs 1 (previous client : )
[ 398.465226] id 8 (gm20b_506) min 17022 max 17022 refs 1 (previous client : )
[ 398.472287] id 9 (gm20b_505) min 2 max 2 refs 1 (previous client : )
[ 398.478651] id 11 (gm20b_504) min 2 max 2 refs 1 (previous client : )
[ 398.485189] id 12 (gm20b_503) min 54 max 54 refs 1 (previous client : )
[ 398.491812] id 13 (gm20b_502) min 6902 max 6908 refs 1 (previous client : )
[ 398.498781] id 14 (gm20b_501) min 4 max 4 refs 1 (previous client : )
[ 398.505226] id 26 (vblank0) min 717 max -6 refs 1 (previous client : )
[ 398.513320] ---- channels ----
[ 398.516379]
channel 0 - 54340000.vic
[ 398.522910] 0-54340000.vic (0):
[ 398.525966] inactive
[ 398.529633]
---- host general irq ----
[ 398.536333] sync_hintmask_ext = 0xc0000000
[ 398.540440] sync_hintmask = 0x80000000
[ 398.544191] sync_intc0mask = 0x00000001
[ 398.548029] sync_intmask = 0x00000011
[ 398.551692]
---- host syncpt irq mask ----
[ 398.558739] syncpt_thresh_int_mask(0) = 0x04000001
[ 398.563528] syncpt_thresh_int_mask(1) = 0x00000000
[ 398.568319] syncpt_thresh_int_mask(2) = 0x00000000
[ 398.573123] syncpt_thresh_int_mask(3) = 0x00000000
[ 398.577914] syncpt_thresh_int_mask(4) = 0x00000000
[ 398.582704] syncpt_thresh_int_mask(5) = 0x00000000
[ 398.587495] syncpt_thresh_int_mask(6) = 0x00000000
[ 398.592284] syncpt_thresh_int_mask(7) = 0x00000000
[ 398.597071] syncpt_thresh_int_mask(8) = 0x00000000
[ 398.601860] syncpt_thresh_int_mask(9) = 0x00000000
[ 398.606653] syncpt_thresh_int_mask(10) = 0x00000000
[ 398.611529] syncpt_thresh_int_mask(11) = 0x00000000
[ 398.616407]
---- host syncpt irq status ----
[ 398.623627] syncpt_thresh_cpu0_int_status(0) = 0x00000000
[ 398.629030] syncpt_thresh_cpu0_int_status(1) = 0x00000000
[ 398.634427] syncpt_thresh_cpu0_int_status(2) = 0x00000000
[ 398.639826] syncpt_thresh_cpu0_int_status(3) = 0x00000000
[ 398.645223] syncpt_thresh_cpu0_int_status(4) = 0x00000000
[ 398.650620] syncpt_thresh_cpu0_int_status(5) = 0x00000000
[ 398.656014]
---- host syncpt thresh ----
[ 398.662889] syncpt_int_thresh_thresh_0(0) = 1
[ 398.667250] syncpt_int_thresh_thresh_0(13) = 6904
[ 398.672027] gm20b pbdma 0:
[ 398.674639] id: 6 (tsg), next_id: 6 (tsg) chan status: valid
[ 398.680303] PBDMA_PUT: 0000001e00008678 PBDMA_GET: 0000001e00008600 GP_PUT: 00000074 GP_GET: 00000072 FETCH: 00000074 HEADER: 60030100
HDR: 60116040 SHADOW0: 000085e4 SHADOW1: 0000941e
[ 398.698113] gm20b eng 0:
[ 398.700563] id: 0 (channel), next_id: 6 (tsg), ctx status: load
[ 398.706567] busy
[ 398.708508] gm20b eng 1:
[ 398.710946] id: 0 (channel), next_id: 0 (channel), ctx status: invalid
[ 398.720729] 501-gm20b, pid 11802, refs 2:
[ 398.724659] channel status: in use idle not busy
[ 398.729364] RAMFC : TOP: 8000001f0000c030 PUT: 0000001f0000c030 GET: 0000001f0000c030 FETCH: 0000001f0000c030
HEADER: 60400000 COUNT: 80000000
SYNCPOINT 00000000 00000e01 SEMAPHORE 00000000 00000000 00000000 00000000
[ 398.752820] 502-gm20b, pid 11802, refs 8:
[ 398.756746] channel status: in use on_pbdma_and_eng busy
[ 398.762147] RAMFC : TOP: 8000001f00006f60 PUT: 0000001f00006f60 GET: 0000001f00006f60 FETCH: 0000001f00006f60
HEADER: 60400000 COUNT: 80000000
SYNCPOINT 00000000 00000d01 SEMAPHORE 0000000d fc001000 05e82179 01100002
[ 398.785590] 503-gm20b, pid 9069, refs 2:
[ 398.789425] channel status: in use idle not busy
[ 398.794131] RAMFC : TOP: 8000001f00280288 PUT: 0000001f00280288 GET: 0000001f00280288 FETCH: 0000001f00280288
HEADER: 60400000 COUNT: 80000000
SYNCPOINT 00000000 00000c01 SEMAPHORE 00000000 00000000 00000000 00000000
[ 398.817570] 504-gm20b, pid 9069, refs 2:
[ 398.821406] channel status: in use idle not busy
[ 398.826109] RAMFC : TOP: 8000001f00240018 PUT: 0000001f00240018 GET: 0000001f00240018 FETCH: 0000001f00240018
HEADER: 60400000 COUNT: 80000000
SYNCPOINT 00000000 00000b01 SEMAPHORE 00000000 00000000 00000000 00000000
[ 398.849564] 505-gm20b, pid 9069, refs 2:
[ 398.853400] channel status: in use idle not busy
[ 398.858106] RAMFC : TOP: 8000001f00200018 PUT: 0000001f00200018 GET: 0000001f00200018 FETCH: 0000001f00200018
HEADER: 60400000 COUNT: 80000000
SYNCPOINT 00000000 00000901 SEMAPHORE 00000000 00000000 00000000 00000000
[ 398.881544] 506-gm20b, pid 9069, refs 2:
[ 398.885382] channel status: in use idle not busy
[ 398.890086] RAMFC : TOP: 8000001f00171de8 PUT: 0000001f00171de8 GET: 0000001f00171de8 FETCH: 0000001f00171de8
HEADER: 60400000 COUNT: 80000000
SYNCPOINT 00000000 00000801 SEMAPHORE 00000001 0002fff0 0000367d 00000004
[ 398.913527] 507-gm20b, pid 9069, refs 2:
[ 398.917360] channel status: in use idle not busy
[ 398.922065] RAMFC : TOP: 8000001f00051c88 PUT: 0000001f00051c88 GET: 0000001f00051c88 FETCH: 0000001f00051c88
HEADER: 60400000 COUNT: 80000000
SYNCPOINT 00000000 00000701 SEMAPHORE 00000001 0002ffb0 00000003 00001004
[ 398.945506] 508-gm20b, pid 3989, refs 2:
[ 398.949341] channel status: in use idle not busy
[ 398.954044] RAMFC : TOP: 0000000000000000 PUT: 0000000000000000 GET: 0000000000000000 FETCH: 0000000000000000
HEADER: 60400000 COUNT: 00000000
SYNCPOINT 00000000 00000000 SEMAPHORE 00000000 00000000 00000000 00000000
[ 398.977496] 509-gm20b, pid 3989, refs 2:
[ 398.981331] channel status: in use idle not busy
[ 398.986035] RAMFC : TOP: 0000000000000000 PUT: 0000000000000000 GET: 0000000000000000 FETCH: 0000000000000000
HEADER: 60400000 COUNT: 00000000
SYNCPOINT 00000000 00000000 SEMAPHORE 00000000 00000000 00000000 00000000
[ 399.009476] 510-gm20b, pid 3989, refs 2:
[ 399.013313] channel status: in use idle not busy
[ 399.018017] RAMFC : TOP: 0000000000000000 PUT: 0000000000000000 GET: 0000000000000000 FETCH: 0000000000000000
HEADER: 60400000 COUNT: 00000000
SYNCPOINT 00000000 00000000 SEMAPHORE 00000000 00000000 00000000 00000000
[ 399.041462] 511-gm20b, pid 3989, refs 2:
[ 399.045297] channel status: in use idle not busy
[ 399.050001] RAMFC : TOP: 0000000000000000 PUT: 0000000000000000 GET: 0000000000000000 FETCH: 0000000000000000
HEADER: 60400000 COUNT: 00000000
SYNCPOINT 00000000 00000000 SEMAPHORE 00000000 00000000 00000000 00000000
[ 399.073687] nvgpu: 57000000.gpu gk20a_fifo_handle_mmu_fault_locked:1721 [ERR] fake mmu fault on engine 0, engine subid 0 (gpc), client 0 (l1 0), addr 0x547b119000, type 4 (unbound inst), access_type 0x00000001,inst_ptr 0x20fc8000
[ 399.094101] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:129 [ERR] gr_fecs_os_r : 0
[ 399.102708] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:131 [ERR] gr_fecs_cpuctl_r : 0x40
[ 399.111930] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:133 [ERR] gr_fecs_idlestate_r : 0x1
[ 399.121320] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:135 [ERR] gr_fecs_mailbox0_r : 0x0
[ 399.130624] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:137 [ERR] gr_fecs_mailbox1_r : 0x0
[ 399.139921] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:139 [ERR] gr_fecs_irqstat_r : 0x0
[ 399.149133] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:141 [ERR] gr_fecs_irqmode_r : 0x4
[ 399.158343] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:143 [ERR] gr_fecs_irqmask_r : 0x8704
[ 399.167817] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:145 [ERR] gr_fecs_irqdest_r : 0x0
[ 399.177026] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:147 [ERR] gr_fecs_debug1_r : 0x40
[ 399.186239] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:149 [ERR] gr_fecs_debuginfo_r : 0x0
[ 399.195624] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:151 [ERR] gr_fecs_ctxsw_status_1_r : 0x304
[ 399.205615] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:155 [ERR] gr_fecs_ctxsw_mailbox_r(0) : 0x1
[ 399.215605] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:155 [ERR] gr_fecs_ctxsw_mailbox_r(1) : 0x0
[ 399.225599] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:155 [ERR] gr_fecs_ctxsw_mailbox_r(2) : 0x90009
[ 399.235944] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:155 [ERR] gr_fecs_ctxsw_mailbox_r(3) : 0x14000
[ 399.246287] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:155 [ERR] gr_fecs_ctxsw_mailbox_r(4) : 0x1ffda0
[ 399.256710] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:155 [ERR] gr_fecs_ctxsw_mailbox_r(5) : 0x0
[ 399.266702] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:155 [ERR] gr_fecs_ctxsw_mailbox_r(6) : 0x0
[ 399.276691] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:155 [ERR] gr_fecs_ctxsw_mailbox_r(7) : 0x0
[ 399.286684] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:155 [ERR] gr_fecs_ctxsw_mailbox_r(8) : 0x0
[ 399.296673] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:155 [ERR] gr_fecs_ctxsw_mailbox_r(9) : 0x0
[ 399.306667] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:155 [ERR] gr_fecs_ctxsw_mailbox_r(10) : 0x0
[ 399.316745] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:155 [ERR] gr_fecs_ctxsw_mailbox_r(11) : 0x0
[ 399.326826] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:155 [ERR] gr_fecs_ctxsw_mailbox_r(12) : 0x0
[ 399.336902] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:155 [ERR] gr_fecs_ctxsw_mailbox_r(13) : 0x0
[ 399.346981] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:155 [ERR] gr_fecs_ctxsw_mailbox_r(14) : 0x0
[ 399.357058] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:155 [ERR] gr_fecs_ctxsw_mailbox_r(15) : 0x0
[ 399.367136] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:159 [ERR] gr_fecs_engctl_r : 0x0
[ 399.376269] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:161 [ERR] gr_fecs_curctx_r : 0x0
[ 399.385396] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:163 [ERR] gr_fecs_nxtctx_r : 0x0
[ 399.394519] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:169 [ERR] FECS_FALCON_REG_IMB : 0xbadfbadf
[ 399.404521] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:175 [ERR] FECS_FALCON_REG_DMB : 0xbadfbadf
[ 399.414519] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:181 [ERR] FECS_FALCON_REG_CSW : 0xbadfbadf
[ 399.424517] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:187 [ERR] FECS_FALCON_REG_CTX : 0xbadfbadf
[ 399.434511] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:193 [ERR] FECS_FALCON_REG_EXCI : 0xbadfbadf
[ 399.444597] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:200 [ERR] FECS_FALCON_REG_PC : 0xbadfbadf
[ 399.454506] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:206 [ERR] FECS_FALCON_REG_SP : 0xbadfbadf
[ 399.464414] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:200 [ERR] FECS_FALCON_REG_PC : 0xbadfbadf
[ 399.474321] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:206 [ERR] FECS_FALCON_REG_SP : 0xbadfbadf
[ 399.484230] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:200 [ERR] FECS_FALCON_REG_PC : 0xbadfbadf
[ 399.494138] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:206 [ERR] FECS_FALCON_REG_SP : 0xbadfbadf
[ 399.504046] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:200 [ERR] FECS_FALCON_REG_PC : 0xbadfbadf
[ 399.513952] nvgpu: 57000000.gpu gk20a_fecs_dump_falcon_stats:206 [ERR] FECS_FALCON_REG_SP : 0xbadfbadf
[ 399.523862] nvgpu: 57000000.gpu gk20a_fifo_handle_mmu_fault_locked:1726 [ERR] gr_status_r : 0x81
[ 399.533597] nvgpu: 57000000.gpu fifo_error_isr:2605 [ERR] channel reset initiated from fifo_error_isr; intr=0x00000100