Orin NX does not work properly and keeps rebooting

orin nx jp5.1.1 客户自定义板子

一开始设备能正常工作,最近发现会频繁地重启,开机几分钟后就会重启。

Feb  9 15:59:37 user-desktop kernel: [  586.626266] NVRM gpumgrGetSomeGpu: Failed to retrieve pGpu - Too early call!.
Feb  9 15:59:37 user-desktop kernel: [  586.626275] NVRM nvAssertFailedNoLog: Assertion failed: NV_FALSE @ gpu_mgr.c:296
Feb  9 15:59:37 user-desktop kernel: [  586.626290] CPU: 5 PID: 1839 Comm: Xorg Tainted: G           OE     5.10.104-tegra #1
Feb  9 15:59:37 user-desktop kernel: [  586.626293] Hardware name: Unknown NVIDIA Orin NX Developer Kit/NVIDIA Orin NX Developer Kit, BIOS r35.3.1-5e812e4-dirty 11/05/2023
Feb  9 15:59:37 user-desktop kernel: [  586.626297] Call trace:
Feb  9 15:59:37 user-desktop kernel: [  586.626312]  dump_backtrace+0x0/0x1d0
Feb  9 15:59:37 user-desktop kernel: [  586.626319]  show_stack+0x30/0x40
Feb  9 15:59:37 user-desktop kernel: [  586.626328]  dump_stack+0xd8/0x138
Feb  9 15:59:37 user-desktop kernel: [  586.626442]  os_dump_stack+0x18/0x20 [nvidia]
Feb  9 15:59:37 user-desktop kernel: [  586.626546]  tlsEntryGet+0x130/0x138 [nvidia]
Feb  9 15:59:37 user-desktop kernel: [  586.626650]  gpumgrGetSomeGpu+0x7c/0x90 [nvidia]
Feb  9 15:59:37 user-desktop kernel: [  586.626755]  threadPriorityStateFree+0x234/0x2a0 [nvidia]
Feb  9 15:59:37 user-desktop kernel: [  586.626858]  RmShutdownAdapter+0x168/0x268 [nvidia]
Feb  9 15:59:37 user-desktop kernel: [  586.626961]  rm_shutdown_adapter+0x50/0x70 [nvidia]
Feb  9 15:59:37 user-desktop kernel: [  586.627063]  nv_shutdown_adapter+0xb4/0x4b0 [nvidia]
Feb  9 15:59:37 user-desktop kernel: [  586.627166]  nv_shutdown_adapter+0x2d8/0x4b0 [nvidia]
Feb  9 15:59:37 user-desktop kernel: [  586.627267]  nvidia_dev_put+0x38/0xc40 [nvidia]
Feb  9 15:59:37 user-desktop kernel: [  586.627341]  nvkms_close_gpu+0x60/0x98 [nvidia_modeset]
Feb  9 15:59:37 user-desktop kernel: [  586.627408]  nvRmFreeDeviceEvo+0x8c/0x130 [nvidia_modeset]
Feb  9 15:59:37 user-desktop kernel: [  586.627474]  nvkms_ioctl_common+0x180/0x1b0 [nvidia_modeset]
Feb  9 15:59:37 user-desktop kernel: [  586.627576]  nvidia_frontend_unlocked_ioctl+0x5c/0x78 [nvidia]
Feb  9 15:59:37 user-desktop kernel: [  586.627584]  __arm64_sys_ioctl+0xac/0xf0
Feb  9 15:59:37 user-desktop kernel: [  586.627591]  el0_svc_common.constprop.0+0x80/0x1d0
Feb  9 15:59:37 user-desktop kernel: [  586.627596]  do_el0_svc+0x38/0xb0
Feb  9 15:59:37 user-desktop kernel: [  586.627603]  el0_svc+0x1c/0x30
Feb  9 15:59:37 user-desktop kernel: [  586.627607]  el0_sync_handler+0xa8/0xb0
Feb  9 15:59:37 user-desktop kernel: [  586.627611]  el0_sync+0x16c/0x180

能确定这是什么原因导致的吗?

You might be missing the following patch in the nvdisplay driver:

diff --git a/drivers/resman/src/kernel/core/locks_minimal.c b/drivers/resman/src/kernel/core/locks_minimal.c
index 252af2a..3f6c5f7 100644
--- a/drivers/resman/src/kernel/core/locks_minimal.c
+++ b/drivers/resman/src/kernel/core/locks_minimal.c
@@ -44,6 +44,7 @@
     OS_THREAD_HANDLE    threadId;  //<! ID of thread owning the lock, ~0 if none
     NvU64               timestamp; //<! Timestamp of last lock acquire
     LOCK_TRACE_INFO     traceInfo; //<! Lock acquire/release trace info
+    NvBool              bValid;    //<! If ready to acquire/release the lock 
 } GPULOCK;
 
 static GPULOCK rmGpuLock;
@@ -63,6 +64,7 @@
     OS_THREAD_HANDLE threadId;
     osGetCurrentThread(&threadId);
     osGetCurrentTick(&timestamp);
+    rmGpuLock.bValid = NV_FALSE;
 
     INSERT_LOCK_TRACE(&rmGpuLock.traceInfo, NV_RETURN_ADDRESS(),
                       lockTraceAlloc, 0, 0, threadId,
@@ -87,12 +89,16 @@
 {
     NV_ASSERT_OR_RETURN(gpuInst == 0, NV_ERR_INVALID_ARGUMENT);
     NV_ASSERT_OR_RETURN(rmGpuLock.pLock != NULL, NV_ERR_INVALID_STATE);
+
+    rmGpuLock.bValid = NV_TRUE;
+
     return NV_OK;
 }
 
 void rmGpuLockFree(NvU32 gpuInst)
 {
     NV_ASSERT_OR_RETURN_VOID(gpuInst == 0);
+    rmGpuLock.bValid = NV_FALSE;
 }
 
 static NV_STATUS _rmGpuLockAcquire(NvU32 flags, void *ra)
@@ -100,6 +106,14 @@
     NvBool bCondAcquire = !!(flags & GPUS_LOCK_FLAGS_COND_ACQUIRE);
     NvBool bHighIrql = (portSyncExSafeToSleep() == NV_FALSE);
 
+    //
+    // We may get a bValid as NV_FALSE before GPU is attached.
+    //
+    if (rmGpuLock.bValid == NV_FALSE)
+    {
+        return NV_OK;
+    }
+
     NV_ASSERT_OR_RETURN(rmGpuLock.pLock != NULL, NV_ERR_INVALID_STATE);
     NV_ASSERT_OR_RETURN(!rmGpuLockIsOwner(), NV_ERR_CYCLE_DETECTED);
     if (bCondAcquire || bHighIrql)
@@ -114,6 +128,7 @@
         portSyncSemaphoreAcquire(rmGpuLock.pLock);
         portAtomicDecrementU32(&rmGpuLock.waiting);
     }
+
     osGetCurrentThread(&rmGpuLock.threadId);
     osGetCurrentTick(&rmGpuLock.timestamp);
 
@@ -124,6 +139,7 @@
     INSERT_LOCK_TRACE(&rmGpuLock.traceInfo, ra,
                       lockTraceAcquire, 0, 0, rmGpuLock.threadId,
                       bHighIrql, 0, rmGpuLock.timestamp);
+
     return NV_OK;
 }
 
@@ -132,6 +148,14 @@
     OS_THREAD_HANDLE threadId;
     NvU64 timestamp;
 
+    //
+    // We may get a bValid as NV_FALSE before GPU is attached.
+    //
+    if (rmGpuLock.bValid == NV_FALSE)
+    {
+        return NV_OK;
+    }
+

我检查了一下,这笔patch之前已经合入了

Please check whether this error can still be reproduced with the latest JetPack release.

刷机之后就没问题,这问题是客户用一段时间之后出现的。一开始也不会一直重启