GPU not detected on Nvidia Jetson Orin Nano after sudden reboot (I need to run TensorRT engine inference)

Hi there,

I’m no longer able to run TensorRT inference: my Jetson suddenly rebooted while running a script that performs AI inference on a live video feed. See this post for the sample script (note that we are now facing a different issue, with a different camera): X264 and TensorRT sudden reboot (MJPG encoder not affected, but not fast enough) on Jetson Orin Nano - Jetson & Embedded Systems / Jetson Orin Nano - NVIDIA Developer Forums

Here is the script:

import cv2
import os
import numpy as np

import data.augmentation as augmentations
from tensorrt_inference import TensorRTInference
from utils.fps import FPS

# Select the serialized TensorRT engine file according to the host OS.
if os.name == 'nt':
    ENGINE_PATH: str = "models/windows_built_engine.trt"
else:
    ENGINE_PATH: str = "/home/fov/Desktop/FOVCamerasWebApp/jetson/models/jetson_orin_dynamic_output.trt"


print("Creating capture and encoder pipelines...")

# The first argument is a GStreamer pipeline description, so request the
# GStreamer backend explicitly instead of letting OpenCV probe every backend.
cap = cv2.VideoCapture(
    'nvarguscamerasrc !  '
    'video/x-raw(memory:NVMM) , width=1920, height=1080, format=NV12, framerate=30/1 ! '
    'nvvidconv flip-method=2 ! '
    'video/x-raw, width=1920, height=1080, format=BGRx ! '
    'videoconvert ! '
    'video/x-raw, format=BGR ! '
    'appsink',
    cv2.CAP_GSTREAMER,
)

# Note: this one works, but only runs at about 5FPS which is too slow
# out = cv2.VideoWriter("test.avi", cv2.VideoWriter_fourcc(*'MJPG'), 30, (1920, 1080))

# Note: this can run at 20FPS, but causes a sudden reboot when ran with the tensorrt model
# Arguments after the pipeline: backend, fourcc (0 = raw frames into appsrc),
# frame rate, frame size, is-colour flag.
out = cv2.VideoWriter(
    'appsrc is-live=true do-timestamp=true format=3 ! '
    'video/x-raw, format=(string)BGR ! '
    'videoconvert ! '
    'video/x-raw, format=(string)NV12 ! '
    'x264enc bitrate=4000 speed-preset=ultrafast tune=zerolatency ! '
    'video/x-h264, profile=baseline ! '
    'h264parse ! '
    'qtmux ! '
    'filesink location=output.mp4',
    cv2.CAP_GSTREAMER, 0, 30.0, (1920, 1080), True
)

# Everything that can fail runs inside try/finally so that the camera and the
# encoder pipeline are always released, even when model loading or inference
# raises.
try:
    # Fail loudly instead of silently recording or processing nothing.
    if not cap.isOpened():
        raise RuntimeError("Failed to open the camera capture pipeline.")
    if not out.isOpened():
        raise RuntimeError("Failed to open the video encoder pipeline.")

    print("Loading TensorRT model...")
    tensorrt_model = TensorRTInference(ENGINE_PATH)

    inputs, outputs, bindings, stream = tensorrt_model.allocate_buffers()

    fps = FPS()

    while cap.isOpened():
        fps.start()
        ret_val, img = cap.read()

        if not ret_val:
            print("Failed to get the frame from the camera.")
            break

        if os.name == 'nt':
            img = cv2.resize(img, (1920, 1080))

        # NOTE(review): numpy2tensor is a project helper; presumably it
        # converts the frame into the layout/dtype the engine input expects -
        # confirm.
        img_tensor = augmentations.numpy2tensor(img)
        out.write(img)
        # Drop the reference to the raw frame before running inference.
        del img

        # Run inference: stage the flattened tensor in the first input's host
        # buffer, then let the model copy it to the device and execute.
        np.copyto(inputs[0].host, img_tensor.ravel())

        tensorrt_model.do_inference(inputs, outputs, bindings, stream)

        fps.stop()
        print(f"FPS: {fps.fps()}")

        # Get the output
        res = [output.host for output in outputs]
        print(res)
finally:
    cap.release()
    out.release()

Here is the relevant tensorrt_inference code I wrote too, although it’s mostly boilerplate:

import ctypes
import numpy as np
import tensorrt as trt

from cuda import cuda, cudart
from typing import Union, Optional


# Module-wide TensorRT logger constructed with WARNING severity; it is handed
# to trt.Runtime when an engine is deserialized in load_engine.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)


def check_cuda_err(err):
    """Raise if a CUDA driver or runtime status code signals a failure.

    Args:
        err: Status value returned as the first element of a ``cuda``
            (driver API) or ``cudart`` (runtime API) call result.

    Raises:
        RuntimeError: If ``err`` is a ``cuda.CUresult`` other than
            ``CUDA_SUCCESS``, a ``cudart.cudaError_t`` other than
            ``cudaSuccess``, or a value of neither type.
    """
    if isinstance(err, cuda.CUresult):
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("Cuda Error: {}".format(err))
    # `elif`, not a second `if`: otherwise a successful CUresult falls through
    # to the `else` branch below and is reported as an unknown error type.
    elif isinstance(err, cudart.cudaError_t):
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError("Cuda Runtime Error: {}".format(err))
    else:
        raise RuntimeError("Unknown error type: {}".format(err))


def cuda_call(call):
    """Validate the status of a CUDA call result and return its payload.

    Args:
        call: Indexable result whose first element is the status code and
            whose remaining elements are the returned values.

    Returns:
        The single returned value when there is exactly one; otherwise the
        slice ``call[1:]`` unchanged (which may be empty).

    Raises:
        RuntimeError: Propagated from ``check_cuda_err`` when the status
            code does not indicate success.
    """
    status = call[0]
    remainder = call[1:]
    check_cuda_err(status)
    if len(remainder) != 1:
        return remainder
    return remainder[0]


class TensorRTInference:
    """Deserialized TensorRT engine plus one execution context.

    Typical use: construct with the engine path, call ``allocate_buffers``
    once, then call ``do_inference`` for every input that has been copied
    into ``inputs[i].host``.
    """

    # Buffer shapes used for these named tensors instead of the shape the
    # engine reports. NOTE(review): presumably the engine reports them as
    # dynamic (-1) and 5 is the maximum number of detections - confirm, since
    # the device buffers are sized from these values.
    _FIXED_SHAPES = {
        "boxes": (5, 4),
        "labels": (5,),
        "scores": (5,),
    }

    def __init__(self, engine_path: str):
        """Load the engine at ``engine_path`` and create an execution context."""
        self.engine = self.load_engine(engine_path)
        self.context = self.engine.create_execution_context()
        print("TensorRT engine loaded successfully")

    @staticmethod
    def load_engine(engine_path) -> trt.ICudaEngine:
        """Load a TensorRT engine from file."""
        with open(engine_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def allocate_buffers(self):
        """Create a CUDA stream and a host/device buffer pair per I/O tensor.

        Returns:
            Tuple ``(inputs, outputs, bindings, stream)``: ``inputs`` and
            ``outputs`` are lists of ``HostDeviceMem``, ``bindings`` holds
            every device address as ``int`` in engine tensor order, and
            ``stream`` is the created CUDA stream.

        Raises:
            ValueError: If a tensor without a fixed-shape override reports a
                negative (dynamic) dimension.
        """
        inputs = []
        outputs = []
        bindings = []
        stream = cuda_call(cudart.cudaStreamCreate())
        tensor_names = [self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)]
        for binding in tensor_names:
            # Use the fixed override when one exists; otherwise take the shape
            # straight from the engine.
            shape = self._FIXED_SHAPES.get(binding)
            if shape is None:
                shape = self.engine.get_tensor_shape(binding)
            if any(s < 0 for s in shape):
                raise ValueError(f"Binding {binding} has dynamic shape, " +
                                 "but no profile was specified.")
            size = trt.volume(shape)
            trt_type = self.engine.get_tensor_dtype(binding)

            # Allocate host and device buffers
            if trt.nptype(trt_type):
                dtype = np.dtype(trt.nptype(trt_type))
                bindingMemory = HostDeviceMem(size, dtype)
            else:  # no numpy support: create a byte array instead (BF16, FP8, INT4)
                size = int(size * trt_type.itemsize)
                bindingMemory = HostDeviceMem(size)

            # Append the device buffer to device bindings.
            bindings.append(int(bindingMemory.device))

            # Append to the appropriate list.
            if self.engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
                inputs.append(bindingMemory)
            else:
                outputs.append(bindingMemory)
        return inputs, outputs, bindings, stream

    def do_inference(self, inputs, outputs, bindings, stream):
        """Run one inference pass and return the host output arrays.

        Args:
            inputs: Input ``HostDeviceMem`` objects whose host arrays already
                hold the data to process.
            outputs: Output ``HostDeviceMem`` objects to be filled.
            bindings: Device addresses in engine tensor order, as returned by
                ``allocate_buffers``.
            stream: CUDA stream on which all work is enqueued.

        Returns:
            List with the host array of every output.

        Raises:
            RuntimeError: If a CUDA call fails or TensorRT reports that the
                inference could not be enqueued.
        """
        def execute_async_func():
            # execute_async_v3 reports success as a bool; surface a failure
            # rather than returning whatever the output buffers still hold.
            if not self.context.execute_async_v3(stream_handle=stream):
                raise RuntimeError("TensorRT execute_async_v3 failed to enqueue inference")
        # Setup context tensor address.
        num_io = self.engine.num_io_tensors
        for i in range(num_io):
            self.context.set_tensor_address(self.engine.get_tensor_name(i), bindings[i])
        return self._do_inference_base(inputs, outputs, stream, execute_async_func)

    def _do_inference_base(self, inputs, outputs, stream, execute_async_func):
        """Copy inputs to the device, execute, copy outputs back, synchronize.

        All copies and the execution are enqueued on ``stream``; the method
        returns only after the stream has been synchronized.
        """
        # Transfer input data to the GPU.
        kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
        for inp in inputs:
            cuda_call(cudart.cudaMemcpyAsync(inp.device, inp.host, inp.nbytes, kind, stream))
        # Run inference.
        execute_async_func()
        # Transfer predictions back from the GPU.
        kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
        for out in outputs:
            cuda_call(cudart.cudaMemcpyAsync(out.host, out.device, out.nbytes, kind, stream))
        # Synchronize the stream
        cuda_call(cudart.cudaStreamSynchronize(stream))
        # Return only the host outputs.
        return [out.host for out in outputs]


class HostDeviceMem:
    """Pair of host and device memory, where the host memory is wrapped in a numpy array"""
    def __init__(self, size: int, dtype: Optional[np.dtype] = None):
        """Allocate ``size`` elements of ``dtype`` on both host and device.

        Args:
            size: Number of elements (not bytes) in the buffer.
            dtype: Element type; defaults to ``uint8`` when ``None``.

        Raises:
            RuntimeError: If either CUDA allocation fails.
        """
        # Explicit None check: do not rely on the truthiness of a dtype
        # object to decide whether the caller supplied one.
        dtype = np.dtype(np.uint8 if dtype is None else dtype)
        nbytes = size * dtype.itemsize
        host_mem = cuda_call(cudart.cudaMallocHost(nbytes))
        try:
            pointer_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))
            self._host = np.ctypeslib.as_array(ctypes.cast(host_mem, pointer_type), (size,))
            self._device = cuda_call(cudart.cudaMalloc(nbytes))
        except Exception:
            # Do not leak the host allocation when a later step fails; the
            # original error is what the caller needs to see.
            cudart.cudaFreeHost(host_mem)
            raise
        self._nbytes = nbytes

    @property
    def host(self) -> np.ndarray:
        """Numpy view over the host allocation."""
        return self._host

    @host.setter
    def host(self, data: Union[np.ndarray, bytes]):
        """Copy ``data`` into the host buffer.

        Raises:
            ValueError: If an array has more elements than the buffer.
        """
        if isinstance(data, np.ndarray):
            if data.size > self.host.size:
                raise ValueError(
                    f"Tried to fit an array of size {data.size} into host memory of size {self.host.size}"
                )
            np.copyto(self.host[:data.size], data.flat, casting='safe')
        else:
            # Raw bytes are only accepted by byte-typed buffers.
            assert self.host.dtype == np.uint8
            self.host[:self.nbytes] = np.frombuffer(data, dtype=np.uint8)

    @property
    def device(self) -> int:
        """Device allocation as returned by ``cudaMalloc``."""
        return self._device

    @property
    def nbytes(self) -> int:
        """Size in bytes of each of the two allocations."""
        return self._nbytes

    def __str__(self):
        return f"Host:\n{self.host}\nDevice:\n{self.device}\nSize:\n{self.nbytes}\n"

    def __repr__(self):
        return self.__str__()

    def free(self):
        """Release the device allocation and then the host allocation."""
        cuda_call(cudart.cudaFree(self.device))
        cuda_call(cudart.cudaFreeHost(self.host.ctypes.data))

When I try to run this script now I get the following error:

$ python record_images_and_detect.py

[05/22/2024-14:10:44] [TRT] [W] CUDA initialization failure with error: 100
Traceback (most recent call last):
  File "record_images_and_detect.py", line 307, in <module>
    main()
  File "record_images_and_detect.py", line 301, in main
    tensorrt_model = TensorRTInference(ENGINE_PATH)
  File "/home/fov/Desktop/FOVCamerasWebApp/jetson/tensorrt_inference.py", line 33, in __init__
    self.engine = self.load_engine(engine_path)
  File "/home/fov/Desktop/FOVCamerasWebApp/jetson/tensorrt_inference.py", line 40, in load_engine
    with open(engine_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
TypeError: pybind11::init(): factory function returned nullptr

Out of curiosity, I wanted to have a look at sudo jtop; however, it doesn’t work, and when I inspected its logs I got this:

May 22 14:15:59 marvel-fov-8 systemd[1]: Started jtop service.
May 22 14:15:59 marvel-fov-8 systemd[472530]: jtop.service: Failed to execute command: No such file or directory
May 22 14:15:59 marvel-fov-8 systemd[472530]: jtop.service: Failed at step EXEC spawning /usr/local/bin/jtop: No such file or directory
May 22 14:15:59 marvel-fov-8 systemd[1]: jtop.service: Main process exited, code=exited, status=203/EXEC
May 22 14:15:59 marvel-fov-8 systemd[1]: jtop.service: Failed with result 'exit-code'.
May 22 14:16:09 marvel-fov-8 systemd[1]: jtop.service: Scheduled restart job, restart counter is at 1.
May 22 14:16:09 marvel-fov-8 systemd[1]: Stopped jtop service.
May 22 14:16:09 marvel-fov-8 systemd[1]: Started jtop service.
May 22 14:16:09 marvel-fov-8 jtop[472547]: [INFO] jtop.core.config - Build service folder in /usr/local/jtop
May 22 14:16:09 marvel-fov-8 jtop[472547]: [INFO] jtop.service - jetson_stats 4.2.8 - server loaded
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.service - Running on Python: 3.8.10
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.hardware - Hardware detected aarch64
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.hardware - NVIDIA Jetson 699-level Part Number=699-13767-0005-300 K.2
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.hardware - NVIDIA Jetson Module=NVIDIA Jetson Orin Nano (Developer kit)
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.hardware - NVIDIA Jetson detected L4T=35.3.1
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.cpu - Found 6 CPU
**May 22 14:16:10 marvel-fov-8 jtop[472547]: [WARNING] jtop.core.gpu - No NVIDIA GPU available**
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.processes - Process service started
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.memory - Found EMC!
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.memory - Memory service started
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.engine - Engines found: [APE NVDEC NVENC NVJPG OFA SE VIC]
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "CV0" in thermal_zone2
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "CPU" in thermal_zone0
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "SOC2" in thermal_zone7
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "SOC0" in thermal_zone5
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "CV1" in thermal_zone3
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "GPU" in thermal_zone1
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "tj" in thermal_zone8
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "SOC1" in thermal_zone6
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "CV2" in thermal_zone4
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.power - Alarms VDD_IN - {'crit_alarm': 0, 'max_alarm': 0}
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.power - Alarms VDD_CPU_GPU_CV - {'crit_alarm': 0, 'max_alarm': 0}
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.power - Alarms VDD_SOC - {'crit_alarm': 0, 'max_alarm': 0}
May 22 14:16:10 marvel-fov-8 jtop[472547]: [WARNING] jtop.core.power - Skipped "sum of shunt voltages" /sys/bus/i2c/devices/1-0040/hwmon/hwmon3/in7_label
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.power - Found I2C power monitor
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.fan - Fan pwmfan(1) found in /sys/class/hwmon/hwmon2
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.fan - RPM pwm_tach found in /sys/class/hwmon/hwmon0
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.fan - Found nvfancontrol.service
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.jetson_clocks - jetson_clocks found in /usr/bin/jetson_clocks
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.nvpmodel - nvpmodel running in [0]15W - Default: 0
May 22 14:16:10 marvel-fov-8 jtop[472573]: [INFO] jtop.service - Initialization service
May 22 14:16:11 marvel-fov-8 jtop[472573]: Process JtopServer-1:
May 22 14:16:11 marvel-fov-8 jtop[472573]: Traceback (most recent call last):
May 22 14:16:11 marvel-fov-8 jtop[472573]:   File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
May 22 14:16:11 marvel-fov-8 jtop[472573]:     self.run()
May 22 14:16:11 marvel-fov-8 jtop[472573]:   File "/usr/local/lib/python3.8/dist-packages/jtop/service.py", line 319, in run
May 22 14:16:11 marvel-fov-8 jtop[472573]:     self.jetson_clocks.initialization(self.nvpmodel, data)
May 22 14:16:11 marvel-fov-8 jtop[472573]:   File "/usr/local/lib/python3.8/dist-packages/jtop/core/jetson_clocks.py", line 370, in initialization
May 22 14:16:11 marvel-fov-8 jtop[472573]:     self._engines_list = self.show()
May 22 14:16:11 marvel-fov-8 jtop[472573]:   File "/usr/local/lib/python3.8/dist-packages/jtop/core/jetson_clocks.py", line 522, in show
May 22 14:16:11 marvel-fov-8 jtop[472573]:     lines = cmd(timeout=COMMAND_TIMEOUT)
May 22 14:16:11 marvel-fov-8 jtop[472573]:   File "/usr/local/lib/python3.8/dist-packages/jtop/core/command.py", line 115, in __call__
May 22 14:16:11 marvel-fov-8 jtop[472573]:     raise Command.CommandException('Error process:', self.process.returncode)
May 22 14:16:11 marvel-fov-8 jtop[472573]: jtop.core.command.Command.CommandException: [errno:1] Error process:
May 22 14:16:11 marvel-fov-8 jtop[472547]: [INFO] jtop.service - Service closed
May 22 14:16:11 marvel-fov-8 systemd[1]: jtop.service: Succeeded.

Most notably: May 22 14:16:10 marvel-fov-8 jtop[472547]: [WARNING] jtop.core.gpu - No NVIDIA GPU available

More simply, torch says cuda isn’t available:

fov@marvel-fov-8:~$ python
Python 3.8.10 (default, May 26 2023, 14:05:08) 
[GCC 9.4.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
>>> print("CUDA is available" if torch.cuda.is_available() else "CUDA is not available")
CUDA is not available

Here is the device information taken from an earlier screenshot from jtop:

I will note in advance that we are not able to reflash the device as it is deployed remotely in a different country.

Thanks in advance for guidance and please let me know if there’s any more information I can provide
Thanks

This is a duplicate of
X264 and TensorRT sudden reboot (MJPG encoder not affected, but not fast enough) on Jetson Orin Nano