Hi there,
I’m no longer able to run TensorRT inference after my Jetson suddenly rebooted while running a script that performs AI inference on a live video feed. The sample script is in this earlier post, although the issue we are facing now is different and involves a different camera: X264 and TensorRT sudden reboot (MJPG encoder not affected, but not fast enough) on Jetson Orin Nano - Jetson & Embedded Systems / Jetson Orin Nano - NVIDIA Developer Forums
Here is the script:
import cv2
import os
import numpy as np
import data.augmentation as augmentations
from tensorrt_inference import TensorRTInference
from utils.fps import FPS

if os.name == 'nt':
    ENGINE_PATH: str = "models/windows_built_engine.trt"
else:
    ENGINE_PATH: str = "/home/fov/Desktop/FOVCamerasWebApp/jetson/models/jetson_orin_dynamic_output.trt"

print("Creating capture and encoder pipelines...")
cap = cv2.VideoCapture(
    'nvarguscamerasrc ! '
    'video/x-raw(memory:NVMM), width=1920, height=1080, format=NV12, framerate=30/1 ! '
    'nvvidconv flip-method=2 ! '
    'video/x-raw, width=1920, height=1080, format=BGRx ! '
    'videoconvert ! '
    'video/x-raw, format=BGR ! '
    'appsink'
)

# Note: this one works, but only runs at about 5 FPS, which is too slow.
# out = cv2.VideoWriter("test.avi", cv2.VideoWriter_fourcc(*'MJPG'), 30, (1920, 1080))
# Note: this can run at 20 FPS, but causes a sudden reboot when run with the TensorRT model.
out = cv2.VideoWriter(
    'appsrc is-live=true do-timestamp=true format=3 ! '
    'video/x-raw, format=(string)BGR ! '
    'videoconvert ! '
    'video/x-raw, format=(string)NV12 ! '
    'x264enc bitrate=4000 speed-preset=ultrafast tune=zerolatency ! '
    'video/x-h264, profile=baseline ! '
    'h264parse ! '
    'qtmux ! '
    'filesink location=output.mp4',
    0, 30.0, (1920, 1080), True
)

print("Loading TensorRT model...")
tensorrt_model = TensorRTInference(ENGINE_PATH)
inputs, outputs, bindings, stream = tensorrt_model.allocate_buffers()
fps = FPS()

while cap.isOpened():
    fps.start()
    ret_val, img = cap.read()
    if not ret_val:
        print("Failed to get the frame from the camera.")
        break
    if os.name == 'nt':
        img = cv2.resize(img, (1920, 1080))
    img_tensor = augmentations.numpy2tensor(img)
    out.write(img)
    del img

    # Run inference
    np.copyto(inputs[0].host, img_tensor.ravel())
    tensorrt_model.do_inference(inputs, outputs, bindings, stream)
    fps.stop()
    print(f"FPS: {fps.fps()}")

    # Get the output
    res = [output.host for output in outputs]
    print(res)

cap.release()
out.release()
Here is the relevant tensorrt_inference code I wrote as well, although it’s mostly boilerplate:
import ctypes
import numpy as np
import tensorrt as trt
from cuda import cuda, cudart
from typing import Union, Optional

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)


def check_cuda_err(err):
    if isinstance(err, cuda.CUresult):
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("Cuda Error: {}".format(err))
    if isinstance(err, cudart.cudaError_t):
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError("Cuda Runtime Error: {}".format(err))
    else:
        raise RuntimeError("Unknown error type: {}".format(err))


def cuda_call(call):
    err, res = call[0], call[1:]
    check_cuda_err(err)
    if len(res) == 1:
        res = res[0]
    return res


class TensorRTInference:
    def __init__(self, engine_path: str):
        self.engine = self.load_engine(engine_path)
        self.context = self.engine.create_execution_context()
        print("TensorRT engine loaded successfully")

    @staticmethod
    def load_engine(engine_path) -> trt.ICudaEngine:
        """Load a TensorRT engine from file."""
        with open(engine_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def allocate_buffers(self):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda_call(cudart.cudaStreamCreate())
        tensor_names = [self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)]
        for binding in tensor_names:
            # get_tensor_profile_shape returns (min_shape, optimal_shape, max_shape).
            # Pick out the max shape to allocate enough memory for the binding.
            if binding == "boxes":
                shape = (5, 4)
            elif binding == "labels":
                shape = (5,)
            elif binding == "scores":
                shape = (5,)
            else:
                shape = self.engine.get_tensor_shape(binding)
            shape_valid = np.all([s >= 0 for s in shape])
            if not shape_valid:
                raise ValueError(f"Binding {binding} has dynamic shape, "
                                 "but no profile was specified.")
            size = trt.volume(shape)
            trt_type = self.engine.get_tensor_dtype(binding)

            # Allocate host and device buffers
            if trt.nptype(trt_type):
                dtype = np.dtype(trt.nptype(trt_type))
                bindingMemory = HostDeviceMem(size, dtype)
            else:  # no numpy support: create a byte array instead (BF16, FP8, INT4)
                size = int(size * trt_type.itemsize)
                bindingMemory = HostDeviceMem(size)

            # Append the device buffer to device bindings.
            bindings.append(int(bindingMemory.device))

            # Append to the appropriate list.
            if self.engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
                inputs.append(bindingMemory)
            else:
                outputs.append(bindingMemory)
        return inputs, outputs, bindings, stream

    def do_inference(self, inputs, outputs, bindings, stream):
        def execute_async_func():
            self.context.execute_async_v3(stream_handle=stream)

        # Set up the context tensor addresses.
        num_io = self.engine.num_io_tensors
        for i in range(num_io):
            self.context.set_tensor_address(self.engine.get_tensor_name(i), bindings[i])
        return self._do_inference_base(inputs, outputs, stream, execute_async_func)

    def _do_inference_base(self, inputs, outputs, stream, execute_async_func):
        # Transfer input data to the GPU.
        kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
        [cuda_call(cudart.cudaMemcpyAsync(inp.device, inp.host, inp.nbytes, kind, stream)) for inp in inputs]
        # Run inference.
        execute_async_func()
        # Transfer predictions back from the GPU.
        kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
        [cuda_call(cudart.cudaMemcpyAsync(out.host, out.device, out.nbytes, kind, stream)) for out in outputs]
        # Synchronize the stream.
        cuda_call(cudart.cudaStreamSynchronize(stream))
        # Return only the host outputs.
        return [out.host for out in outputs]


class HostDeviceMem:
    """Pair of host and device memory, where the host memory is wrapped in a numpy array."""

    def __init__(self, size: int, dtype: Optional[np.dtype] = None):
        dtype = dtype or np.dtype(np.uint8)
        nbytes = size * dtype.itemsize
        host_mem = cuda_call(cudart.cudaMallocHost(nbytes))
        pointer_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))
        self._host = np.ctypeslib.as_array(ctypes.cast(host_mem, pointer_type), (size,))
        self._device = cuda_call(cudart.cudaMalloc(nbytes))
        self._nbytes = nbytes

    @property
    def host(self) -> np.ndarray:
        return self._host

    @host.setter
    def host(self, data: Union[np.ndarray, bytes]):
        if isinstance(data, np.ndarray):
            if data.size > self.host.size:
                raise ValueError(
                    f"Tried to fit an array of size {data.size} into host memory of size {self.host.size}"
                )
            np.copyto(self.host[:data.size], data.flat, casting='safe')
        else:
            assert self.host.dtype == np.uint8
            self.host[:self.nbytes] = np.frombuffer(data, dtype=np.uint8)

    @property
    def device(self) -> int:
        return self._device

    @property
    def nbytes(self) -> int:
        return self._nbytes

    def __str__(self):
        return f"Host:\n{self.host}\nDevice:\n{self.device}\nSize:\n{self.nbytes}\n"

    def __repr__(self):
        return self.__str__()

    def free(self):
        cuda_call(cudart.cudaFree(self.device))
        cuda_call(cudart.cudaFreeHost(self.host.ctypes.data))
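To separate the TensorRT side from the GStreamer pipelines, the class above can also be exercised on its own with random input and no camera. A minimal sketch (the engine path is the same as in the main script, and the input size is just taken from the allocated buffer):

import numpy as np
from tensorrt_inference import TensorRTInference

# Load the engine and run a single inference on random data, with no camera
# or encoder pipeline involved.
model = TensorRTInference("/home/fov/Desktop/FOVCamerasWebApp/jetson/models/jetson_orin_dynamic_output.trt")
inputs, outputs, bindings, stream = model.allocate_buffers()
inputs[0].host[:] = np.random.rand(inputs[0].host.size).astype(inputs[0].host.dtype)
results = model.do_inference(inputs, outputs, bindings, stream)
print([r[:10] for r in results])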
When I try to run the main script (record_images_and_detect.py) now, I get the following error:
$ python record_images_and_detect.py
[05/22/2024-14:10:44] [TRT] [W] CUDA initialization failure with error: 100
Traceback (most recent call last):
File "record_images_and_detect.py", line 307, in <module>
main()
File "record_images_and_detect.py", line 301, in main
tensorrt_model = TensorRTInference(ENGINE_PATH)
File "/home/fov/Desktop/FOVCamerasWebApp/jetson/tensorrt_inference.py", line 33, in __init__
self.engine = self.load_engine(engine_path)
File "/home/fov/Desktop/FOVCamerasWebApp/jetson/tensorrt_inference.py", line 40, in load_engine
with open(engine_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
TypeError: pybind11::init(): factory function returned nullptr
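As far as I understand, error 100 corresponds to cudaErrorNoDevice (no CUDA-capable device detected), which is why the trt.Runtime constructor returns a null pointer. To rule out TensorRT itself, a minimal check against the CUDA runtime directly, using the same cuda-python bindings as tensorrt_inference.py, should show whether the runtime can see the GPU at all:

from cuda import cudart

# Ask the CUDA runtime how many devices it can see; on a healthy Orin Nano this
# should report cudaSuccess and a count of 1.
err, count = cudart.cudaGetDeviceCount()
print("cudaGetDeviceCount:", err, "devices:", count)

if err == cudart.cudaError_t.cudaSuccess and count > 0:
    err, props = cudart.cudaGetDeviceProperties(0)
    if err == cudart.cudaError_t.cudaSuccess:
        print("Device 0:", props.name.decode(errors="ignore"))
    else:
        print("cudaGetDeviceProperties failed:", err)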
Out of curiosity, I wanted to have a look at sudo jtop; however, it doesn’t work, and when I inspected its logs I got this:
May 22 14:15:59 marvel-fov-8 systemd[1]: Started jtop service.
May 22 14:15:59 marvel-fov-8 systemd[472530]: jtop.service: Failed to execute command: No such file or directory
May 22 14:15:59 marvel-fov-8 systemd[472530]: jtop.service: Failed at step EXEC spawning /usr/local/bin/jtop: No such file or directory
May 22 14:15:59 marvel-fov-8 systemd[1]: jtop.service: Main process exited, code=exited, status=203/EXEC
May 22 14:15:59 marvel-fov-8 systemd[1]: jtop.service: Failed with result 'exit-code'.
May 22 14:16:09 marvel-fov-8 systemd[1]: jtop.service: Scheduled restart job, restart counter is at 1.
May 22 14:16:09 marvel-fov-8 systemd[1]: Stopped jtop service.
May 22 14:16:09 marvel-fov-8 systemd[1]: Started jtop service.
May 22 14:16:09 marvel-fov-8 jtop[472547]: [INFO] jtop.core.config - Build service folder in /usr/local/jtop
May 22 14:16:09 marvel-fov-8 jtop[472547]: [INFO] jtop.service - jetson_stats 4.2.8 - server loaded
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.service - Running on Python: 3.8.10
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.hardware - Hardware detected aarch64
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.hardware - NVIDIA Jetson 699-level Part Number=699-13767-0005-300 K.2
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.hardware - NVIDIA Jetson Module=NVIDIA Jetson Orin Nano (Developer kit)
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.hardware - NVIDIA Jetson detected L4T=35.3.1
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.cpu - Found 6 CPU
**May 22 14:16:10 marvel-fov-8 jtop[472547]: [WARNING] jtop.core.gpu - No NVIDIA GPU available**
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.processes - Process service started
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.memory - Found EMC!
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.memory - Memory service started
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.engine - Engines found: [APE NVDEC NVENC NVJPG OFA SE VIC]
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "CV0" in thermal_zone2
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "CPU" in thermal_zone0
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "SOC2" in thermal_zone7
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "SOC0" in thermal_zone5
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "CV1" in thermal_zone3
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "GPU" in thermal_zone1
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "tj" in thermal_zone8
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "SOC1" in thermal_zone6
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.temperature - Found thermal "CV2" in thermal_zone4
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.power - Alarms VDD_IN - {'crit_alarm': 0, 'max_alarm': 0}
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.power - Alarms VDD_CPU_GPU_CV - {'crit_alarm': 0, 'max_alarm': 0}
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.power - Alarms VDD_SOC - {'crit_alarm': 0, 'max_alarm': 0}
May 22 14:16:10 marvel-fov-8 jtop[472547]: [WARNING] jtop.core.power - Skipped "sum of shunt voltages" /sys/bus/i2c/devices/1-0040/hwmon/hwmon3/in7_label
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.power - Found I2C power monitor
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.fan - Fan pwmfan(1) found in /sys/class/hwmon/hwmon2
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.fan - RPM pwm_tach found in /sys/class/hwmon/hwmon0
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.fan - Found nvfancontrol.service
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.jetson_clocks - jetson_clocks found in /usr/bin/jetson_clocks
May 22 14:16:10 marvel-fov-8 jtop[472547]: [INFO] jtop.core.nvpmodel - nvpmodel running in [0]15W - Default: 0
May 22 14:16:10 marvel-fov-8 jtop[472573]: [INFO] jtop.service - Initialization service
May 22 14:16:11 marvel-fov-8 jtop[472573]: Process JtopServer-1:
May 22 14:16:11 marvel-fov-8 jtop[472573]: Traceback (most recent call last):
May 22 14:16:11 marvel-fov-8 jtop[472573]: File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
May 22 14:16:11 marvel-fov-8 jtop[472573]: self.run()
May 22 14:16:11 marvel-fov-8 jtop[472573]: File "/usr/local/lib/python3.8/dist-packages/jtop/service.py", line 319, in run
May 22 14:16:11 marvel-fov-8 jtop[472573]: self.jetson_clocks.initialization(self.nvpmodel, data)
May 22 14:16:11 marvel-fov-8 jtop[472573]: File "/usr/local/lib/python3.8/dist-packages/jtop/core/jetson_clocks.py", line 370, in initialization
May 22 14:16:11 marvel-fov-8 jtop[472573]: self._engines_list = self.show()
May 22 14:16:11 marvel-fov-8 jtop[472573]: File "/usr/local/lib/python3.8/dist-packages/jtop/core/jetson_clocks.py", line 522, in show
May 22 14:16:11 marvel-fov-8 jtop[472573]: lines = cmd(timeout=COMMAND_TIMEOUT)
May 22 14:16:11 marvel-fov-8 jtop[472573]: File "/usr/local/lib/python3.8/dist-packages/jtop/core/command.py", line 115, in __call__
May 22 14:16:11 marvel-fov-8 jtop[472573]: raise Command.CommandException('Error process:', self.process.returncode)
May 22 14:16:11 marvel-fov-8 jtop[472573]: jtop.core.command.Command.CommandException: [errno:1] Error process:
May 22 14:16:11 marvel-fov-8 jtop[472547]: [INFO] jtop.service - Service closed
May 22 14:16:11 marvel-fov-8 systemd[1]: jtop.service: Succeeded.
Most notably: May 22 14:16:10 marvel-fov-8 jtop[472547]: [WARNING] jtop.core.gpu - No NVIDIA GPU available
More simply, torch says CUDA isn’t available:
fov@marvel-fov-8:~$ python
Python 3.8.10 (default, May 26 2023, 14:05:08)
[GCC 9.4.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
>>> print("CUDA is available" if torch.cuda.is_available() else "CUDA is not available")
CUDA is not available
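For additional context, I can also scan the kernel log for GPU driver (nvgpu) messages from around the unexpected reboot; a rough sketch of what I plan to look through (the keyword list is just a guess):

import subprocess

# Pull the kernel log and print GPU-related lines; may require sudo depending
# on dmesg restrictions on the device.
log = subprocess.run(["dmesg"], capture_output=True, text=True).stdout
for line in log.splitlines():
    if any(key in line.lower() for key in ("nvgpu", "gpu", "tegra", "fail", "error")):
        print(line)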
Here is the device information taken from an earlier screenshot from jtop:
I will note in advance that we are not able to reflash the device, as it is deployed remotely in another country.
Thanks in advance for any guidance, and please let me know if there’s any more information I can provide.