System: NVIDIA Jetson AGX Orin 64GB Developer Kit.
Environment: Docker container.
import os
import cv2
import time
import json
import onnx
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import tensorflow as tf
from easydict import EasyDict
import numpy as np
import matplotlib.pyplot as plt
# Detector / decoding configuration for the COCO model.
cfg_coco = EasyDict()

# Category id (as a string) -> class name. The ids are not contiguous
# (e.g. 12, 26, 29, 30, 45 are absent), so the 80 entries must be addressed
# by insertion order when indexing with a 0-based class index.
cfg_coco.idx2cls = {
    '1': 'person', '2': 'bicycle', '3': 'car', '4': 'motorcycle', '5': 'airplane',
    '6': 'bus', '7': 'train', '8': 'truck', '9': 'boat', '10': 'traffic light',
    '11': 'fire hydrant', '13': 'stop sign', '14': 'parking meter', '15': 'bench', '16': 'bird',
    '17': 'cat', '18': 'dog', '19': 'horse', '20': 'sheep', '21': 'cow',
    '22': 'elephant', '23': 'bear', '24': 'zebra', '25': 'giraffe', '27': 'backpack',
    '28': 'umbrella', '31': 'handbag', '32': 'tie', '33': 'suitcase', '34': 'frisbee',
    '35': 'skis', '36': 'snowboard', '37': 'sports ball', '38': 'kite', '39': 'baseball bat',
    '40': 'baseball glove', '41': 'skateboard', '42': 'surfboard', '43': 'tennis racket', '44': 'bottle',
    '46': 'wine glass', '47': 'cup', '48': 'fork', '49': 'knife', '50': 'spoon',
    '51': 'bowl', '52': 'banana', '53': 'apple', '54': 'sandwich', '55': 'orange',
    '56': 'broccoli', '57': 'carrot', '58': 'hot dog', '59': 'pizza', '60': 'donut',
    '61': 'cake', '62': 'chair', '63': 'couch', '64': 'potted plant', '65': 'bed',
    '67': 'dining table', '70': 'toilet', '72': 'tv', '73': 'laptop', '74': 'mouse',
    '75': 'remote', '76': 'keyboard', '77': 'cell phone', '78': 'microwave', '79': 'oven',
    '80': 'toaster', '81': 'sink', '82': 'refrigerator', '84': 'book', '85': 'clock',
    '86': 'vase', '87': 'scissors', '88': 'teddy bear', '89': 'hair drier', '90': 'toothbrush',
}
cfg_coco.n_class = len(cfg_coco.idx2cls)

# Layout of one prediction vector: 4 box terms, 1 objectness score, then
# one score per class.
cfg_coco.n_anchors = 4
cfg_coco.box_params = 4
cfg_coco.obj_params = 1
cfg_coco.pred_dim = cfg_coco.box_params + cfg_coco.obj_params + cfg_coco.n_class

# Box decoding constants: raw width/height terms are clipped to
# [-clip_bound, clip_bound] and scale_offset is subtracted from their exp().
cfg_coco.clip_bound = 2.
cfg_coco.scale_offset = 0.5

cfg_coco.image_size = (512, 512, 3)
# One output head per grid size; heads are looked up by str(grid_size).
cfg_coco.grid_sizes = [64, 32, 16]

# Non-maximum suppression settings.
cfg_coco.max_output_size = 100
cfg_coco.iou_threshold = 0.5
cfg_coco.score_threshold = 0.5

cfg = cfg_coco
# Module-wide TensorRT logger, created with the WARNING severity level and
# shared by the builder, ONNX parser and runtime below.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
def extract_onnx_io(onnx_path):
    """Return the names and shapes of an ONNX model's inputs and outputs.

    Args:
        onnx_path: Path of the ONNX file to load.

    Returns:
        ``{"inputs": [...], "outputs": [...]}`` where every entry is
        ``{"name": str, "shape": list[int]}``. Dimensions whose ``dim_value``
        is not positive (symbolic or unknown) are reported as ``-1``.
    """
    graph = onnx.load(onnx_path).graph
    # Graph inputs may also list initializers (weights); build the name set
    # once so those can be filtered out without rescanning per input.
    initializer_names = {init.name for init in graph.initializer}

    def shape_from_tensor(tensor):
        return [dim.dim_value if dim.dim_value > 0 else -1
                for dim in tensor.type.tensor_type.shape.dim]

    inputs = [{"name": inp.name, "shape": shape_from_tensor(inp)}
              for inp in graph.input
              if inp.name not in initializer_names]
    outputs = [{"name": out.name, "shape": shape_from_tensor(out)}
               for out in graph.output]
    return {"inputs": inputs, "outputs": outputs}
# Location of the serialized TensorRT engine used as an on-disk cache.
ENGINE_PATH = "../engine_models/OD_coco.engine"
def build_engine(onnx_path, io_info):
    """Load the cached TensorRT engine, or build one from the ONNX model.

    If ``ENGINE_PATH`` exists it is deserialized and returned as-is, so the
    file must be deleted manually after the ONNX model changes. Otherwise the
    ONNX file is parsed, an FP16 engine with a single optimization profile
    (first input, batch size 1) is built, written to ``ENGINE_PATH`` and
    returned.

    Args:
        onnx_path: Path of the ONNX model.
        io_info: Result of ``extract_onnx_io``; only the first input is used.

    Returns:
        The deserialized TensorRT engine.

    Raises:
        RuntimeError: If ONNX parsing or the engine build fails.
        ValueError: If the first input still has unknown non-batch dimensions.
    """
    if os.path.exists(ENGINE_PATH):
        print(f"Loading serialized engine from {ENGINE_PATH}")
        with open(ENGINE_PATH, "rb") as f:
            return trt.Runtime(TRT_LOGGER).deserialize_cuda_engine(f.read())

    print("Building TensorRT engine…")
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_path, 'rb') as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            raise RuntimeError("ONNX parsing failed")

    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 28)  # 256MiB
    config.set_flag(trt.BuilderFlag.FP16)  # mode

    # Optimization profile: batch=1 fixed (min == opt == max).
    profile = builder.create_optimization_profile()
    inp = io_info["inputs"][0]
    shape = inp["shape"][:]
    shape[0] = 1
    if any(dim < 1 for dim in shape):
        # Only the batch dimension is pinned here; any other unknown (-1)
        # dimension cannot be turned into a valid profile shape.
        raise ValueError(
            f"Input {inp['name']!r} has unresolved dimensions: {shape}")
    profile.set_shape(inp["name"], tuple(shape), tuple(shape), tuple(shape))
    config.add_optimization_profile(profile)

    serialized = builder.build_serialized_network(network, config)
    if serialized is None:
        # Without this check a failed build would be written to disk as the
        # cache and fail later in a less obvious place.
        raise RuntimeError("TensorRT engine build failed")

    engine_dir = os.path.dirname(ENGINE_PATH)
    if engine_dir:
        os.makedirs(engine_dir, exist_ok=True)
    with open(ENGINE_PATH, "wb") as f:
        f.write(serialized)
    print(f"Engine serialized to {ENGINE_PATH}")
    return trt.Runtime(TRT_LOGGER).deserialize_cuda_engine(serialized)
def allocate_buffers(engine, context):
    """Allocate one host/device buffer pair for every engine binding.

    Call this after the input shape has been set on ``context`` so the
    buffer sizes are computed from the shapes the context reports.

    Args:
        engine: TensorRT engine whose bindings are enumerated.
        context: Execution context created from ``engine``.

    Returns:
        ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of ``(name, host_mem, dev_mem, shape)`` tuples,
        ``bindings`` holds the device addresses in binding order and
        ``stream`` is a new CUDA stream.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for idx in range(engine.num_bindings):
        name = engine.get_binding_name(idx)
        # Dynamic shape handling: query the context for inputs AND outputs;
        # the engine itself reports -1 for dynamic dimensions.
        shape = tuple(context.get_binding_shape(idx))
        dtype = trt.nptype(engine.get_binding_dtype(idx))
        # abs() keeps the size positive should a single dimension still be
        # reported as -1 (same fallback the buffers relied on before).
        size = abs(int(np.prod(shape)))
        host_mem = cuda.pagelocked_empty(size, dtype)
        dev_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(dev_mem))
        # dev_mem is kept in the returned tuple so the allocation stays
        # referenced for as long as the caller holds the buffers.
        record = (name, host_mem, dev_mem, shape)
        if engine.binding_is_input(idx):
            inputs.append(record)
        else:
            outputs.append(record)
    return inputs, outputs, bindings, stream
def do_inference(context, bindings, inputs, outputs, stream):
    """Run one inference pass and return the outputs keyed by binding name.

    Args:
        context: TensorRT execution context.
        bindings: Device addresses in binding order.
        inputs: ``(name, host_mem, dev_mem, shape)`` tuples; ``host_mem``
            must already contain the input data.
        outputs: ``(name, host_mem, dev_mem, shape)`` tuples to fill.
        stream: CUDA stream on which copies and execution are enqueued.

    Returns:
        ``{name: host_mem.reshape(shape)}`` for every output. The arrays are
        views of the reusable host buffers and are overwritten by the next
        call.

    Raises:
        RuntimeError: If TensorRT reports that the inference could not be
            enqueued.
    """
    for _, host, dev, _ in inputs:
        cuda.memcpy_htod_async(dev, host, stream)

    ok = context.execute_async_v2(bindings=bindings,
                                  stream_handle=stream.handle)
    if not ok:
        # Ignoring this result would return whatever the previous frame left
        # in the host buffers, i.e. the same detections for every frame.
        stream.synchronize()
        raise RuntimeError(
            "TensorRT execute_async_v2 failed; see the TensorRT log")

    for _, host, dev, _ in outputs:
        cuda.memcpy_dtoh_async(host, dev, stream)
    stream.synchronize()
    return {name: host.reshape(shape) for name, host, _, shape in outputs}
def decode_predictions_multiscale(preds, cfg):
    """Decode the raw multi-scale head outputs into candidate boxes.

    Args:
        preds: Mapping from ``str(grid_size)`` to the raw output of that
            head; element ``[0]`` of each array is reshaped to
            ``(grid, grid, cfg.n_anchors, cfg.pred_dim)``.
        cfg: Configuration providing ``grid_sizes``, ``n_anchors``,
            ``pred_dim``, ``image_size``, ``clip_bound``, ``scale_offset``
            and ``score_threshold``.

    Returns:
        A list of ``[xmin, ymin, xmax, ymax, class_index, score]`` entries in
        network-input pixel coordinates; ``score`` is element 4 of the
        prediction vector and ``class_index`` the argmax of elements 5 onward.
    """
    boxes = []
    for grid in cfg.grid_sizes:
        out = preds[str(grid)][0]  # (grid, grid, anchors * pred_dim)
        out = out.reshape((grid, grid, cfg.n_anchors, cfg.pred_dim))
        cell = cfg.image_size[0] / grid

        # Visit only the anchors that pass the score test instead of looping
        # over every cell in Python. `~(x < t)` mirrors the scalar
        # `if score < t: continue` exactly; argwhere yields (row, col, anchor)
        # in the same row-major order as nested loops would.
        passing = ~(out[..., 4] < cfg.score_threshold)
        for i, j, a in np.argwhere(passing):
            i, j = int(i), int(j)
            p = out[i, j, a]
            score = p[4]
            tx, ty, tw, th = p[:4]
            cx, cy = (j + tx) * cell, (i + ty) * cell
            tw = np.clip(tw, -cfg.clip_bound, cfg.clip_bound)
            th = np.clip(th, -cfg.clip_bound, cfg.clip_bound)
            bw = (np.exp(tw) - cfg.scale_offset) * cfg.image_size[0]
            bh = (np.exp(th) - cfg.scale_offset) * cfg.image_size[1]
            xmin, ymin = cx - bw / 2, cy - bh / 2
            xmax, ymax = cx + bw / 2, cy + bh / 2
            xmin, xmax = np.clip([xmin, xmax], 0, cfg.image_size[0])
            ymin, ymax = np.clip([ymin, ymax], 0, cfg.image_size[1])
            if xmax <= xmin or ymax <= ymin:
                continue  # box collapsed after clipping to the image
            cls = int(np.argmax(p[5:]))
            boxes.append([xmin, ymin, xmax, ymax, cls, score])
    return boxes
def apply_nms(boxes, cfg):
    """Apply class-agnostic greedy non-maximum suppression.

    Implemented with NumPy only, so no TensorFlow op runs in the inference
    loop.
    NOTE(review): executing a TensorFlow GPU op in the same process as the
    PyCUDA-managed CUDA context is a likely cause of TensorRT's
    "invalid resource handle" errors on later frames - confirm on the device.

    Args:
        boxes: List of ``[xmin, ymin, xmax, ymax, class_index, score]``.
        cfg: Configuration providing ``max_output_size``, ``iou_threshold``
            and ``score_threshold``.

    Returns:
        The kept entries of ``boxes`` in descending score order; an empty
        list when ``boxes`` is empty (never ``None``, so callers can iterate
        the result directly).
    """
    if not boxes:
        return []

    arr = np.asarray(boxes, dtype=np.float32)
    x1, y1, x2, y2 = arr[:, 0], arr[:, 1], arr[:, 2], arr[:, 3]
    scores = arr[:, 5]
    areas = (x2 - x1) * (y2 - y1)

    # Only boxes scoring strictly above the threshold compete, best first.
    candidates = np.flatnonzero(scores > cfg.score_threshold)
    order = candidates[np.argsort(-scores[candidates], kind="stable")]

    keep = []
    while order.size and len(keep) < cfg.max_output_size:
        best, rest = order[0], order[1:]
        keep.append(int(best))

        inter_w = np.maximum(
            0.0, np.minimum(x2[best], x2[rest]) - np.maximum(x1[best], x1[rest]))
        inter_h = np.maximum(
            0.0, np.minimum(y2[best], y2[rest]) - np.maximum(y1[best], y1[rest]))
        inter = inter_w * inter_h
        union = areas[best] + areas[rest] - inter
        # Degenerate boxes (zero union) are treated as non-overlapping.
        iou = np.divide(inter, union, out=np.zeros_like(inter),
                        where=union > 0)

        # Drop everything overlapping the winner by more than the threshold.
        order = rest[iou <= cfg.iou_threshold]

    return [boxes[i] for i in keep]
# Location of the exported ONNX detector model.
ONNX_PATH = "../onnx_models/OD_coco.onnx"
def main():
    """Run live object detection on camera 0 with the TensorRT engine.

    Builds or loads the engine, allocates the I/O buffers, then for every
    captured frame: converts it to RGB float in [0, 1], resizes it to the
    network input size, runs inference, decodes and suppresses the boxes,
    and draws them together with an FPS figure (inference time only).
    Press ``q`` in the window to quit.
    """
    io_info = extract_onnx_io(ONNX_PATH)
    print("ONNX IO:", json.dumps(io_info, indent=2))
    engine = build_engine(ONNX_PATH, io_info)
    context = engine.create_execution_context()
    # Fix the input shape (NHWC layout) before the buffers are sized.
    context.set_binding_shape(0, (1, *cfg.image_size))
    inputs, outputs, bindings, stream = allocate_buffers(engine, context)

    classes = list(cfg.idx2cls.values())
    cmap = plt.get_cmap('tab20', cfg.n_class)
    # Colormap entries are RGB(A) in [0, 1]; OpenCV expects BGR in [0, 255].
    colors = [(int(c[2] * 255), int(c[1] * 255), int(c[0] * 255))
              for c in cmap.colors]
    net_h, net_w = cfg.image_size[:2]

    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            disp_frame = frame.copy()
            img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
            h0, w0 = img.shape[:2]
            # cv2.resize takes (width, height), hence the reversed order.
            img_resized = cv2.resize(img, (net_w, net_h))
            np.copyto(inputs[0][1], img_resized.ravel())

            start = time.time()
            trt_outs = do_inference(context, bindings, inputs, outputs, stream)
            duration = time.time() - start

            for name, arr in trt_outs.items():
                flat = arr.flatten()
                print(f"[RAW] {name} head5:", flat[:5])

            boxes = decode_predictions_multiscale(trt_outs, cfg)
            # Tolerate an NMS implementation that returns None for "no boxes".
            boxes = apply_nms(boxes, cfg) or []

            # Scale factors from network-input pixels back to frame pixels.
            sx = w0 / net_w
            sy = h0 / net_h
            for xmin, ymin, xmax, ymax, cls, score in boxes:
                x1, y1 = int(xmin * sx), int(ymin * sy)
                x2, y2 = int(xmax * sx), int(ymax * sy)
                color = colors[int(cls)]
                cv2.rectangle(disp_frame, (x1, y1), (x2, y2), color, 2)
                txt = f"{classes[int(cls)]}:{score:.2f}"
                cv2.putText(disp_frame, txt, (x1, max(y1 - 10, 0)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

            # Guard against a zero reading from a coarse clock.
            fps = 1 / max(duration, 1e-9)
            cv2.putText(disp_frame, f"FPS: {fps:.1f}", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 0), 2)
            cv2.imshow("TRT Object Detection", disp_frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the camera and windows even if inference raises.
        cap.release()
        cv2.destroyAllWindows()
# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
I am experimenting with TensorRT inference on a Jetson using the code above. I trained a custom object detection model on a server and verified its results there; after converting the model to ONNX, I am now running inference on the Jetson. The image obtained from the USB camera changes continuously, but the detected bounding box is always the same box that was obtained for the first frame. After that, I get a "Cuda Runtime (invalid resource handle)" error, as shown below, so do_inference() is not working properly. I have been looking at this for a few days, but I can't figure out the cause. Can you help me?
[RAW] 16 head5: [ 6.3183594e-01 7.1289062e-01 -5.9179688e-01 -5.7373047e-01
4.8398972e-04]
[RAW] 32 head5: [ 8.5986328e-01 5.1562500e-01 -4.3579102e-01 -4.8779297e-01
7.5531006e-04]
[RAW] 64 head5: [ 0.6069336 0.47436523 -0.42138672 -0.39257812 0.00462723]
[05/07/2025-04:14:45] [TRT] [E] 1: [genericReformat.cuh::copyVectorizedRunKernel::1579] Error Code 1: Cuda Runtime (invalid resource handle)
[RAW] 16 head5: [ 6.3183594e-01 7.1289062e-01 -5.9179688e-01 -5.7373047e-01
4.8398972e-04]
[RAW] 32 head5: [ 8.5986328e-01 5.1562500e-01 -4.3579102e-01 -4.8779297e-01
7.5531006e-04]
[RAW] 64 head5: [ 0.6069336 0.47436523 -0.42138672 -0.39257812 0.00462723]
[05/07/2025-04:14:46] [TRT] [E] 1: [genericReformat.cuh::copyVectorizedRunKernel::1579] Error Code 1: Cuda Runtime (invalid resource handle)
[RAW] 16 head5: [ 6.3183594e-01 7.1289062e-01 -5.9179688e-01 -5.7373047e-01
4.8398972e-04]
[RAW] 32 head5: [ 8.5986328e-01 5.1562500e-01 -4.3579102e-01 -4.8779297e-01
7.5531006e-04]
[RAW] 64 head5: [ 0.6069336 0.47436523 -0.42138672 -0.39257812 0.00462723]
[05/07/2025-04:14:46] [TRT] [E] 1: [genericReformat.cuh::copyVectorizedRunKernel::1579] Error Code 1: Cuda Runtime (invalid resource handle)
[RAW] 16 head5: [ 6.3183594e-01 7.1289062e-01 -5.9179688e-01 -5.7373047e-01
4.8398972e-04]
[RAW] 32 head5: [ 8.5986328e-01 5.1562500e-01 -4.3579102e-01 -4.8779297e-01
7.5531006e-04]
[RAW] 64 head5: [ 0.6069336 0.47436523 -0.42138672 -0.39257812 0.00462723]
[05/07/2025-04:14:46] [TRT] [E] 1: [genericReformat.cuh::copyVectorizedRunKernel::1579] Error Code 1: Cuda Runtime (invalid resource handle)
[RAW] 16 head5: [ 6.3183594e-01 7.1289062e-01 -5.9179688e-01 -5.7373047e-01
4.8398972e-04]
[RAW] 32 head5: [ 8.5986328e-01 5.1562500e-01 -4.3579102e-01 -4.8779297e-01
7.5531006e-04]
[RAW] 64 head5: [ 0.6069336 0.47436523 -0.42138672 -0.39257812 0.00462723]
[05/07/2025-04:14:46] [TRT] [E] 1: [genericReformat.cuh::copyVectorizedRunKernel::1579] Error Code 1: Cuda Runtime (invalid resource handle)
[RAW] 16 head5: [ 6.3183594e-01 7.1289062e-01 -5.9179688e-01 -5.7373047e-01
4.8398972e-04]
[RAW] 32 head5: [ 8.5986328e-01 5.1562500e-01 -4.3579102e-01 -4.8779297e-01
7.5531006e-04]
[RAW] 64 head5: [ 0.6069336 0.47436523 -0.42138672 -0.39257812 0.00462723]
[05/07/2025-04:14:46] [TRT] [E] 1: [genericReformat.cuh::copyVectorizedRunKernel::1579] Error Code 1: Cuda Runtime (invalid resource handle)
[RAW] 16 head5: [ 6.3183594e-01 7.1289062e-01 -5.9179688e-01 -5.7373047e-01
4.8398972e-04]
[RAW] 32 head5: [ 8.5986328e-01 5.1562500e-01 -4.3579102e-01 -4.8779297e-01
7.5531006e-04]
[RAW] 64 head5: [ 0.6069336 0.47436523 -0.42138672 -0.39257812 0.00462723]