Object detection inference problem: camera image updates, but the bounding box stays fixed to the one from the first frame

System: Jetson AGX Orin 64GB Developer Kit.
Environment: Docker container.

import os
import cv2
import time
import json
import onnx
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import tensorflow as tf
from easydict import EasyDict
import matplotlib.pyplot as plt

cfg_coco = EasyDict()
cfg_coco.idx2cls = {
    '1': 'person', '2': 'bicycle', '3': 'car', '4': 'motorcycle', '5': 'airplane',
    '6': 'bus', '7': 'train', '8': 'truck', '9': 'boat', '10': 'traffic light',
    '11': 'fire hydrant', '13': 'stop sign', '14': 'parking meter', '15': 'bench', '16': 'bird',
    '17': 'cat', '18': 'dog', '19': 'horse', '20': 'sheep', '21': 'cow',
    '22': 'elephant', '23': 'bear', '24': 'zebra', '25': 'giraffe', '27': 'backpack',
    '28': 'umbrella', '31': 'handbag', '32': 'tie', '33': 'suitcase', '34': 'frisbee',
    '35': 'skis', '36': 'snowboard', '37': 'sports ball', '38': 'kite', '39': 'baseball bat',
    '40': 'baseball glove', '41': 'skateboard', '42': 'surfboard', '43': 'tennis racket', '44': 'bottle',
    '46': 'wine glass', '47': 'cup', '48': 'fork', '49': 'knife', '50': 'spoon',
    '51': 'bowl', '52': 'banana', '53': 'apple', '54': 'sandwich', '55': 'orange',
    '56': 'broccoli', '57': 'carrot', '58': 'hot dog', '59': 'pizza', '60': 'donut',
    '61': 'cake', '62': 'chair', '63': 'couch', '64': 'potted plant', '65': 'bed',
    '67': 'dining table', '70': 'toilet', '72': 'tv', '73': 'laptop', '74': 'mouse',
    '75': 'remote', '76': 'keyboard', '77': 'cell phone', '78': 'microwave', '79': 'oven',
    '80': 'toaster', '81': 'sink', '82': 'refrigerator', '84': 'book', '85': 'clock',
    '86': 'vase', '87': 'scissors', '88': 'teddy bear', '89': 'hair drier', '90': 'toothbrush'
}
cfg_coco.n_class = len(cfg_coco.idx2cls)
cfg_coco.n_anchors = 4
cfg_coco.box_params = 4
cfg_coco.obj_params = 1
cfg_coco.pred_dim = cfg_coco.box_params + cfg_coco.obj_params + cfg_coco.n_class
cfg_coco.clip_bound = 2.
cfg_coco.scale_offset = 0.5

cfg_coco.image_size = (512, 512, 3)
cfg_coco.grid_sizes = [64, 32, 16]

cfg_coco.max_output_size = 100
cfg_coco.iou_threshold = 0.5
cfg_coco.score_threshold = 0.5

cfg = cfg_coco

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def extract_onnx_io(onnx_path):
    model = onnx.load(onnx_path)
    graph = model.graph

    def shape_from_tensor(tensor):
        return [dim.dim_value if dim.dim_value > 0 else -1
                for dim in tensor.type.tensor_type.shape.dim]

    inputs = [{"name": inp.name, "shape": shape_from_tensor(inp)}
              for inp in graph.input
              if inp.name not in {init.name for init in graph.initializer}]
    outputs = [{"name": out.name, "shape": shape_from_tensor(out)}
               for out in graph.output]
    return {"inputs": inputs, "outputs": outputs}

ENGINE_PATH = "../engine_models/OD_coco.engine"

def build_engine(onnx_path, io_info):
    if os.path.exists(ENGINE_PATH):
        print(f"Loading serialized engine from {ENGINE_PATH}")
        with open(ENGINE_PATH, "rb") as f:
            return trt.Runtime(TRT_LOGGER).deserialize_cuda_engine(f.read())
    print("Building TensorRT engine...")
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_path, 'rb') as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            raise RuntimeError("ONNX parsing failed")
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 28)  # 256 MiB
    config.set_flag(trt.BuilderFlag.FP16)  # FP16 mode
    # optimization profile: batch fixed at 1
    profile = builder.create_optimization_profile()
    inp = io_info["inputs"][0]
    shape = inp["shape"][:]
    shape[0] = 1
    profile.set_shape(inp["name"], tuple(shape), tuple(shape), tuple(shape))
    config.add_optimization_profile(profile)
    serialized = builder.build_serialized_network(network, config)
    with open(ENGINE_PATH, "wb") as f:
        f.write(serialized)
    print(f"Engine serialized to {ENGINE_PATH}")
    return trt.Runtime(TRT_LOGGER).deserialize_cuda_engine(serialized)

def allocate_buffers(engine, context):
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for idx in range(engine.num_bindings):
        name = engine.get_binding_name(idx)

        # handle dynamic shapes: input shapes are resolved on the execution context
        if engine.binding_is_input(idx):
            shape = context.get_binding_shape(idx)
        else:
            shape = engine.get_binding_shape(idx)

        dtype = trt.nptype(engine.get_binding_dtype(idx))
        size = abs(int(np.prod(shape)))
        host_mem = cuda.pagelocked_empty(size, dtype)
        dev_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(dev_mem))
        if engine.binding_is_input(idx):
            inputs.append((name, host_mem, dev_mem, tuple(shape)))
        else:
            outputs.append((name, host_mem, dev_mem, tuple(shape)))
    return inputs, outputs, bindings, stream

def do_inference(context, bindings, inputs, outputs, stream):
    for _, h, d, _ in inputs:
        cuda.memcpy_htod_async(d, h, stream)
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    for _, h, d, _ in outputs:
        cuda.memcpy_dtoh_async(h, d, stream)
    stream.synchronize()
    return {name: h.reshape(shape) for name, h, _, shape in outputs}

def decode_predictions_multiscale(preds, cfg):
    boxes = []
    for grid in cfg.grid_sizes:
        out = preds[str(grid)][0]  # (grid, grid, anchors*pred_dim)
        out = out.reshape((grid, grid, cfg.n_anchors, cfg.pred_dim))
        cell = cfg.image_size[0] / grid
        for i in range(grid):
            for j in range(grid):
                for a in range(cfg.n_anchors):
                    p = out[i, j, a]
                    score = p[4]
                    if score < cfg.score_threshold:
                        continue
                    tx, ty, tw, th = p[:4]
                    cx, cy = (j + tx) * cell, (i + ty) * cell
                    tw = np.clip(tw, -cfg.clip_bound, cfg.clip_bound)
                    th = np.clip(th, -cfg.clip_bound, cfg.clip_bound)
                    bw = (np.exp(tw) - cfg.scale_offset) * cfg.image_size[0]
                    bh = (np.exp(th) - cfg.scale_offset) * cfg.image_size[1]
                    xmin, ymin = cx - bw / 2, cy - bh / 2
                    xmax, ymax = cx + bw / 2, cy + bh / 2
                    xmin, xmax = np.clip([xmin, xmax], 0, cfg.image_size[0])
                    ymin, ymax = np.clip([ymin, ymax], 0, cfg.image_size[1])
                    if xmax <= xmin or ymax <= ymin:
                        continue
                    cls = int(np.argmax(p[5:]))
                    boxes.append([xmin, ymin, xmax, ymax, cls, score])
    return boxes

def apply_nms(boxes, cfg):
    if not boxes:
        return []
    arr = np.array(boxes, dtype=np.float32)
    b = arr[:, :4]
    scores = arr[:, 5]
    classes = arr[:, 4].astype(np.int32)
    idx, sc = tf.image.non_max_suppression_with_scores(
        boxes=b, scores=scores,
        max_output_size=cfg.max_output_size,
        iou_threshold=cfg.iou_threshold,
        score_threshold=cfg.score_threshold
    )
    idx = idx.numpy()
    return [boxes[i] for i in idx]

ONNX_PATH = "../onnx_models/OD_coco.onnx"

def main():
    io_info = extract_onnx_io(ONNX_PATH)
    print("ONNX IO:", json.dumps(io_info, indent=2))
    engine = build_engine(ONNX_PATH, io_info)

    context = engine.create_execution_context()
    context.set_binding_shape(0, (1, 512, 512, 3))  # NHWC layout
    inputs, outputs, bindings, stream = allocate_buffers(engine, context)

    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

    classes = list(cfg.idx2cls.values())
    cmap = plt.get_cmap('tab20', cfg.n_class)
    colors = [(int(c[2] * 255), int(c[1] * 255), int(c[0] * 255)) for c in cmap.colors]

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        disp_frame = frame.copy()

        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
        h0, w0 = img.shape[:2]
        img_resized = cv2.resize(img, cfg.image_size[:2][::-1])

        np.copyto(inputs[0][1], img_resized.ravel())

        start = time.time()
        trt_outs = do_inference(context, bindings, inputs, outputs, stream)
        duration = time.time() - start

        for name, arr in trt_outs.items():
            flat = arr.flatten()
            print(f"[RAW] {name} head5:", flat[:5])

        preds = {name: trt_outs[name] for name in trt_outs}
        boxes = decode_predictions_multiscale(preds, cfg)
        boxes = apply_nms(boxes, cfg)

        for xmin, ymin, xmax, ymax, cls, score in boxes:
            sx = w0 / cfg.image_size[1]
            sy = h0 / cfg.image_size[0]
            x1, y1 = int(xmin * sx), int(ymin * sy)
            x2, y2 = int(xmax * sx), int(ymax * sy)
            color = colors[int(cls)]
            cv2.rectangle(disp_frame, (x1, y1), (x2, y2), color, 2)
            txt = f"{classes[int(cls)]}:{score:.2f}"
            cv2.putText(disp_frame, txt, (x1, max(y1 - 10, 0)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

        cv2.putText(disp_frame, f"FPS: {1/duration:.1f}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 0), 2)

        cv2.imshow("TRT Object Detection", disp_frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

if __name__ == "__main__":
    main()

I am experimenting with TensorRT inference on a Jetson using the code above. I trained a custom object detector on a server and verified the results there, then converted the model to ONNX and am now running inference on the Jetson. The image captured from the USB camera changes continuously, but the detected bounding box is always the same box obtained on the first frame. I also get the Cuda Runtime (invalid resource handle) errors shown in the log below, so do_inference() does not seem to be working properly (a small isolation sketch follows the log). I have been looking at this for a few days but cannot figure out the cause. Can you help me?

[RAW] 16 head5: [ 6.3183594e-01 7.1289062e-01 -5.9179688e-01 -5.7373047e-01
4.8398972e-04]
[RAW] 32 head5: [ 8.5986328e-01 5.1562500e-01 -4.3579102e-01 -4.8779297e-01
7.5531006e-04]
[RAW] 64 head5: [ 0.6069336 0.47436523 -0.42138672 -0.39257812 0.00462723]
[05/07/2025-04:14:45] [TRT] [E] 1: [genericReformat.cuh::copyVectorizedRunKernel::1579] Error Code 1: Cuda Runtime (invalid resource handle)
[RAW] 16 head5: [ 6.3183594e-01 7.1289062e-01 -5.9179688e-01 -5.7373047e-01
4.8398972e-04]
[RAW] 32 head5: [ 8.5986328e-01 5.1562500e-01 -4.3579102e-01 -4.8779297e-01
7.5531006e-04]
[RAW] 64 head5: [ 0.6069336 0.47436523 -0.42138672 -0.39257812 0.00462723]
[05/07/2025-04:14:46] [TRT] [E] 1: [genericReformat.cuh::copyVectorizedRunKernel::1579] Error Code 1: Cuda Runtime (invalid resource handle)
[RAW] 16 head5: [ 6.3183594e-01 7.1289062e-01 -5.9179688e-01 -5.7373047e-01
4.8398972e-04]
[RAW] 32 head5: [ 8.5986328e-01 5.1562500e-01 -4.3579102e-01 -4.8779297e-01
7.5531006e-04]
[RAW] 64 head5: [ 0.6069336 0.47436523 -0.42138672 -0.39257812 0.00462723]
[05/07/2025-04:14:46] [TRT] [E] 1: [genericReformat.cuh::copyVectorizedRunKernel::1579] Error Code 1: Cuda Runtime (invalid resource handle)
[RAW] 16 head5: [ 6.3183594e-01 7.1289062e-01 -5.9179688e-01 -5.7373047e-01
4.8398972e-04]
[RAW] 32 head5: [ 8.5986328e-01 5.1562500e-01 -4.3579102e-01 -4.8779297e-01
7.5531006e-04]
[RAW] 64 head5: [ 0.6069336 0.47436523 -0.42138672 -0.39257812 0.00462723]
[05/07/2025-04:14:46] [TRT] [E] 1: [genericReformat.cuh::copyVectorizedRunKernel::1579] Error Code 1: Cuda Runtime (invalid resource handle)
[RAW] 16 head5: [ 6.3183594e-01 7.1289062e-01 -5.9179688e-01 -5.7373047e-01
4.8398972e-04]
[RAW] 32 head5: [ 8.5986328e-01 5.1562500e-01 -4.3579102e-01 -4.8779297e-01
7.5531006e-04]
[RAW] 64 head5: [ 0.6069336 0.47436523 -0.42138672 -0.39257812 0.00462723]
[05/07/2025-04:14:46] [TRT] [E] 1: [genericReformat.cuh::copyVectorizedRunKernel::1579] Error Code 1: Cuda Runtime (invalid resource handle)
[RAW] 16 head5: [ 6.3183594e-01 7.1289062e-01 -5.9179688e-01 -5.7373047e-01
4.8398972e-04]
[RAW] 32 head5: [ 8.5986328e-01 5.1562500e-01 -4.3579102e-01 -4.8779297e-01
7.5531006e-04]
[RAW] 64 head5: [ 0.6069336 0.47436523 -0.42138672 -0.39257812 0.00462723]
[05/07/2025-04:14:46] [TRT] [E] 1: [genericReformat.cuh::copyVectorizedRunKernel::1579] Error Code 1: Cuda Runtime (invalid resource handle)
[RAW] 16 head5: [ 6.3183594e-01 7.1289062e-01 -5.9179688e-01 -5.7373047e-01
4.8398972e-04]
[RAW] 32 head5: [ 8.5986328e-01 5.1562500e-01 -4.3579102e-01 -4.8779297e-01
7.5531006e-04]
[RAW] 64 head5: [ 0.6069336 0.47436523 -0.42138672 -0.39257812 0.00462723]
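
For reference, below is a minimal isolation sketch of what I expect each loop iteration to do, reusing the buffers and do_inference() from the script above. The context push/pop is only a guess at the failure mode (tf.image.non_max_suppression_with_scores may initialize TensorFlow's own CUDA context, leaving PyCUDA's context non-current for the next TensorRT call), and the checksum print is there to confirm that a fresh frame actually reaches the page-locked input buffer:

import numpy as np
import pycuda.autoinit

def run_frame(context, bindings, inputs, outputs, stream, img_resized):
    # Guard: make the PyCUDA context current for the TensorRT call, in case
    # another library (e.g. TensorFlow) has swapped in its own CUDA context.
    pycuda.autoinit.context.push()
    try:
        # Copy the fresh frame into the page-locked host input buffer.
        np.copyto(inputs[0][1], img_resized.ravel())
        # If this checksum never changes between frames, the stale-bbox
        # problem is upstream of TensorRT (capture/preprocessing).
        print("input checksum:", float(inputs[0][1].sum()))
        return do_inference(context, bindings, inputs, outputs, stream)
    finally:
        pycuda.autoinit.context.pop()

If the checksum changes every frame while the [RAW] outputs stay identical, the stale data is on the GPU side rather than in preprocessing.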

Dear @sangminsuh,
Could you please share the Docker container environment and TRT/JetPack details so we can reproduce the issue?
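
For example, a short sketch like this, run inside the container, would capture most of those details (it follows the imports already used in your script; adjust for anything not installed):

import cv2
import tensorflow as tf
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

# Library versions inside the container
print("TensorRT  :", trt.__version__)
print("TensorFlow:", tf.__version__)
print("OpenCV    :", cv2.__version__)
# CUDA driver version and GPU as reported by PyCUDA
print("CUDA driver:", cuda.get_driver_version())
print("Device     :", pycuda.autoinit.device.name())

On Jetson, the L4T/JetPack release string can also be read from /etc/nv_tegra_release.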