TensorRT multi stream


I am trying to make inference from several threads at same time, in sync mode every thread should wait until other one done with CUDA ( via custom mutex ) otherwise its crash with memory problem
Which slow down the framerate from 60 FPS to 10~15FPS with 4 threads ( with 30~50% GPU usage ), I found out what in trtexec possible to setup stream so will be possible to inference several frame at the same time, so after hours google was made “this” but application crashing

[TRT] [E] 1: [pointWiseV2Helpers.cpp::launchPwgenKernel::280] Error Code 1: Cuda Driver (invalid resource handle)

Full code, I just tried to make more or less working example when several threads are living together and not blocking each other, working fine without cuda.Device(0).make_context() and in one thread, but dont want with and several threads

import math
import threading
import time

import tensorrt as trt  # NOQA
from collections import OrderedDict, namedtuple, deque
import numpy as np
import torch
import cv2
import pycuda.driver as cuda  # NOQA
#import pycuda.autoinit # NOQA
from PIL import Image

Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
logger = trt.Logger(trt.Logger.INFO)
trt.init_libnvinfer_plugins(logger, namespace="")

trt_runtime = trt.Runtime(logger)
trt_engine = None
with open('/root/model_v9_8.engine', 'rb') as f:
    trt_engine = trt_runtime.deserialize_cuda_engine(f.read())

streams = []

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()

    # Current NMS implementation in TRT only supports DataType.FLOAT but
    # it may change in the future, which could brake this sample here
    # when using lower precision [e.g. NMS output would not be np.float32
    # anymore, even though this is assumed in binding_to_type]
    binding_to_type = {
        "images": np.float32,
        "Input": np.float32,
        "NMS": np.float32,
        "det_scores": np.float32,
        "det_boxes": np.float32,
        "NMS_1": np.int32,
        "num_dets": np.int32,
        "det_classes": np.int32,

    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = binding_to_type[str(binding)]
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    return [out.host for out in outputs]

def load_into_numpy(image):
    (im_width, im_height) = image.size
    return np.array(image).reshape(
        (im_height, im_width, 3)

def load_frame(arr):
    image = Image.fromarray(np.uint8(arr))
    model_input_width = 640
    model_input_height = 640
    # Note: Bilinear interpolation used by Pillow is a little bit
    # different than the one used by Tensorflow, so if network receives
    # an image that is not 300x300, the network output may differ
    # from the one output by Tensorflow
    image_resized = image.resize(
        size=(model_input_width, model_input_height),
    img_np = load_into_numpy(image_resized)
    # HWC -> CHW
    img_np = img_np.transpose((2, 0, 1))
    # Normalize to [-1.0, 1.0] interval (expected by model)
    img_np = (2.0 / 255.0) * img_np - 1.0
    img_np = img_np.ravel()
    return img_np

cfx = cuda.Device(0).make_context()
inputs, outputs, bindings, stream = allocate_buffers(trt_engine)
context = trt_engine.create_execution_context()

def cam(id: int):
    global trt_engine, cfx, inputs, outputs, bindings, stream, context

    cap = cv2.VideoCapture('test_videos/double.avi')
    fps = deque([0], maxlen=100)
    while cap.isOpened():
        p1 = time.perf_counter()
        print('===' * 10)
        _, frame = cap.read()

        img = load_frame(frame)

        np.copyto(inputs[0].host, img.ravel())

        detection_out = do_inference(
        for det in detection_out:

        current_fps = int(1000 / ((time.perf_counter() - p1) * 1000))
        avg_fps = sum(fps) / len(fps)
        print(f"Stream: {id}, NOW: {current_fps} AVG: {avg_fps}")

# for id in range(0, 3):
#     threading.Thread(target=cam, args=[id], daemon=True).start()
# while True:
#     time.sleep(1)


TensorRT Version: TensorRT v8502
GPU Type: Jetson XavierNX
CUDA Version: 11.4
Operating System + Version: Ubuntu 20.04 ( Jetpack 5.1 )
Python Version (if applicable): 3.8.10


The below links might be useful for you.


For multi-threading/streaming, will suggest you to use Deepstream or TRITON

For more details, we recommend you raise the query in Deepstream forum.


raise the query in Triton Inference Server Github instance issues section.