How do I run inference on a TensorRT .plan model using Python?


Whenever we try to run inference with our model, it fails. Something along the lines of allocating buffers or streams fails.

import tensorrt as trt
import numpy as np
import os

import pycuda.driver as cuda
import pycuda.autoinit

class HostDeviceMem:
    """Pair a host-side buffer with its matching device-side allocation.

    Attributes are stored exactly as given; this class performs no allocation
    or copying itself.
    """

    def __init__(self, host_mem, device_mem):
        """Store the host buffer and the device allocation side by side."""
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        """Return a two-section, human-readable dump of both buffers."""
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return self.__str__()

class TrtModel:
    """Callable wrapper around a serialized TensorRT engine (``.plan`` file).

    On construction the engine is deserialized, one host/device buffer pair is
    allocated per engine binding, and an execution context is created.
    Calling the instance copies the input to the device, runs the engine on a
    CUDA stream and returns the outputs as NumPy arrays.

    NOTE(review): this uses the binding-based TensorRT API
    (``get_binding_shape``, ``binding_is_input``, ``execute_async`` with a
    ``batch_size``). Presumably that targets implicit-batch engines on older
    TensorRT releases; confirm against the installed TensorRT version.
    """

    def __init__(self, engine_path, max_batch_size=1, dtype=np.float32):
        """Load the engine and allocate all buffers.

        Args:
            engine_path: Path to the serialized engine file.
            max_batch_size: Multiplier applied to every binding's volume when
                sizing its buffers.
            dtype: NumPy dtype used for every host buffer and for casting the
                input in ``__call__``.

        Raises:
            RuntimeError: If the engine file cannot be deserialized.
        """
        self.engine_path = engine_path
        self.dtype = dtype
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.runtime = trt.Runtime(self.logger)
        self.engine = self.load_engine(self.runtime, self.engine_path)
        if self.engine is None:
            # Fail here with a clear message instead of a cryptic error later
            # when the buffers are allocated from a missing engine.
            raise RuntimeError(
                f"Failed to deserialize TensorRT engine from {engine_path!r}"
            )
        self.max_batch_size = max_batch_size
        # allocate_buffers() returns four values; the stream must be kept
        # because __call__ issues every copy and the execution on it.
        (
            self.inputs,
            self.outputs,
            self.bindings,
            self.stream,
        ) = self.allocate_buffers()
        self.context = self.engine.create_execution_context()

    @staticmethod
    def load_engine(trt_runtime, engine_path):
        """Read ``engine_path`` and deserialize it with ``trt_runtime``.

        Declared static because it is invoked as
        ``self.load_engine(runtime, path)`` and takes no ``self``.

        Returns:
            Whatever ``trt_runtime.deserialize_cuda_engine`` returns.
        """
        # Register the bundled TensorRT plugins before deserializing.
        trt.init_libnvinfer_plugins(None, "")
        with open(engine_path, "rb") as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine

    def allocate_buffers(self):
        """Allocate a page-locked host buffer and a device buffer per binding.

        Returns:
            A tuple ``(inputs, outputs, bindings, stream)`` where ``inputs``
            and ``outputs`` are lists of ``HostDeviceMem``, ``bindings`` is
            the list of device addresses (as ``int``) in engine binding order,
            and ``stream`` is a new ``cuda.Stream``.
        """
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in self.engine:
            shape = self.engine.get_binding_shape(binding)
            size = trt.volume(shape) * self.max_batch_size
            host_mem = cuda.pagelocked_empty(size, self.dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)

            # The execution call takes raw device addresses, one per binding.
            bindings.append(int(device_mem))

            if self.engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream

    def __call__(self, x: np.ndarray, batch_size=2):
        """Run inference on ``x`` and return one array per output binding.

        Args:
            x: Input data; cast to ``self.dtype`` and flattened into the first
                input's host buffer. Its element count must be compatible
                with that buffer (``np.copyto`` raises otherwise).
            batch_size: Batch size passed to the execution context and used
                as the leading dimension of every returned array.

        Returns:
            A list with, for each output, its host buffer reshaped to
            ``(batch_size, -1)``.
        """
        x = x.astype(self.dtype)
        # Only the first input binding is fed from ``x``.
        np.copyto(self.inputs[0].host, x.ravel())

        for inp in self.inputs:
            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)

        self.context.execute_async(
            batch_size=batch_size,
            bindings=self.bindings,
            stream_handle=self.stream.handle,
        )

        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)

        # All work above is asynchronous; wait before reading host buffers.
        self.stream.synchronize()
        return [out.host.reshape(batch_size, -1) for out in self.outputs]

if __name__ == "__main__":
    # Smoke run: load the engine and push one random batch through it.
    batch_size = 1
    trt_engine_path = "model.plan"
    model = TrtModel(trt_engine_path)

    # Take binding 0's shape and swap its leading dimension for batch_size.
    shape = model.engine.get_binding_shape(0)
    input_dims = (batch_size, *shape[1:])

    # Random integers in [0, 255) scaled by 1/255.
    data = np.random.randint(0, 255, input_dims) / 255
    result = model(data, batch_size)


TensorRT Version:
GPU Type: RTX 3090
Nvidia Driver Version: 515.86.01
NVIDIA-SMI 515.86.01 Driver Version: 515.86.01 CUDA Version: 11.7
CUDA Version:
CUDNN Version:
Operating System + Version: Ubuntu 20.04
Python Version (if applicable): 3.9
TensorFlow Version (if applicable):
PyTorch Version (if applicable):
Baremetal or Container (if container which image + tag):

Relevant Files

Please attach or include links to any models, data, files, or scripts necessary to reproduce your issue. (Github repo, Google Drive, Dropbox, etc.)

Steps To Reproduce

Please include:

  • Exact steps/commands to build your repro
  • Exact steps/commands to run your repro
  • Full traceback of errors encountered

Hi @mahmoud_saad ,
Could you please share more details with us,
such as the error logs, the ONNX model, and a script to reproduce the issue, so that we can assist you better?