TensorRT inference out of memory error

Hi.
I created a TensorRT engine from an .etlt model, but I get the following error when running inference on a Jetson Nano with JetPack 4.6. Even when I build the engine with batch_size=1, I get the same error:

pycuda._driver.MemoryError: cuMemHostAlloc failed: out of memory
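
For reference, here is a minimal diagnostic sketch (assuming the same TensorRT/pycuda APIs used in the script below; the engine path my_model.engine and float32 I/O are placeholders/assumptions) that prints the free/total device memory and the page-locked host allocation each binding would request. On a Jetson the CPU and GPU share the same physical memory, so these pinned (cuMemHostAlloc) buffers compete with the engine itself:

import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # creates and activates a CUDA context

ENGINE_PATH = "my_model.engine"  # placeholder, substitute your engine file
MAX_BATCH_SIZE = 1               # assumption, match the value used at build time

trt.init_libnvinfer_plugins(None, "")
logger = trt.Logger(trt.Logger.WARNING)
with open(ENGINE_PATH, "rb") as f:
    engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())

free, total = cuda.mem_get_info()
print("Device memory: %.0f MiB free of %.0f MiB" % (free / 2**20, total / 2**20))

for binding in engine:
    shape = engine.get_binding_shape(binding)
    # Assuming float32 buffers; adjust the dtype to match your engine's bindings.
    nbytes = trt.volume(shape) * MAX_BATCH_SIZE * np.dtype(np.float32).itemsize
    print("%s %s -> %.1f MiB pinned host memory" % (binding, shape, nbytes / 2**20))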

This is my script for inference:

import tensorrt as trt
import numpy as np
from PIL import Image
import os
import cv2
import pycuda.driver as cuda
import pycuda.autoinit



class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

class TrtEngine:
    #Initializes TensorRT objects needed for model inference.
    def __init__(self, engine_path, input_height, input_width, input_channels, max_batch_size, dtype):
        
        self.engine_path = engine_path
        self.input_height = input_height
        self.input_width = input_width
        self.input_channels = input_channels
        self.dtype = dtype
        self.logger = trt.Logger(trt.Logger.VERBOSE)
        self.runtime = trt.Runtime(self.logger)
        self.engine = self.load_engine(self.runtime, self.engine_path)
        self.max_batch_size = max_batch_size
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()
        self.context = self.engine.create_execution_context()
        
        # Allocate memory for multiple usage [e.g. multiple batch inference]
        # self.context.set_binding_shape(0, (self.max_batch_size , 3, self.input_height, self.input_width))
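        # (Note: set_binding_shape() only applies to explicit-batch / dynamic-shape
        # engines; an implicit-batch engine is driven by max_batch_size instead.)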
        input_volume = trt.volume((self.input_channels, self.input_height, self.input_width))
        # Allocate the host-side batch array in the engine's input dtype
        # (np.zeros defaults to float64, which doubles the host memory footprint).
        self.numpy_array = np.zeros((self.engine.max_batch_size, input_volume), dtype=self.dtype)

                
                
    @staticmethod
    def load_engine(trt_runtime, engine_path):
        trt.init_libnvinfer_plugins(None, "")             
        with open(engine_path, 'rb') as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine
    
    def allocate_buffers(self):
        
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in self.engine:
            size = trt.volume(self.engine.get_binding_shape(binding)) * self.max_batch_size
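            # pagelocked_empty() requests page-locked host memory via cuMemHostAlloc,
            # which is the call that raises the MemoryError reported above.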
            host_mem = cuda.pagelocked_empty(size, self.dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            
            bindings.append(int(device_mem))

            if self.engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        
        return inputs, outputs, bindings, stream
       
            
    def infer_batch(self, image_paths):
        """Infers model on batch of same sized images resized to fit the model.
        Args:
            image_paths (str): paths to images, that will be packed into batch
                and fed into model
        """

        # Verify if the supplied batch size is not too big
        max_batch_size = self.engine.max_batch_size
        print("max " + str(self.engine.max_batch_size))
        actual_batch_size = len(image_paths)
        if actual_batch_size > max_batch_size:
            raise ValueError(
                "image_paths list bigger ({}) than engine max batch size ({})".format(actual_batch_size, max_batch_size))


        # Load all images to CPU...
        imgs = self._load_imgs(image_paths)
        # ...copy them into appropriate place into memory...
        # (self.inputs was returned earlier by allocate_buffers())

        # print("check for more than 1 image")
        # print(len(self.inputs))
        np.copyto(self.inputs[0].host, imgs.ravel().astype(self.dtype))
        
        # ...fetch model outputs...
        [detection_out, keep_count_out] = do_inference(
            context=self.context, bindings=self.bindings, inputs=self.inputs,
            outputs=self.outputs, stream=self.stream,
            batch_size=self.max_batch_size)
        # ...and return results.
        return detection_out, keep_count_out

    def _load_image_into_numpy_array(self, image):
        # (im_width, im_height) = image.size
        # return np.array(image).reshape(
        #     (im_height, im_width, self.input_channels)
        # ).astype(np.uint8)

        return np.array(image, dtype=self.dtype, order='C')

    def _load_imgs(self, image_paths):
        # batch_size = self.engine.max_batch_size
        for idx, image_path in enumerate(image_paths):
            img_np = self._load_img(image_path)
            self.numpy_array[idx] = img_np
        return self.numpy_array

    def _load_img(self, image_path):
        image = Image.open(image_path)
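        # Reorder channels from RGB to BGR before feeding the network.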
        r, g, b = image.split()              
        image = Image.merge('RGB', (b, g, r))

        model_input_width = self.input_width
        model_input_height = self.input_height
        # Note: Pillow's resampling differs slightly from TensorFlow's, so if
        # the network receives an image that is not 300x300, its output may
        # differ slightly from the TensorFlow output.
        image_resized = image.resize(
            size=(model_input_width, model_input_height),
            resample=Image.BICUBIC
        )
        img_np = self._load_image_into_numpy_array(image_resized)
        # HWC -> CHW
        img_np = img_np.transpose((2, 0, 1))
        # Normalize to [-1.0, 1.0] interval (expected by model)
        # img_np = img_np / 255.0
        img_np = (2.0 / 255.0) * img_np - 1.0
        img_np = img_np.ravel()
        return img_np

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    inference = context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs  
    return [out.host for out in outputs]

Hi @p.vahidinia,

We hope the following similar issue may help you. If you need further assistance, we recommend posting your question on the related TAO Toolkit (TLT) forum to get better help.

Thank you.