FPS is not increasing while running segmentation inference with TensorRT: I am getting only 1 frame per 2 seconds, and I need 2 FPS.

import cv2
import numpy as np
from PIL import Image
import tensorrt as trt
from torchvision import transforms
import labels # from cityscapes evaluation script
import engine as eng
import time
import inference as inf
#import keras
import argparse #import skimage.transform
import pycuda.driver as cuda
#import pycuda.autoinit

# Create a CUDA context on GPU 0 by hand (the pycuda.autoinit import is commented out).
# NOTE(review): no matching cuda_ctx.pop() is visible in this file — confirm it is released.
cuda_device = cuda.Device(0)
cuda_ctx = cuda_device.make_context()

# TensorRT logger (warnings and above) and the runtime used to load the engine.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt_runtime = trt.Runtime(TRT_LOGGER)
# Default frame size; main() shadows these with args.height / args.width.
HEIGHT = 720
WIDTH = 1280
# Timestamps the capture loop uses to compute the per-frame FPS.
new_frame = 0
prev_frame = 0

def preprocess(image, mean=(0.0, 0.0, 0.0), stddev=(1.0, 1.0, 1.0)):
    """Scale an HWC image to [0, 1], normalize per channel and return it as CHW.

    Args:
        image: Array-like with the channel on axis 2.
        mean: Per-channel mean subtracted after scaling. The default leaves
            the scaled values unchanged.
        stddev: Per-channel divisor applied after the mean subtraction. The
            default leaves the values unchanged.

    Returns:
        A float32 array with axis 2 of the input moved to axis 0.
    """
    mean = np.asarray(mean, dtype='float32')
    stddev = np.asarray(stddev, dtype='float32')
    # Mean normalization on the [0, 1]-scaled pixels.
    data = (np.asarray(image).astype('float32') / float(255.0) - mean) / stddev
    return np.moveaxis(data, 2, 0)

def rescale_image(image, output_shape, order=1):
    """Return *image* unchanged.

    Placeholder: ``output_shape`` and ``order`` are accepted but ignored, so
    the caller must already supply frames at the size the network expects.
    """
    return image

def main(args, frame, stream, context):
    """Segment one video frame and return the red overlay and the binary mask.

    Reads the module-level ``engine``, ``h_input``, ``d_input``, ``h_output``
    and ``d_output`` that the script entry point creates before the capture
    loop starts.

    Args:
        args: Parsed command-line arguments; ``height`` and ``width`` are read.
        frame: The frame to segment.
            NOTE(review): assumed to be an (height, width, 3) BGR image as
            returned by ``cv2.VideoCapture.read`` - confirm.
        stream: CUDA stream forwarded to ``inf.do_inference``.
        context: TensorRT execution context forwarded to ``inf.do_inference``.

    Returns:
        ``(sp_im, out_mask)``: a copy of the frame with the thresholded pixels
        painted ``(0, 0, 255)``, and a uint8 mask that is 255 on pixels above
        the threshold and 0 elsewhere.
    """
    HEIGHT = args.height
    WIDTH = args.width

    img = rescale_image(frame, (WIDTH, HEIGHT), order=1)
    im = np.array(img, dtype=np.float32, order='C')

    # NOTE(review): ``context`` is passed as the 8th positional argument; this
    # must match the signature of the ``inference`` module actually imported.
    out = inf.do_inference(engine, im, h_input, d_input, h_output, d_output,
                           stream, context, 1, HEIGHT, WIDTH)

    # NOTE(review): assumes a single-class output, so the first (H, W) plane of
    # the network output is the score map - confirm against the model.
    prob = np.asarray(out).reshape(-1, HEIGHT, WIDTH)[0]

    # Keep pixels scoring above 65% of the strongest response.
    thr = out.max() * 0.65

    # Same rule as cv2.threshold(out * 255, int(thr * 255), 255, THRESH_BINARY):
    # a pixel is set when its scaled score is strictly above the threshold.
    overlay = prob * 255 > int(thr * 255)

    # The original never created sp_im / out_mask before indexing them;
    # presumably the overlay is drawn on the input frame - verify.
    sp_im = np.array(img, copy=True)
    sp_im[overlay] = (0, 0, 255)

    out_mask = np.zeros((HEIGHT, WIDTH), dtype=np.uint8)
    out_mask[prob > thr] = 255

    return sp_im, out_mask

if __name__ == "__main__":
    # Script entry point: load the engine once, then segment every second
    # frame of the input video and write the overlay to the output video.
    parser = argparse.ArgumentParser()
    parser.add_argument('--engine_file', type=str)
    parser.add_argument('--hdf5_file', type=str)
    parser.add_argument('--height', type=int, default=720)
    parser.add_argument('--width', type=int, default=1280)
    args = parser.parse_args()

    serialized_plan_fp32 = args.engine_file
    output_vid = "output_rail.mp4"
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    # Same as the previously hard-coded (1280, 720) with the default arguments.
    vid_shape = (args.width, args.height)
    out = cv2.VideoWriter(output_vid, fourcc, 20.0, vid_shape, True)
    cap = cv2.VideoCapture("trim.mp4")

    # Everything below is created ONCE, outside the frame loop: deserializing
    # the engine, allocating buffers and creating the execution context are
    # the expensive steps and must not be repeated per frame.
    engine = eng.load_engine(trt_runtime, serialized_plan_fp32)
    h_input, d_input, h_output, d_output, stream = inf.allocate_buffers(engine, 1, trt.float32)
    context = engine.create_execution_context()

    k = 0  # frame counter; only every second frame is segmented
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # End of stream or read error: stop instead of spinning forever.
                break

            new_frame = time.time()
            elapsed = new_frame - prev_frame
            fps = 1 / elapsed if elapsed > 0 else 0.0
            prev_frame = new_frame

            if k % 2 == 0:
                # NOTE(review): the body of this branch is cut off in the
                # original post; running main() and writing the overlay is the
                # presumed intent - confirm.
                sp_im, out_mask = main(args, frame, stream, context)
                out.write(sp_im)
            k += 1
    finally:
        cap.release()
        out.release()
        # Drop the TensorRT objects while the CUDA context is still current,
        # then pop the context pushed by make_context() at import time.
        del context, engine
        cuda_ctx.pop()

A couple of suggestions: NVIDIA DALI can convert to FP32, normalize, and rescale/resize your input image in GPU memory in preparation for inference with TensorRT. Here are some tutorials on the various DALI image-manipulation routines in Python: https://docs.nvidia.com/deeplearning/dali/user-guide/docs/examples/general/normalize.html DALI could also be used to threshold the image and convert it to 8-bit on the back end of the inference operation.

The NVIDIA DeepStream SDK is made to handle this entire pipeline of inference operations on the GPU with the highest performance. For an explanation of the DeepStream pipeline, see https://youtu.be/hSegX0P170s

Preprocessing of the image is not a concern.
Here is my TensorRT inference script.
With this script, inference on one frame takes 1.5 s, which means roughly 0.5 FPS. I want it to have a better FPS.
I'm sharing my script below.

In particular, the line `context.execute(batch_size=1, bindings=[int(d_input_1), int(d_output)])`
takes 1.4 s to run.

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np

def allocate_buffers(engine, batch_size, data_type):
    """Allocate host/device buffers and a CUDA stream for one input and one output.

    Binding 0 of the engine is treated as the input and binding 1 as the
    output.

    Args:
        engine: The TensorRT engine (the object, not a path); its binding
            shapes determine the buffer sizes.
        batch_size: The batch size for execution time.
        data_type: The type of the data for input and output, for example
            ``trt.float32``.

    Returns:
        A tuple ``(h_input_1, d_input_1, h_output, d_output, stream)``:
        input in the host, input in the device, output in the host, output
        in the device, and the CUDA stream.
    """
    host_dtype = trt.nptype(data_type)

    # Determine dimensions and create page-locked memory buffers (which won't
    # be swapped to disk) to hold host inputs/outputs.
    h_input_1 = cuda.pagelocked_empty(
        batch_size * trt.volume(engine.get_binding_shape(0)), dtype=host_dtype)
    h_output = cuda.pagelocked_empty(
        batch_size * trt.volume(engine.get_binding_shape(1)), dtype=host_dtype)

    # Allocate device memory for inputs and outputs.
    d_input_1 = cuda.mem_alloc(h_input_1.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)

    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()
    return h_input_1, d_input_1, h_output, d_output, stream

def load_images_to_buffer(pics, pagelocked_buffer):
    """Flatten *pics* and copy the values into *pagelocked_buffer* in place.

    Args:
        pics: Array-like input; it is flattened in C (row-major) order.
        pagelocked_buffer: Destination array that receives the values.
            ``np.copyto`` raises if the flattened input cannot be broadcast
            to the buffer's shape.
    """
    preprocessed = np.asarray(pics).ravel()
    np.copyto(pagelocked_buffer, preprocessed)

def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output, stream,
                 batch_size, height, width, context=None, profile=False):
    """Run one inference pass and return the output reshaped to images.

    Creating an execution context is expensive, so callers that process many
    frames should create one context up front and pass it via ``context``;
    a temporary context is created only when none is supplied.

    Args:
        engine: The TensorRT engine (the object, not a path).
        pics_1: Input images to the model.
        h_input_1: Input in the host.
        d_input_1: Input in the device.
        h_output: Output in the host.
        d_output: Output in the device.
        stream: CUDA stream used for the copies and the execution.
        batch_size: Batch size for execution time.
        height: Height of the output image.
        width: Width of the output image.
        context: Optional execution context to reuse across calls.
        profile: When true, attach a ``trt.Profiler`` to the context for this
            call. Off by default because profiling slows every execution.

    Returns:
        ``h_output`` viewed as ``(batch_size, -1, height, width)``. The array
        shares memory with ``h_output``, so the next call overwrites it.
    """
    load_images_to_buffer(pics_1, h_input_1)

    if context is None:
        # Fallback for callers that do not manage a context themselves.
        with engine.create_execution_context() as own_context:
            _run_inference(own_context, h_input_1, d_input_1, h_output, d_output,
                           stream, batch_size, profile)
    else:
        _run_inference(context, h_input_1, d_input_1, h_output, d_output,
                       stream, batch_size, profile)

    # Return the host output.
    return h_output.reshape((batch_size, -1, height, width))


def _run_inference(context, h_input_1, d_input_1, h_output, d_output, stream,
                   batch_size, profile):
    """Copy the input to the GPU, execute, and copy the result back on *stream*."""
    if profile:
        context.profiler = trt.Profiler()

    # Transfer input data to the GPU.
    cuda.memcpy_htod_async(d_input_1, h_input_1, stream)

    # Run inference on the same stream so it is ordered after the input copy.
    context.execute_async(batch_size=batch_size,
                          bindings=[int(d_input_1), int(d_output)],
                          stream_handle=stream.handle)

    # Transfer predictions back from the GPU.
    cuda.memcpy_dtoh_async(h_output, d_output, stream)

    # Synchronize the stream: without this the host buffer may be read before
    # the asynchronous device-to-host copy has finished.
    stream.synchronize()

Your batch size is 1, the image size is 1280x720 (large), and the input is float32 tensor data. What GPU are you using (see nvidia-smi.exe), and what display driver (CUDA version)? I think you want an Ampere or better GPU. Higher frame rates may require smaller input images. There is a tool, NVIDIA Nsight Compute (see the guided analysis video), that will point out which CNN layer is your bottleneck. What CNN are you running inference with? In the NVIDIA Transfer Learning Toolkit hosted on ngc.nvidia.com, many trained networks (PeopleNet, VehicleNet, etc.) are available for performance comparison. Even YOLO typically starts with a 416x416 image.