Secondary-GIE results in DeepStream are messed up, while the TensorRT Python API works fine

Hi,

I have been working with the RepVGG classification network recently, and I want to integrate it as a secondary GIE in DeepStream. The pipeline I am using is a primary GIE for person detection and a secondary GIE that classifies whether the detected person is smoking.

I converted the original RepVGG PyTorch model to ONNX and then to TensorRT. I checked the result with the TensorRT Python API, and it worked fine, producing good results. But when I integrated the model into DeepStream, the results were bad. I checked the image preprocessing; it only contains a resize and normalization by dividing by 255.
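
For context, the conversion followed the usual PyTorch → ONNX → TensorRT path; a minimal sketch is below (the placeholder model, file names, and trtexec flags are illustrative, not the exact commands I ran):

import torch
import torch.nn as nn

# Placeholder standing in for the trained RepVGG-A2 model in its deploy (fused) form.
model = nn.Sequential(
    nn.Conv2d(3, 2, 3, padding=1),
    nn.AdaptiveAvgPool2d(1),
    nn.Flatten(),
    nn.Softmax(dim=1),
)
model.eval()

# NCHW dummy input matching the 224x448 (W x H) network size used below.
dummy = torch.randn(1, 3, 448, 224)
torch.onnx.export(model, dummy, 'repvgg_a2.onnx',
                  input_names=['input'], output_names=['prob'], opset_version=11)

The ONNX model was then built into an FP16 engine on the Xavier NX with something like trtexec --onnx=repvgg_a2.onnx --fp16 --saveEngine=RepVGG-A2.engine.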

I cannot figure out what else may be causing this inaccuracy.
Hope someone can help. Thanks.

Here is my full system configuration:
• Hardware Platform (Jetson / GPU): Xavier NX
• DeepStream Version: 5.0
• JetPack Version (valid for Jetson only): 4.4
• TensorRT Version: 7.1.3

Here is the Python API code I used to test the classification model:

import os

import cv2
import numpy as np
import pycuda.autoinit  # noqa: F401 -- initializes the CUDA driver
import pycuda.driver as cuda
import tensorrt as trt

INPUT_W = 224
INPUT_H = 448
CONF_THRESH = 0.5
IOU_THRESHOLD = 0.5


class RepVGGTRT(object):

    def __init__(self, engine_file_path):
        # Create a CUDA context on this device.
        self.cfx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings

    def infer(self, frame):
        # Make self the active context, pushing it on top of the context stack.
        self.cfx.push()
        # Restore
        stream = self.stream
        context = self.context
        engine = self.engine
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # Do image preprocess
        input_image, image_raw, origin_h, origin_w = self.preprocess_image(frame)
        np.copyto(host_inputs[0], input_image.ravel())
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        context.execute_async(bindings=bindings, stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        stream.synchronize()
        self.cfx.pop()
        output = host_outputs[0]

        return output

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.cfx.pop()

    def preprocess_image(self, image_raw):
        """
        description: Read an image from image path, convert it to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            image_raw: image array
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        """
        h, w, c = image_raw.shape
        # image = image_raw
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width, height and paddings
        r_w = INPUT_W / w
        r_h = INPUT_H / h
        if r_h > r_w:
            tw = INPUT_W
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((INPUT_H - th) / 2)
            ty2 = INPUT_H - th - ty1
        else:
            tw = int(r_h * w)
            th = INPUT_H
            tx1 = int((INPUT_W - tw) / 2)
            tx2 = INPUT_W - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, value=(128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        image = np.transpose(image, [2, 0, 1])
        image = np.expand_dims(image, axis=0)
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w


cls_trt = RepVGGTRT('RepVGG-A2.engine')


def main(input_dir):
    names = []
    labels = []
    topk_ids = []
    probs_all = []
    for root, dirs, files in os.walk(input_dir):
        for image_name in files:
            if os.path.basename(root) == 'normal':
                labels.append(0)
            else:
                labels.append(1)
            names.append(image_name)
            image = cv2.imread(os.path.join(root, image_name))
            probs = cls_trt.infer(image)
            probs_all.append(probs)
            topk = np.where(probs[1] > CONF_THRESH, 1, 0)
            topk_ids.append([topk])

    topk_ids = np.concatenate(topk_ids, axis=0)
    probs_all = np.concatenate(probs_all, axis=0)

    with open('./topk_ids.csv', 'w') as out_file:
        for name, label, cls in zip(names, labels, topk_ids):
            out_file.write('{0},{1},{2}\n'.format(name, cls, label))


if __name__ == '__main__':
    main('/opt/mot_test_videos/smoke_person')

Here is the secondary GIE configuration I used:

[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
labelfile-path=./labels_smoking.txt
force-implicit-batch-dim=1
batch-size=4
model-color-format=0
network-mode=2
process-mode=2
is-classifier=1
output-blob-names=prob
classifier-async-mode=0
input-object-min-width=64
input-object-min-height=64
operate-on-gie-id=1
operate-on-class-ids=0
classifier-threshold=0.5
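
For reference, my understanding is that nvinfer scales each object crop to the network resolution and then applies y = net-scale-factor * (x - offsets) per pixel, so with net-scale-factor ≈ 1/255 and no offsets this should match the division by 255 in my Python script. A rough Python equivalent of what I believe the config above requests (my approximation, not the plugin's actual code):

import cv2
import numpy as np

NET_SCALE_FACTOR = 0.0039215697906911373  # ~1/255, from the config above
OFFSETS = np.zeros(3, dtype=np.float32)   # no 'offsets' key is set in the config

def nvinfer_like_preprocess(rgb_crop, input_w=224, input_h=448):
    # Scale the object crop to the network resolution, then apply
    # y = net-scale-factor * (x - offsets) per channel and convert to NCHW.
    resized = cv2.resize(rgb_crop, (input_w, input_h)).astype(np.float32)
    normalized = NET_SCALE_FACTOR * (resized - OFFSETS)
    return np.ascontiguousarray(normalized.transpose(2, 0, 1))[None]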

Hope someone can help.

@bcao Could you please give me some advice?

Hey customer, can you try deploying the engine you generated with TensorRT directly in DeepStream and see if it works well?
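
For example, point the sGIE config at the serialized engine by adding the standard nvinfer key under [property]:

model-engine-file=RepVGG-A2.engine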

@bcao Thanks for the quick reply. I used the TensorRT engine file in both methods, and I think the preprocessing is the same: it only includes a resize and division by 255. But the result I get from DeepStream is much worse than from the TensorRT Python API.

You can refer to DeepStream SDK FAQ - #9 by mchi to dump the inference input and see if there are any differences.
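
Once you have the dumped input from nvinfer and the corresponding preprocessed array from your Python script, a quick comparison along these lines should show whether they match (the file names, dtype, and shape below are placeholders; they depend on how you save the dumps):

import numpy as np

# Hypothetical file names; adjust to however the dumped nvinfer input is stored.
ds_input = np.fromfile('sgie_input_dump.bin', dtype=np.float32).reshape(1, 3, 448, 224)
py_input = np.load('python_preprocessed.npy')  # e.g. saved from preprocess_image()

print('max abs diff :', np.abs(ds_input - py_input).max())
print('mean abs diff:', np.abs(ds_input - py_input).mean())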