Getting [0, 0, 0, 0] output from TAO multitask classifier

@Morganh
I have followed the TLT multi-task classification tutorial and trained a model on an open-source dataset. I am able to load the model, but when I try to run inference I get no output from it. In addition, I get the TensorRT error below:

[TensorRT] ERROR: Parameter check failed at: engine.cpp::enqueue::451, condition: bindings[x] != nullptr

root@ahamad:/workspace/analytics/jothi# python class_clf_trtmodel.py 
Using TensorFlow backend.
2022-04-05 17:15:02.328958: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
WARNING:tensorflow:Deprecation warnings have been disabled. Set TF_ENABLE_DEPRECATION_WARNINGS=1 to re-enable them.
model dimensions: (14400,)
[TensorRT] ERROR: Parameter check failed at: engine.cpp::enqueue::451, condition: bindings[x] != nullptr
h_output, h_input [0. 0. 0. 0.] [151.061 151.061 151.061 ... 131.32  131.32  131.32 ]
Traceback (most recent call last):
  File "class_clf_trtmodel.py", line 151, in <module>
    output = classifier.predict(image)
  File "class_clf_trtmodel.py", line 97, in predict
    pred = self.class_mapping[np.argmax(h_output)]
KeyError: 0
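
For reference, the number of bindings the engine actually expects can be listed with a short snippet (a minimal sketch using the same TensorRT Python API as the script below):

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Deserialize the engine and print every binding: index, direction, name, shape.
with open("./assets/multitask_classifier/mcls_export.etlt.engine", "rb") as f:
    engine = trt.Runtime(TRT_LOGGER).deserialize_cuda_engine(f.read())

for i in range(engine.num_bindings):
    direction = "input" if engine.binding_is_input(i) else "output"
    print(i, direction, engine.get_binding_name(i), engine.get_binding_shape(i))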

Here I am sharing the script and the model file for your reference.

import os
import time

import cv2
# import matplotlib.pyplot as plt
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
from PIL import Image
from keras.applications.imagenet_utils import preprocess_input

import logging
lg = logging.getLogger(__name__)


TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

class TLTClfModel(object):

    def __init__(self,
                 engine_path,
                 model_w=80,
                 model_h=60,
                 classid_map=None
                 ):

        self.trt_engine_path = engine_path
        if "132" in engine_path:
            model_w = 132
            model_h = 132
        self.model_w=model_w
        self.model_h=model_h
        self.labels = classid_map
        self.output_name_0 = "base_color/Softmax"
        self.output_name_1 = "category/Softmax"
        self.output_name_2 = "season/Softmax"
        self.task_name = ["base_color", "category", "season"]
        self.class_mapping = {"base_color": {"0": "Black", "1": "Blue", "2": "Brown", "3": "Green", \
                              "4": "Grey", "5": "Navy Blue", "6": "Pink", "7": "Purple", "8": "Red", \
                              "9": "Silver", "10": "White"}, 
                              "category": {"0": "Bags", "1": "Bottomwear", "2": "Eyewear", "3": "Fragrance", \
                              "4": "Innerwear", "5": "Jewellery", "6": "Sandal", "7": "Shoes", "8": "Topwear", \
                              "9": "Watches"}, 
                              "season": {"0": "Fall", "1": "Spring", "2": "Summer", "3": "Winter"}}
        self.trt_runtime = trt.Runtime(TRT_LOGGER)
        self.trt_engine = self.load_engine(self.trt_runtime, self.trt_engine_path)
        self.inputs, self.dinputs, self.outputs, self.doutputs, self.stream = self.allocate_buffers(self.trt_engine)
        self.context = self.trt_engine.create_execution_context()
        self.cuda_ctx = cuda.Context.attach()

    @staticmethod
    def load_engine(trt_runtime, engine_path):
        with open(engine_path, "rb") as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine

    @staticmethod
    def allocate_buffers(engine):
        # Determine dimensions and create page-locked memory buffers (i.e. won't be swapped to disk) to hold host inputs/outputs.
        h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(trt.float32))
        h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(trt.float32))
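        # NOTE: only bindings 0 (the input) and 1 (a single output) get buffers here;
        # a multi-task engine exposes one output binding per task, so the remaining
        # output bindings are left without device memory.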
        # Allocate device memory for inputs and outputs.
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output = cuda.mem_alloc(h_output.nbytes)
        # Create a stream in which to copy inputs/outputs and run inference.
        stream = cuda.Stream()
        return h_input, d_input, h_output, d_output, stream

    def predict(self, image, confidence_thresh=0.5):
        self.load_image_to_pagelocked_memory(image,
                                             self.inputs,
                                             self.model_w,
                                             self.model_h)

        h_output, h_input = self.do_inference(self.context,
                                         self.inputs,
                                         self.dinputs,
                                         self.outputs,
                                         self.doutputs,
                                         self.stream)
        
        print("h_output, h_input", h_output, h_input)        
        confidence = np.max(h_output)
        pred = self.class_mapping[np.argmax(h_output)]
        lg.info("prediction result: %s, %0.2f"%(h_output, confidence))
        #if confidence > confidence_thresh:
        return pred, confidence
        #else:
        #    return "unclear", confidence

    @staticmethod
    def load_image_to_pagelocked_memory(test_image, pagelocked_buffer, w, h):
        # Converts the input image to a CHW Numpy array
        def normalize_image(image, w, h):
            # Resize, antialias and transpose the image to CHW.
            # return np.asarray(image.resize((w, h), Image.ANTIALIAS)).transpose([2, 0, 1]).astype(trt.nptype(trt.float32)).ravel()
            lg.info("model dimensions: %d, %d"%(w, h))
            normalized_image = preprocess_input(
                cv2.resize(image, (w, h))
                    .transpose([2, 0, 1])
                    .astype(trt.nptype(trt.float32)),
                mode='caffe', data_format='channels_first').ravel()
            print("model dimensions:", normalized_image.shape)
            return normalized_image

        # Normalize the image and copy to pagelocked memory.
        np.copyto(pagelocked_buffer, normalize_image(test_image[:,:,::-1], w, h))
        return test_image

    @staticmethod
    def do_inference(context, h_input, d_input, h_output, d_output, stream):
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input, h_input, stream)
        # Run inference.
        context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
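        # NOTE: the bindings list must hold one device pointer per engine binding;
        # a short list leaves entries null and trips "bindings[x] != nullptr" in enqueue.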
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream
        stream.synchronize()
        return h_output, h_input

    def __del__(self):
        self.cuda_ctx.pop()
        # self.cuda_ctx.detach()




if __name__ == '__main__':
    classifier = TLTClfModel(
            engine_path="./assets/multitask_classifier/mcls_export.etlt.engine",
            model_w=60,
            model_h=80
        )
    image = cv2.imread('10000.jpg')
    # rects = pp.start(image)
    image  = cv2.resize(image, (80, 60))
    output = classifier.predict(image)

class_clf_trtmodel.py (6.0 KB)
10000.jpg (test image)
mcls_export.etlt (1.8 MB)
etlt_to_engine_mltsk.sh (535 Bytes)

Please refer to the multi-task classification section in NVIDIA-AI-IOT/tao-toolkit-triton-apps (sample app code for deploying TAO Toolkit trained models to Triton):

Preprocessing: tao-toolkit-triton-apps/frame.py at ae6b5ec41c3a9651957c4dddfc262a43f47e263c
Postprocessing: tao-toolkit-triton-apps/multitask_classification_postprocessor.py at ae6b5ec41c3a9651957c4dddfc262a43f47e263c
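
For readers who cannot open the links: the preprocessing in the scripts in this thread leans on Keras' preprocess_input in 'caffe' mode. A minimal NumPy sketch of what that mode does to a channels-first RGB image (the mean values are the standard ImageNet ones Keras uses in this mode):

import numpy as np

# Equivalent of keras preprocess_input(x, mode='caffe', data_format='channels_first')
# for one CHW image: reorder RGB -> BGR, then subtract the ImageNet channel means.
IMAGENET_BGR_MEANS = np.array([103.939, 116.779, 123.68], dtype=np.float32)

def caffe_preprocess_chw(rgb_chw):
    bgr_chw = rgb_chw[::-1, :, :].astype(np.float32)    # flip the channel axis
    return bgr_chw - IMAGENET_BGR_MEANS[:, None, None]  # per-channel mean subtraction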

@Morganh, thanks for the reply. The project I am working on requires real-time processing: I need to run the classification on top of my detection model's output.

I have a TensorRT-optimized detection model that is working well in the TAO container. If I can integrate this classification model as well, my pipeline is complete.

I understand that the Triton Inference Server works much like a REST API, where we send data to a server. Do you think I can use the above scripts in my context? That part confuses me a little.
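
For reference, a minimal sketch of a Triton HTTP inference call with the official tritonclient package (the model name "multitask_classifier" and input name "input_1" are assumptions for illustration; the output names follow the script above):

import numpy as np
import tritonclient.http as httpclient

# Connect to a Triton server running locally on the default HTTP port.
client = httpclient.InferenceServerClient(url="localhost:8000")

# One CHW float32 image in a batch of one (placeholder data).
data = np.zeros((1, 3, 80, 60), dtype=np.float32)
inp = httpclient.InferInput("input_1", list(data.shape), "FP32")  # tensor name is an assumption
inp.set_data_from_numpy(data)

# Request the three softmax heads (names taken from the script above).
outputs = [httpclient.InferRequestedOutput(name)
           for name in ("base_color/Softmax", "category/Softmax", "season/Softmax")]

result = client.infer("multitask_classifier", inputs=[inp], outputs=outputs)
print(result.as_numpy("season/Softmax"))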

I did not inspect your code, so please debug it yourself.
I shared the Triton app in order to show the preprocessing and postprocessing for multitask classification.

I was able to solve the errors and run the multi-task classification with TensorRT. The root cause was that the multitask engine exposes three output bindings (one softmax per task), so a host/device buffer pair has to be allocated for each output and all four bindings passed to execute_async. Here is the working script:

import os
import time

import cv2
# import matplotlib.pyplot as plt
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
from PIL import Image
from keras.applications.imagenet_utils import preprocess_input

import logging
lg = logging.getLogger(__name__)


TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

class TLTClfModel(object):

    def __init__(self,
                 engine_path,
                 model_w=80,
                 model_h=60,
                 classid_map=None
                 ):

        self.trt_engine_path = engine_path
        if "132" in engine_path:
            model_w = 132
            model_h = 132
        self.model_w=model_w
        self.model_h=model_h
        self.labels = classid_map
        self.output_name_0 = "base_color/Softmax"
        self.output_name_1 = "category/Softmax"
        self.output_name_2 = "season/Softmax"
        self.task_name = ["base_color", "category", "season"]
        self.class_mapping = {"base_color": {"0": "Black", "1": "Blue", "2": "Brown", "3": "Green", \
                              "4": "Grey", "5": "Navy Blue", "6": "Pink", "7": "Purple", "8": "Red", \
                              "9": "Silver", "10": "White"}, 
                              "category": {"0": "Bags", "1": "Bottomwear", "2": "Eyewear", "3": "Fragrance", \
                              "4": "Innerwear", "5": "Jewellery", "6": "Sandal", "7": "Shoes", "8": "Topwear", \
                              "9": "Watches"}, 
                              "season": {"0": "Fall", "1": "Spring", "2": "Summer", "3": "Winter"}}
        self.trt_runtime = trt.Runtime(TRT_LOGGER)
        self.trt_engine = self.load_engine(self.trt_runtime, self.trt_engine_path)
        (self.inputs, self.dinputs,
         self.outputs_1, self.outputs_2, self.outputs_3,
         self.doutputs_1, self.doutputs_2, self.doutputs_3,
         self.stream) = self.allocate_buffers(self.trt_engine)
        self.context = self.trt_engine.create_execution_context()
        self.cuda_ctx = cuda.Context.attach()

    @staticmethod
    def load_engine(trt_runtime, engine_path):
        with open(engine_path, "rb") as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine

    @staticmethod
    def allocate_buffers(engine):
        # Determine dimensions and create page-locked memory buffers (i.e. won't be swapped to disk) to hold host inputs/outputs.
        h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(trt.float32))
        h_output_1 = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(trt.float32)) #'season/Softmax'
        h_output_2 = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(2)), dtype=trt.nptype(trt.float32)) #'category/Softmax'
        h_output_3 = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(3)), dtype=trt.nptype(trt.float32)) #'base_color/Softmax'
        # Allocate device memory for inputs and outputs.
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output_1 = cuda.mem_alloc(h_output_1.nbytes)
        d_output_2 = cuda.mem_alloc(h_output_2.nbytes)
        d_output_3 = cuda.mem_alloc(h_output_3.nbytes)
        # Create a stream in which to copy inputs/outputs and run inference.
        stream = cuda.Stream()
        return h_input, d_input, h_output_1, h_output_2, h_output_3, d_output_1, d_output_2, d_output_3, stream

    def predict(self, image, confidence_thresh=0.5):
        self.load_image_to_pagelocked_memory(image,
                                             self.inputs,
                                             self.model_w,
                                             self.model_h)

        h_output_1, h_output_2, h_output_3, h_input = self.do_inference(self.context,
                                                        self.inputs,
                                                        self.dinputs,
                                                        self.outputs_1,
                                                        self.outputs_2,
                                                        self.outputs_3,
                                                        self.doutputs_1,
                                                        self.doutputs_2,
                                                        self.doutputs_3,
                                                        self.stream)
                        
        # print("h_output_1, h_output_2, h_output_3, h_input", h_output_1, h_output_2, h_output_3, h_input)        
        confidence_1 = np.max(h_output_1)  # season head
        pred_1 = self.class_mapping["season"][str(np.argmax(h_output_1))]

        confidence_2 = np.max(h_output_2)  # category head
        pred_2 = self.class_mapping["category"][str(np.argmax(h_output_2))]

        confidence_3 = np.max(h_output_3)  # base_color head
        pred_3 = self.class_mapping["base_color"][str(np.argmax(h_output_3))]

        # print("prediction result: %s, %0.2f"%(pred_1, confidence_1))
        # print("prediction result: %s, %0.2f"%(pred_2, confidence_2))
        # print("prediction result: %s, %0.2f"%(pred_3, confidence_3))

        #if confidence > confidence_thresh:
        return (pred_1, pred_2, pred_3), (confidence_1, confidence_2, confidence_3)
        #else:
        #    return "unclear", confidence

    @staticmethod
    def load_image_to_pagelocked_memory(test_image, pagelocked_buffer, w, h):
        # Converts the input image to a CHW Numpy array
        def normalize_image(image, w, h):
            # Resize, antialias and transpose the image to CHW.
            # return np.asarray(image.resize((w, h), Image.ANTIALIAS)).transpose([2, 0, 1]).astype(trt.nptype(trt.float32)).ravel()
            lg.info("model dimensions: %d, %d"%(w, h))
            normalized_image = preprocess_input(
                cv2.resize(image, (w, h))
                    .transpose([2, 0, 1])
                    .astype(trt.nptype(trt.float32)),
                mode='caffe', data_format='channels_first').ravel()
            # print("model dimensions:", normalized_image.shape)
            return normalized_image

        # Normalize the image and copy to pagelocked memory.
        np.copyto(pagelocked_buffer, normalize_image(test_image[:,:,::-1], w, h))
        return test_image

    @staticmethod
    def do_inference(context, h_input, d_input, h_output_1, h_output_2, h_output_3, d_output_1, d_output_2, d_output_3, stream):
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input, h_input, stream)
        # Run inference.
        context.execute_async(bindings=[int(d_input), int(d_output_1), int(d_output_2), int(d_output_3)], stream_handle=stream.handle)
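        # Note: execute_async uses the implicit-batch API, which this engine accepts;
        # an engine built with an explicit batch dimension would need execute_async_v2.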
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output_1, d_output_1, stream)
        cuda.memcpy_dtoh_async(h_output_2, d_output_2, stream)
        cuda.memcpy_dtoh_async(h_output_3, d_output_3, stream)

        # Synchronize the stream
        stream.synchronize()
        return h_output_1, h_output_2, h_output_3, h_input

    def __del__(self):
        self.cuda_ctx.pop()
        # self.cuda_ctx.detach()




if __name__ == '__main__':
    classifier = TLTClfModel(
            engine_path="./assets/multitask_classifier/mcls_export.etlt.engine",
            model_w=60,
            model_h=80
        )
    image = cv2.imread('10000.jpg')
    # rects = pp.start(image)
    # image  = cv2.resize(image, (80, 60))
    time_list = []
    for i in range(10000):
        a = time.time()
        preds, confids = classifier.predict(image)
        b = time.time()-a
        print("Inference time Iteration - %d: %0.5f"%(i, b))
        time_list.append(b)

    print("Mean Values: %0.4f"%np.mean(time_list))
    print(preds, confids)

Thank you @Morganh for the support.

