Getting [0, 0, 0, 0] output from TAO multitask classifier

@Morganh
I have followed the TLT multi-task classification tutorial and trained a model on an open-source dataset. I am able to load the model, but when I try to run inference I get no output from it. In addition, I get the TensorRT error below:

[TensorRT] ERROR: Parameter check failed at: engine.cpp::enqueue::451, condition: bindings[x] != nullptr

root@ahamad:/workspace/analytics/jothi# python class_clf_trtmodel.py 
Using TensorFlow backend.
2022-04-05 17:15:02.328958: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
WARNING:tensorflow:Deprecation warnings have been disabled. Set TF_ENABLE_DEPRECATION_WARNINGS=1 to re-enable them.
model dimensions: (14400,)
[TensorRT] ERROR: Parameter check failed at: engine.cpp::enqueue::451, condition: bindings[x] != nullptr
h_output, h_input [0. 0. 0. 0.] [151.061 151.061 151.061 ... 131.32  131.32  131.32 ]
Traceback (most recent call last):
  File "class_clf_trtmodel.py", line 151, in <module>
    output = classifier.predict(image)
  File "class_clf_trtmodel.py", line 97, in predict
    pred = self.class_mapping[np.argmax(h_output)]
KeyError: 0
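
For reference, the number of bindings the engine actually expects can be listed with a short snippet (a minimal sketch using the same TensorRT Python API as the script below):

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Deserialize the engine and print every binding: index, direction, name, shape.
with open("./assets/multitask_classifier/mcls_export.etlt.engine", "rb") as f:
    engine = trt.Runtime(TRT_LOGGER).deserialize_cuda_engine(f.read())

for i in range(engine.num_bindings):
    direction = "input" if engine.binding_is_input(i) else "output"
    print(i, direction, engine.get_binding_name(i), engine.get_binding_shape(i))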

Here I am sharing the script and the model file for your reference.

import os
import time

import cv2
# import matplotlib.pyplot as plt
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
from PIL import Image
from keras.applications.imagenet_utils import preprocess_input

import logging
lg = logging.getLogger(__name__)


TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

class TLTClfModel(object):

    def __init__(self,
                 engine_path,
                 model_w=80,
                 model_h=60,
                 classid_map=None
                 ):

        self.trt_engine_path = engine_path
        if "132" in engine_path:
            model_w = 132
            model_h = 132
        self.model_w=model_w
        self.model_h=model_h
        self.labels = classid_map
        self.output_name_0 = "base_color/Softmax"
        self.output_name_1 = "category/Softmax"
        self.output_name_2 = "season/Softmax"
        self.task_name = ["base_color", "category", "season"]
        self.class_mapping = {"base_color": {"0": "Black", "1": "Blue", "2": "Brown", "3": "Green", \
                              "4": "Grey", "5": "Navy Blue", "6": "Pink", "7": "Purple", "8": "Red", \
                              "9": "Silver", "10": "White"}, 
                              "category": {"0": "Bags", "1": "Bottomwear", "2": "Eyewear", "3": "Fragrance", \
                              "4": "Innerwear", "5": "Jewellery", "6": "Sandal", "7": "Shoes", "8": "Topwear", \
                              "9": "Watches"}, 
                              "season": {"0": "Fall", "1": "Spring", "2": "Summer", "3": "Winter"}}
        self.trt_runtime = trt.Runtime(TRT_LOGGER)
        self.trt_engine = self.load_engine(self.trt_runtime, self.trt_engine_path)
        self.inputs, self.dinputs, self.outputs, self.doutputs, self.stream = self.allocate_buffers(self.trt_engine)
        self.context = self.trt_engine.create_execution_context()
        self.cuda_ctx = cuda.Context.attach()

    @staticmethod
    def load_engine(trt_runtime, engine_path):
        with open(engine_path, "rb") as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine

    @staticmethod
    def allocate_buffers(engine):
        # Determine dimensions and create page-locked memory buffers (i.e. won't be swapped to disk) to hold host inputs/outputs.
        h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(trt.float32))
        h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(trt.float32))
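        # NOTE: only bindings 0 (the input) and 1 (a single output) get buffers here;
        # a multi-task engine exposes one output binding per task, so the remaining
        # output bindings are left without device memory.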
        # Allocate device memory for inputs and outputs.
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output = cuda.mem_alloc(h_output.nbytes)
        # Create a stream in which to copy inputs/outputs and run inference.
        stream = cuda.Stream()
        return h_input, d_input, h_output, d_output, stream

    def predict(self, image, confidence_thresh=0.5):
        self.load_image_to_pagelocked_memory(image,
                                             self.inputs,
                                             self.model_w,
                                             self.model_h)

        h_output, h_input = self.do_inference(self.context,
                                         self.inputs,
                                         self.dinputs,
                                         self.outputs,
                                         self.doutputs,
                                         self.stream)
        
        print("h_output, h_input", h_output, h_input)        
        confidence = np.max(h_output)
        pred = self.class_mapping[np.argmax(h_output)]
        lg.info("prediction result: %s, %0.2f"%(h_output, confidence))
        #if confidence > confidence_thresh:
        return pred, confidence
        #else:
        #    return "unclear", confidence

    @staticmethod
    def load_image_to_pagelocked_memory(test_image, pagelocked_buffer, w, h):
        # Converts the input image to a CHW Numpy array
        def normalize_image(image, w, h):
            # Resize, antialias and transpose the image to CHW.
            # return np.asarray(image.resize((w, h), Image.ANTIALIAS)).transpose([2, 0, 1]).astype(trt.nptype(trt.float32)).ravel()
            lg.info("model dimensions: %d, %d"%(w, h))
            normalized_image = preprocess_input(
                cv2.resize(image, (w, h))
                    .transpose([2, 0, 1])
                    .astype(trt.nptype(trt.float32)),
                mode='caffe', data_format='channels_first').ravel()
            print("model dimensions:", normalized_image.shape)
            return normalized_image

        # Normalize the image and copy to pagelocked memory.
        np.copyto(pagelocked_buffer, normalize_image(test_image[:,:,::-1], w, h))
        return test_image

    @staticmethod
    def do_inference(context, h_input, d_input, h_output, d_output, stream):
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input, h_input, stream)
        # Run inference.
        context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
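        # NOTE: the bindings list must hold one device pointer per engine binding;
        # a short list leaves entries null and trips "bindings[x] != nullptr" in enqueue.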
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream
        stream.synchronize()
        return h_output, h_input

    def __del__(self):
        self.cuda_ctx.pop()
        # self.cuda_ctx.detach()




if __name__ == '__main__':
    classifier = TLTClfModel(
            engine_path="./assets/multitask_classifier/mcls_export.etlt.engine",
            model_w=60,
            model_h=80
        )
    image = cv2.imread('10000.jpg')
    # rects = pp.start(image)
    image  = cv2.resize(image, (80, 60))
    output = classifier.predict(image)

class_clf_trtmodel.py (6.0 KB)
10000.jpg (test image)
mcls_export.etlt (1.8 MB)
etlt_to_engine_mltsk.sh (535 Bytes)

Please refer to the multi-task classification section in NVIDIA-AI-IOT/tao-toolkit-triton-apps (sample app code for deploying TAO Toolkit trained models to Triton):

Preprocessing: tao-toolkit-triton-apps/frame.py at ae6b5ec41c3a9651957c4dddfc262a43f47e263c
Postprocessing: tao-toolkit-triton-apps/multitask_classification_postprocessor.py at ae6b5ec41c3a9651957c4dddfc262a43f47e263c
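
For readers who cannot open the links: the preprocessing in the scripts in this thread leans on Keras' preprocess_input in 'caffe' mode. A minimal NumPy sketch of what that mode does to a channels-first RGB image (the mean values are the standard ImageNet ones Keras uses in this mode):

import numpy as np

# Equivalent of keras preprocess_input(x, mode='caffe', data_format='channels_first')
# for one CHW image: reorder RGB -> BGR, then subtract the ImageNet channel means.
IMAGENET_BGR_MEANS = np.array([103.939, 116.779, 123.68], dtype=np.float32)

def caffe_preprocess_chw(rgb_chw):
    bgr_chw = rgb_chw[::-1, :, :].astype(np.float32)    # flip the channel axis
    return bgr_chw - IMAGENET_BGR_MEANS[:, None, None]  # per-channel mean subtraction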

@Morganh, thanks for the reply. The project I am working on requires real-time processing: I need to run the classification on top of my detection model's output.

I have a TensorRT-optimized detection model that is working well in the TAO container. If I can integrate this classification model as well, my pipeline is complete.

I understand that the Triton Inference Server works much like a REST API, where we send data to a server. Do you think I can use the above scripts in my context? That part confuses me a little.
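
For reference, a minimal sketch of a Triton HTTP inference call with the official tritonclient package (the model name "multitask_classifier" and input name "input_1" are assumptions for illustration; the output names follow the script above):

import numpy as np
import tritonclient.http as httpclient

# Connect to a Triton server running locally on the default HTTP port.
client = httpclient.InferenceServerClient(url="localhost:8000")

# One CHW float32 image in a batch of one (placeholder data).
data = np.zeros((1, 3, 80, 60), dtype=np.float32)
inp = httpclient.InferInput("input_1", list(data.shape), "FP32")  # tensor name is an assumption
inp.set_data_from_numpy(data)

# Request the three softmax heads (names taken from the script above).
outputs = [httpclient.InferRequestedOutput(name)
           for name in ("base_color/Softmax", "category/Softmax", "season/Softmax")]

result = client.infer("multitask_classifier", inputs=[inp], outputs=outputs)
print(result.as_numpy("season/Softmax"))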

I did not inspect your code, so please debug it yourself.
I shared the Triton app in order to show the preprocessing and postprocessing for multitask classification.

I was able to solve the errors and run the multi-task classification with TensorRT. The root cause was that the multitask engine exposes three output bindings (one softmax per task), so a host/device buffer pair has to be allocated for each output and all four bindings passed to execute_async. Here is the working script:

import os
import time

import cv2
# import matplotlib.pyplot as plt
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
from PIL import Image
from keras.applications.imagenet_utils import preprocess_input

import logging
lg = logging.getLogger(__name__)


TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

class TLTClfModel(object):

    def __init__(self,
                 engine_path,
                 model_w=80,
                 model_h=60,
                 classid_map=None
                 ):

        self.trt_engine_path = engine_path
        if "132" in engine_path:
            model_w = 132
            model_h = 132
        self.model_w=model_w
        self.model_h=model_h
        self.labels = classid_map
        self.output_name_0 = "base_color/Softmax"
        self.output_name_1 = "category/Softmax"
        self.output_name_2 = "season/Softmax"
        self.task_name = ["base_color", "category", "season"]
        self.class_mapping = {"base_color": {"0": "Black", "1": "Blue", "2": "Brown", "3": "Green", \
                              "4": "Grey", "5": "Navy Blue", "6": "Pink", "7": "Purple", "8": "Red", \
                              "9": "Silver", "10": "White"}, 
                              "category": {"0": "Bags", "1": "Bottomwear", "2": "Eyewear", "3": "Fragrance", \
                              "4": "Innerwear", "5": "Jewellery", "6": "Sandal", "7": "Shoes", "8": "Topwear", \
                              "9": "Watches"}, 
                              "season": {"0": "Fall", "1": "Spring", "2": "Summer", "3": "Winter"}}
        self.trt_runtime = trt.Runtime(TRT_LOGGER)
        self.trt_engine = self.load_engine(self.trt_runtime, self.trt_engine_path)
        (self.inputs, self.dinputs,
         self.outputs_1, self.outputs_2, self.outputs_3,
         self.doutputs_1, self.doutputs_2, self.doutputs_3,
         self.stream) = self.allocate_buffers(self.trt_engine)
        self.context = self.trt_engine.create_execution_context()
        self.cuda_ctx = cuda.Context.attach()

    @staticmethod
    def load_engine(trt_runtime, engine_path):
        with open(engine_path, "rb") as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine

    @staticmethod
    def allocate_buffers(engine):
        # Determine dimensions and create page-locked memory buffers (i.e. won't be swapped to disk) to hold host inputs/outputs.
        h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(trt.float32))
        h_output_1 = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(trt.float32)) #'season/Softmax'
        h_output_2 = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(2)), dtype=trt.nptype(trt.float32)) #'category/Softmax'
        h_output_3 = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(3)), dtype=trt.nptype(trt.float32)) #'base_color/Softmax'
        # Allocate device memory for inputs and outputs.
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output_1 = cuda.mem_alloc(h_output_1.nbytes)
        d_output_2 = cuda.mem_alloc(h_output_2.nbytes)
        d_output_3 = cuda.mem_alloc(h_output_3.nbytes)
        # Create a stream in which to copy inputs/outputs and run inference.
        stream = cuda.Stream()
        return h_input, d_input, h_output_1, h_output_2, h_output_3, d_output_1, d_output_2, d_output_3, stream

    def predict(self, image, confidence_thresh=0.5):
        self.load_image_to_pagelocked_memory(image,
                                             self.inputs,
                                             self.model_w,
                                             self.model_h)

        h_output_1, h_output_2, h_output_3, h_input = self.do_inference(self.context,
                                                        self.inputs,
                                                        self.dinputs,
                                                        self.outputs_1,
                                                        self.outputs_2,
                                                        self.outputs_3,
                                                        self.doutputs_1,
                                                        self.doutputs_2,
                                                        self.doutputs_3,
                                                        self.stream)
                        
        # print("h_output_1, h_output_2, h_output_3, h_input", h_output_1, h_output_2, h_output_3, h_input)        
        confidence_1 = np.max(h_output_1)  # season head
        pred_1 = self.class_mapping["season"][str(np.argmax(h_output_1))]

        confidence_2 = np.max(h_output_2)  # category head
        pred_2 = self.class_mapping["category"][str(np.argmax(h_output_2))]

        confidence_3 = np.max(h_output_3)  # base_color head
        pred_3 = self.class_mapping["base_color"][str(np.argmax(h_output_3))]

        # print("prediction result: %s, %0.2f"%(pred_1, confidence_1))
        # print("prediction result: %s, %0.2f"%(pred_2, confidence_2))
        # print("prediction result: %s, %0.2f"%(pred_3, confidence_3))

        #if confidence > confidence_thresh:
        return (pred_1, pred_2, pred_3), (confidence_1, confidence_2, confidence_3)
        #else:
        #    return "unclear", confidence

    @staticmethod
    def load_image_to_pagelocked_memory(test_image, pagelocked_buffer, w, h):
        # Converts the input image to a CHW Numpy array
        def normalize_image(image, w, h):
            # Resize, antialias and transpose the image to CHW.
            # return np.asarray(image.resize((w, h), Image.ANTIALIAS)).transpose([2, 0, 1]).astype(trt.nptype(trt.float32)).ravel()
            lg.info("model dimensions: %d, %d"%(w, h))
            normalized_image = preprocess_input(
                cv2.resize(image, (w, h))
                    .transpose([2, 0, 1])
                    .astype(trt.nptype(trt.float32)),
                mode='caffe', data_format='channels_first').ravel()
            # print("model dimensions:", normalized_image.shape)
            return normalized_image

        # Normalize the image and copy to pagelocked memory.
        np.copyto(pagelocked_buffer, normalize_image(test_image[:,:,::-1], w, h))
        return test_image

    @staticmethod
    def do_inference(context, h_input, d_input, h_output_1, h_output_2, h_output_3, d_output_1, d_output_2, d_output_3, stream):
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input, h_input, stream)
        # Run inference.
        context.execute_async(bindings=[int(d_input), int(d_output_1), int(d_output_2), int(d_output_3)], stream_handle=stream.handle)
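        # Note: execute_async uses the implicit-batch API, which this engine accepts;
        # an engine built with an explicit batch dimension would need execute_async_v2.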
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output_1, d_output_1, stream)
        cuda.memcpy_dtoh_async(h_output_2, d_output_2, stream)
        cuda.memcpy_dtoh_async(h_output_3, d_output_3, stream)

        # Synchronize the stream
        stream.synchronize()
        return h_output_1, h_output_2, h_output_3, h_input

    def __del__(self):
        self.cuda_ctx.pop()
        # self.cuda_ctx.detach()




if __name__ == '__main__':
    classifier = TLTClfModel(
            engine_path="./assets/multitask_classifier/mcls_export.etlt.engine",
            model_w=60,
            model_h=80
        )
    image = cv2.imread('10000.jpg')
    # rects = pp.start(image)
    # image  = cv2.resize(image, (80, 60))
    time_list = []
    for i in range(10000):
        a = time.time()
        preds, confids = classifier.predict(image)
        b = time.time()-a
        print("Inference time Iteration - %d: %0.5f"%(i, b))
        time_list.append(b)

    print("Mean Values: %0.4f"%np.mean(time_list))
    print(preds, confids)

Thank you @Morganh for the support.

