Cannot use TensorRT model exported by NVIDIA TAO

Hi, I’m new to the TAO Toolkit and I have just finished training my first YOLOv3 model with the example notebook.

I’m now trying to use the exported TensorRT engine in my own code.

I have this class:

import numpy as np
import pycuda.autoinit  # noqa: F401 -- initializes the CUDA context
import pycuda.driver as cuda
import tensorrt as trt


class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


class TrtModel:

    """
    This class is copied from: https://stackoverflow.com/questions/59280745/inference-with-tensorrt-engine-file-on-python
    and follows the instructions provided in the "NVIDIA TensorRT Developer Guide"
    """

    def __init__(self, engine_path, max_batch_size=1, dtype=np.float32):

        self.engine_path = engine_path
        self.dtype = dtype
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.runtime = trt.Runtime(self.logger)
        self.engine = self.load_engine(self.runtime, self.engine_path)
        self.engine_batch_size = self.engine.max_batch_size
        self.max_batch_size = max_batch_size
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()
        self.context = self.engine.create_execution_context()

    @staticmethod
    def load_engine(trt_runtime, engine_path):
        trt.init_libnvinfer_plugins(None, "")
        with open(engine_path, 'rb') as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine

    def allocate_buffers(self):

        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()

        for binding in self.engine:
            size = trt.volume(self.engine.get_binding_shape(binding)) # * self.max_batch_size
            host_mem = cuda.pagelocked_empty(size, self.dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)

            bindings.append(int(device_mem))

            if self.engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))

        return inputs, outputs, bindings, stream

    def __call__(self, x: np.ndarray, batch_size=1):

        x = x.astype(self.dtype)

        np.copyto(self.inputs[0].host, x.ravel())
        # np.copyto(self.inputs[1].host, x.ravel())

        for inp in self.inputs:
            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)

        # self.context.execute_async(batch_size=batch_size, bindings=self.bindings, stream_handle=self.stream.handle)
        self.context.execute_async(batch_size=self.engine_batch_size, bindings=self.bindings, stream_handle=self.stream.handle)
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)

        self.stream.synchronize()
        return [out.host.reshape(batch_size, -1) for out in self.outputs]

and I want to instantiate a TrtModel with the engine exported by the notebook, but when I do, I get this error:

[04/19/2022-17:44:59] [TRT] [E] 1: [stdArchiveReader.cpp::StdArchiveReader::35] Error Code 1: Serialization (Serialization assertion safeVersionRead == safeSerializationVersion failed.Version tag does not match. Note: Current Version: 0, Serialized Engine Version: 43)
[04/19/2022-17:44:59] [TRT] [E] 4: [runtime.cpp::deserializeCudaEngine::50] Error Code 4: Internal Error (Engine deserialization failed.)

resulting in a “None” engine.

Can someone explain how to use the models exported by NVIDIA TAO? Thank you!

Please provide the following information when requesting support.

• Hardware (T4/V100/Xavier/Nano/etc):
RTX 2080 Ti

• Network Type (Detectnet_v2/Faster_rcnn/Yolo_v4/LPRnet/Mask_rcnn/Classification/etc)
YoloV3

• TLT Version (Please run “tlt info --verbose” and share “docker_tag” here)

Configuration of the TAO Toolkit Instance

dockers: 		
	nvidia/tao/tao-toolkit-tf: 			
		v3.21.11-tf1.15.5-py3: 				
			docker_registry: nvcr.io
			tasks: 
				1. augment
				2. bpnet
				3. classification
				4. dssd
				5. emotionnet
				6. efficientdet
				7. fpenet
				8. gazenet
				9. gesturenet
				10. heartratenet
				11. lprnet
				12. mask_rcnn
				13. multitask_classification
				14. retinanet
				15. ssd
				16. unet
				17. yolo_v3
				18. yolo_v4
				19. yolo_v4_tiny
				20. converter
		v3.21.11-tf1.15.4-py3: 				
			docker_registry: nvcr.io
			tasks: 
				1. detectnet_v2
				2. faster_rcnn
	nvidia/tao/tao-toolkit-pyt: 			
		v3.21.11-py3: 				
			docker_registry: nvcr.io
			tasks: 
				1. speech_to_text
				2. speech_to_text_citrinet
				3. text_classification
				4. question_answering
				5. token_classification
				6. intent_slot_classification
				7. punctuation_and_capitalization
				8. action_recognition
		v3.22.02-py3: 				
			docker_registry: nvcr.io
			tasks: 
				1. spectro_gen
				2. vocoder
	nvidia/tao/tao-toolkit-lm: 			
		v3.21.08-py3: 				
			docker_registry: nvcr.io
			tasks: 
				1. n_gram
format_version: 2.0
toolkit_version: 3.22.02
published_date: 02/28/2022

• Training spec file (if you have one, please share it here)
Training:

random_seed: 42
yolov3_config {
  big_anchor_shape: "[(114.94, 60.67), (159.06, 114.59), (297.59, 176.38)]"
  mid_anchor_shape: "[(42.99, 31.91), (79.57, 31.75), (56.80, 56.93)]"
  small_anchor_shape: "[(15.60, 13.88), (30.25, 20.25), (20.67, 49.63)]"
  matching_neutral_box_iou: 0.7
  arch: "resnet"
  nlayers: 18
  arch_conv_blocks: 2
  loss_loc_weight: 0.8
  loss_neg_obj_weights: 100.0
  loss_class_weights: 1.0
  freeze_bn: false
  #freeze_blocks: 0
  force_relu: false
}
training_config {
  batch_size_per_gpu: 8
  num_epochs: 80
  enable_qat: false
  checkpoint_interval: 10
  learning_rate {
  soft_start_annealing_schedule {
    min_learning_rate: 1e-6
    max_learning_rate: 1e-4
    soft_start: 0.1
    annealing: 0.5
    }
  }
  regularizer {
    type: L1
    weight: 3e-5
  }
  optimizer {
    adam {
      epsilon: 1e-7
      beta1: 0.9
      beta2: 0.999
      amsgrad: false
    }
  }
  pretrain_model_path: "/workspace/tao-experiments/yolo_v3/pretrained_resnet18/pretrained_object_detection_vresnet18/resnet_18.hdf5"
}
eval_config {
  average_precision_mode: SAMPLE
  batch_size: 8
  matching_iou_threshold: 0.5
}
nms_config {
  confidence_threshold: 0.001
  clustering_iou_threshold: 0.5
  top_k: 200
  force_on_cpu: True
}
augmentation_config {
  hue: 0.1
  saturation: 1.5
  exposure:1.5
  vertical_flip:0
  horizontal_flip: 0.5
  jitter: 0.3
  output_width: 1248
  output_height: 384
  output_channel: 3
  randomize_input_shape_period: 0
}
dataset_config {
  data_sources: {
      tfrecords_path: "/workspace/tao-experiments/data/tfrecords/kitti_trainval/kitti_trainval*"
      image_directory_path: "/workspace/tao-experiments/data/training"
  }
  include_difficult_in_training: true
  image_extension: "png"
  target_class_mapping {
      key: "car"
      value: "car"
  }
  target_class_mapping {
      key: "pedestrian"
      value: "pedestrian"
  }
  target_class_mapping {
      key: "cyclist"
      value: "cyclist"
  }
  target_class_mapping {
      key: "van"
      value: "car"
  }
  target_class_mapping {
      key: "person_sitting"
      value: "pedestrian"
  }
  validation_fold: 0
}

Re-training:

random_seed: 42
yolov3_config {
  big_anchor_shape: "[(114.94, 60.67), (159.06, 114.59), (297.59, 176.38)]"
  mid_anchor_shape: "[(42.99, 31.91), (79.57, 31.75), (56.80, 56.93)]"
  small_anchor_shape: "[(15.60, 13.88), (30.25, 20.25), (20.67, 49.63)]"
  matching_neutral_box_iou: 0.7
  arch: "resnet"
  nlayers: 18
  arch_conv_blocks: 2
  loss_loc_weight: 0.8
  loss_neg_obj_weights: 100.0
  loss_class_weights: 1.0
  freeze_bn: false
  #freeze_blocks: 0
  force_relu: false
}
training_config {
  batch_size_per_gpu: 8
  num_epochs: 80
  enable_qat: false
  checkpoint_interval: 10
  learning_rate {
  soft_start_annealing_schedule {
    min_learning_rate: 1e-6
    max_learning_rate: 1e-4
    soft_start: 0.1
    annealing: 0.5
    }
  }
  regularizer {
    type: NO_REG
    weight: 3e-9
  }
  optimizer {
    adam {
      epsilon: 1e-7
      beta1: 0.9
      beta2: 0.999
      amsgrad: false
    }
  }
  pruned_model_path: "/workspace/tao-experiments/yolo_v3/experiment_dir_pruned/yolov3_resnet18_pruned.tlt"
}
eval_config {
  average_precision_mode: SAMPLE
  batch_size: 8
  matching_iou_threshold: 0.5
}
nms_config {
  confidence_threshold: 0.001
  clustering_iou_threshold: 0.5
  top_k: 200
  force_on_cpu: True
}
augmentation_config {
  hue: 0.1
  saturation: 1.5
  exposure:1.5
  vertical_flip:0
  horizontal_flip: 0.5
  jitter: 0.3
  output_width: 1248
  output_height: 384
  output_channel: 3
  randomize_input_shape_period: 0
}
dataset_config {
  data_sources: {
      tfrecords_path: "/workspace/tao-experiments/data/tfrecords/kitti_trainval/kitti_trainval*"
      image_directory_path: "/workspace/tao-experiments/data/training"
  }
  include_difficult_in_training: true
  image_extension: "png"
  target_class_mapping {
      key: "car"
      value: "car"
  }
  target_class_mapping {
      key: "pedestrian"
      value: "pedestrian"
  }
  target_class_mapping {
      key: "cyclist"
      value: "cyclist"
  }
  target_class_mapping {
      key: "van"
      value: "car"
  }
  target_class_mapping {
      key: "person_sitting"
      value: "pedestrian"
  }
  validation_fold: 0
}

• How to reproduce the issue? (This is for errors. Please share the command line and the detailed log here.)
Run the example notebook “cv_samples_v1.3.0/yolo_v3/yolo_v3.ipynb” and try to instantiate a TrtModel with the code above.
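
For reference, this is the minimal snippet that triggers the error (the engine path is the one from my export step; adjust it to your setup):

model = TrtModel("/root/export/trt_bs1.engine")  # deserialize_cuda_engine() returns None here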

The above error implies that the TensorRT version used to generate the engine is not the same as the TensorRT version used to run inference.
You can either run inference in the environment where you generated the TensorRT engine,
or generate a new TensorRT engine in the environment where you want to run inference.
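
For example, a quick way to confirm the mismatch is to print the TensorRT version in both environments; a serialized engine only loads under the same TensorRT version that built it:

import tensorrt as trt

print(trt.__version__)  # e.g. 8.0.x inside the TAO container vs. 8.2.x on the host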

You are correct: instantiating the TrtModel object inside the same NVIDIA TAO container that produced the engine solves the problem. Unfortunately, my workstation has TensorRT 8.2 while the container uses TensorRT 8.0.

Is there a simple way to downgrade the TensorRT libraries?

I have seen that tao-converter cannot be used with TensorRT 8.2 (TensorRT — TAO Toolkit 3.22.05 documentation).

I’m now experimenting with the engine inside the TAO Docker container, using my Python class.

I have successfully extracted the YOLOv3 output tensors (num_detections, nmsed_boxes, nmsed_scores and nmsed_classes), but I get different detections than with the tao yolo_v3 inference command.

Has anyone already used an exported engine with custom code (not DeepStream)?

I’m attaching my script:

import numpy as np
import pycuda.autoinit  # noqa: F401 -- initializes the CUDA context
import pycuda.driver as cuda
import tensorrt as trt


class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


class TrtModel:

    """
    This class is copied from: https://stackoverflow.com/questions/59280745/inference-with-tensorrt-engine-file-on-python
    and follows the instructions provided in the "NVIDIA TensorRT Developer Guide"
    """

    def __init__(self, engine_path, max_batch_size=1, dtype=np.float32):

        self.engine_path = engine_path
        self.dtype = dtype
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.runtime = trt.Runtime(self.logger)
        self.engine = self.load_engine(self.runtime, self.engine_path)
        self.engine_batch_size = self.engine.max_batch_size
        self.max_batch_size = max_batch_size
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()
        self.context = self.engine.create_execution_context()
        print("TRT model initialized")

    @staticmethod
    def load_engine(trt_runtime, engine_path):
        trt.init_libnvinfer_plugins(None, "")
        with open(engine_path, 'rb') as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine

    def allocate_buffers(self):

        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()

        for binding in self.engine:
            size = trt.volume(self.engine.get_binding_shape(binding)) # * self.max_batch_size
            print(f"binding shape: {self.engine.get_binding_shape(binding)}")
            host_mem = cuda.pagelocked_empty(size, self.dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)

            bindings.append(int(device_mem))

            if self.engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))

        return inputs, outputs, bindings, stream

    def format_output(self, batch_size):
        return [out.host.reshape(batch_size, -1) for out in self.outputs]

    def __call__(self, x: np.ndarray, batch_size=1):

        x = x.astype(self.dtype)

        print(x.shape)

        np.copyto(self.inputs[0].host, x.ravel())
        # np.copyto(self.inputs[1].host, x.ravel())

        for inp in self.inputs:
            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)

        # self.context.execute_async(batch_size=batch_size, bindings=self.bindings, stream_handle=self.stream.handle)
        self.context.execute_async(batch_size=self.engine_batch_size, bindings=self.bindings, stream_handle=self.stream.handle)
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)

        self.stream.synchronize()
        # return [out.host.reshape(batch_size, -1) for out in self.outputs]
        return self.format_output(batch_size)

class TAOYoloV3(TrtModel):

    def __init__(self, engine_path, max_batch_size=1, dtype=np.float32):
        super().__init__(engine_path, max_batch_size, dtype)

    def format_output(self, batch_size):
        num_detections = self.outputs[0].host
        nmsed_boxes = self.outputs[1].host
        nmsed_boxes = nmsed_boxes.reshape(batch_size, -1, 4)
        nmsed_scores = self.outputs[2].host
        nmsed_classes = self.outputs[3].host
        return num_detections, nmsed_boxes, nmsed_scores, nmsed_classes


if __name__ == '__main__':
    import cv2

    # instantiate tensorrt model handler
    e = TAOYoloV3("/root/export/trt_bs1.engine")

    # read input image with OpenCV (BGR Image)
    test_input = cv2.imread("/root/test_data/000009.png")
    
    # preprocess image:
    # - resize to input width and height
    # - convert the color to RGB (is it really needed?)

    test_input = cv2.resize(test_input, (1248, 384))
    test_input = cv2.cvtColor(test_input, cv2.COLOR_BGR2RGB)
    test_input_cpy = test_input.copy()

    # create mini-batch
    test_input = np.array([test_input], dtype=np.float32)

    # - reshape to NCHW format (is it really needed?)
    # NOTE: (0, 3, 2, 1) actually produces NCWH (height and width are swapped);
    # the usual NCHW permutation would be (0, 3, 1, 2).
    test_input = np.transpose(test_input, (0, 3, 2, 1))

    print(f"input shape: {test_input.shape}")
    out = e(test_input)
    num_detections, nmsed_boxes, nmsed_scores, nmsed_classes = out
    print(f"num detections: {num_detections}")
 
    # draw boxes
    w = 1248
    h = 384
    for box, s, c in zip(nmsed_boxes[0], nmsed_scores, nmsed_classes):

        if s > 0.1:

            x, y, X, Y = box

            # I'm not sure of this conversion... TAO does not provide any
            # documentation on the box format.
            x, y, X, Y = int(x*w), int(y*h), int(X*w), int(Y*h)

            print(f"box: {x, y, X, Y}, score: {s:.3f}, class: {c}")

            cv2.rectangle(test_input_cpy, (x, y), (X, Y), color=(0, 255, 0), thickness=2)

    cv2.imshow("test", cv2.cvtColor(test_input_cpy, cv2.COLOR_RGB2BGR))
    cv2.waitKey(0)
    cv2.destroyAllWindows()

For the post-processing of YOLOv3, you can refer to tao-toolkit-triton-apps/yolov3_postprocessor.py at main · NVIDIA-AI-IOT/tao-toolkit-triton-apps · GitHub
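
Building on that reference, below is a minimal sketch of decoding the BatchedNMS outputs into pixel-space boxes. It assumes the boxes are normalized [x1, y1, x2, y2] values and that only the first num_detections entries per image are valid; treat yolov3_postprocessor.py as the authoritative implementation.

def decode_batched_nms(num_detections, nmsed_boxes, nmsed_scores,
                       nmsed_classes, img_w, img_h, min_score=0.1):
    """Sketch of a BatchedNMS decoder for one image.

    Assumes normalized [x1, y1, x2, y2] boxes; verify the exact layout
    against yolov3_postprocessor.py in tao-toolkit-triton-apps.
    """
    n = int(num_detections[0])  # only the first n entries hold real detections
    detections = []
    for box, score, cls in zip(nmsed_boxes[0][:n], nmsed_scores[:n], nmsed_classes[:n]):
        if score < min_score:
            continue  # drop low-confidence detections
        x1, y1, x2, y2 = box
        detections.append((int(x1 * img_w), int(y1 * img_h),   # scale to pixels
                           int(x2 * img_w), int(y2 * img_h),
                           float(score), int(cls)))
    return detections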

Thank you! And what about the pre-processing? I mean the steps from reading the image with OpenCV to passing it to the model.

See tao-toolkit-triton-apps/frame.py at main · NVIDIA-AI-IOT/tao-toolkit-triton-apps · GitHub

Thank you again, I now see more detections than with my previous preprocessing method.
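
For future readers, this is roughly the preprocessing I ended up with after reading frame.py. The BGR channel order and the ImageNet mean offsets are my reading of the TAO/DeepStream sample configs, so please verify them against frame.py before relying on this:

import cv2
import numpy as np

def preprocess(image_path, width=1248, height=384):
    """Sketch of TAO YOLOv3 preprocessing; verify against frame.py.

    Assumes BGR channel order with per-channel mean subtraction
    (ImageNet means) and no further scaling.
    """
    img = cv2.imread(image_path)               # BGR, HWC, uint8
    img = cv2.resize(img, (width, height))     # model input resolution
    img = img.astype(np.float32)
    img -= np.array([103.939, 116.779, 123.68], dtype=np.float32)  # B, G, R means
    img = img.transpose(2, 0, 1)               # HWC -> CHW
    return img[np.newaxis, ...]                # NCHW mini-batch of one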

I want to thank you for your support!

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.