I have been trying to quantize YOLOX from float32 to int8 and then convert the quantized ONNX output to a TensorRT engine.
The quantization step seems to work, but I get several different exceptions when trying to convert the result to TRT.
Below is the code that I use for quantization:
import numpy as np
from onnxruntime.quantization import quantize_static, CalibrationMethod, CalibrationDataReader, QuantType, QuantFormat
# paths for the float32 input model and the quantized int8 output model
onnx_model_input_path = "yolox_l.onnx"
onnx_model_output_path = "output.onnx"
# calibration dataset (dummy data for calibration)
class DummyDataReader(CalibrationDataReader):
    def __init__(self, num_samples):
        self.num_samples = num_samples
        self.current_sample = 0

    def get_next(self):
        if self.current_sample < self.num_samples:
            input_data = self.generate_random_input()
            self.current_sample += 1
            return {'images': input_data}
        return None

    def generate_random_input(self):
        # random values in [-1, 1) with the model's NCHW input shape
        return np.random.uniform(-1, 1, size=input_shape).astype(np.float32)
num_calibration_samples = 100
input_shape = (1, 3, 640, 640)
calibration_data_reader = DummyDataReader(num_samples=num_calibration_samples)
# Quantize the model to int8; quantize_static writes the result to model_output
quantize_static(
    model_input=onnx_model_input_path,
    model_output=onnx_model_output_path,
    calibration_data_reader=calibration_data_reader,
    activation_type=QuantType.QInt8,
    weight_type=QuantType.QInt8,
    quant_format=QuantFormat.QDQ,
    per_channel=False,
    calibrate_method=CalibrationMethod.MinMax,
)
This outputs a ~55 MB ONNX file, whereas the original YOLOX-Large model is ~450 MB.
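By "seems OK" I mean the quantized model loads and runs under ONNX Runtime itself. This is roughly the sanity check I use (a minimal sketch that just pushes one dummy input through the quantized model):
import numpy as np
import onnxruntime as ort
# load the quantized model and run a single dummy inference through it
sess = ort.InferenceSession("output.onnx", providers=["CPUExecutionProvider"])
dummy = np.random.uniform(-1, 1, size=(1, 3, 640, 640)).astype(np.float32)
outputs = sess.run(None, {"images": dummy})
print([o.shape for o in outputs])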
Now for the errors. Below is the code I use to convert the quantized ONNX model to a TRT engine:
import pycuda.driver as cuda
import pycuda.autoinit
from typing import List
import tensorrt as trt
import numpy as np
import time
import cv2
import os
...
TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
def build_engine(self):
    with trt.Builder(TRT_LOGGER) as builder, \
         builder.create_network(EXPLICIT_BATCH) as network, \
         builder.create_builder_config() as config, \
         trt.OnnxParser(network, TRT_LOGGER) as parser, \
         trt.Runtime(TRT_LOGGER) as runtime:
        config.max_workspace_size = 1 << self.max_workspace_size
        builder.max_batch_size = self.max_batch_size
        assert os.path.exists(self.onnx_file_path), \
            'ONNX file {} not found, please generate it first.'.format(self.onnx_file_path)
        self.logger.info('Loading ONNX file from path {}...'.format(self.onnx_file_path))
        with open(self.onnx_file_path, 'rb') as model:
            self.logger.info('Beginning ONNX file parsing')
            if not parser.parse(model.read()):
                self.logger.error('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    self.logger.error(parser.get_error(error))
                return None
        network.get_input(0).shape = self.input_shape
        plan = builder.build_serialized_network(network, config)
        engine = runtime.deserialize_cuda_engine(plan)
        with open(self.engine_file_path, "wb") as f:
            f.write(plan)
        return engine
I can successfully convert the original yolox_l.onnx to a TRT engine with the method above. With the quantized model, however, it returns None because the parser cannot parse the ONNX file, failing with the error below:
[09/04/2023-10:46:19] [TRT] [E] head.cls_preds.0.bias_DequantizeLinear_dequantize_scale_node: only activation types allowed as input to this layer.
ERROR:root:ERROR: Failed to parse the ONNX file.
ERROR:root:In node 0 (parseGraph): INVALID_NODE: Invalid Node - head.cls_preds.0.bias_DequantizeLinear
head.cls_preds.0.bias_DequantizeLinear_dequantize_scale_node: only activation types allowed as input to this layer.
Traceback (most recent call last):
  File "./web_server/app.py", line 45, in <module>
    detect.initialize()
  File "src/detector/detector.py", line 81, in initialize
    self.context = self.engine.create_execution_context()
AttributeError: 'NoneType' object has no attribute 'create_execution_context'
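The node name from the TRT error can be located in the quantized graph with a short script (a minimal sketch using the onnx package; it just prints the DequantizeLinear node that TRT rejects):
import onnx
# find the bias DequantizeLinear node that the TRT parser complains about
model = onnx.load("output.onnx")
for node in model.graph.node:
    if node.op_type == "DequantizeLinear" and "cls_preds.0.bias" in node.name:
        print(node.name, list(node.input))
So the parser is rejecting the DequantizeLinear node that feeds the quantized bias of the classification head, not an activation tensor.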
I tried quantizing the model with quantize_dynamic() instead, but that raises a different set of exceptions.
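For completeness, the quantize_dynamic() attempt was roughly this (a sketch; dynamic quantization quantizes weights offline and activations at runtime):
from onnxruntime.quantization import quantize_dynamic, QuantType
# dynamic quantization: weights quantized offline, activations at runtime
quantize_dynamic(
    model_input="yolox_l.onnx",
    model_output="output_dynamic.onnx",
    weight_type=QuantType.QInt8,
)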
I have also varied the parameters of quantize_static(). When I set the weight and activation types to QUInt8, it failed with an "asymmetric quantization is not supported" error. I also tried the QOperator quant format instead of QDQ; that quantizes, but then gives another error at the TRT conversion phase.
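Roughly, those variants looked like this (same call as above, only the quoted arguments changed):
# variant 1: unsigned 8-bit types -> "asymmetric quantization is not supported"
quantize_static(
    model_input=onnx_model_input_path,
    model_output=onnx_model_output_path,
    calibration_data_reader=DummyDataReader(num_samples=num_calibration_samples),
    activation_type=QuantType.QUInt8,
    weight_type=QuantType.QUInt8,
    quant_format=QuantFormat.QDQ,
    per_channel=False,
    calibrate_method=CalibrationMethod.MinMax,
)
# variant 2: QOperator format instead of QDQ -> another error at TRT conversion
quantize_static(
    model_input=onnx_model_input_path,
    model_output=onnx_model_output_path,
    calibration_data_reader=DummyDataReader(num_samples=num_calibration_samples),
    activation_type=QuantType.QInt8,
    weight_type=QuantType.QInt8,
    quant_format=QuantFormat.QOperator,
    per_channel=False,
    calibrate_method=CalibrationMethod.MinMax,
)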
In short: I need to quantize the YOLOX-Large model to int8 and then convert the quantized model to a TensorRT engine. Any help would be appreciated. Thanks in advance.