Description
I want to write a plugin that only supports INT8 inputs/outputs. Here is my code for supportsFormatCombination, using conv2d as an example:
bool GroupConv2dPlugin::supportsFormatCombination(
    int32_t pos, const nvinfer1::PluginTensorDesc* inOut, int32_t nbInputs,
    int32_t nbOutputs) noexcept {
  // Supported combination:
  //   input = filter = output = INT8 in NCxHWx<32> (kCHW32); bias = FLOAT in kLINEAR.
  // input
  if (pos == 0) {
    return inOut[pos].type == nvinfer1::DataType::kINT8 &&
           inOut[pos].format == nvinfer1::TensorFormat::kCHW32;
  }
  // bias
  if (pos == 2) {
    return inOut[0].type == nvinfer1::DataType::kINT8 &&
           inOut[pos].type == nvinfer1::DataType::kFLOAT &&
           inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
  }
  // filter and output follow the input's type and format
  return inOut[pos].type == inOut[0].type &&
         inOut[pos].format == inOut[0].format;
}
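For completeness, the output data type is reported as INT8 as well; a simplified sketch of the matching getOutputDataType override (my actual implementation may carry extra checks):

nvinfer1::DataType GroupConv2dPlugin::getOutputDataType(
    int32_t index, const nvinfer1::DataType* inputTypes,
    int32_t nbInputs) const noexcept {
  // single output, always INT8 (simplified sketch)
  return nvinfer1::DataType::kINT8;
}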
I then build the engine with the TensorRT Python API using the following code:
# get onnx model
torch.onnx.export(
    module.float(),  # including my own plugin
    args,
    path,
    input_names=list(inputs.keys()),
    output_names=list(module.output_shapes.keys()),
    opset_version=opset_version,
    operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH,
)
# from onnx to engine
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def build_engine(
    onnx_file,
    int8=False,  # True in my case
    fp16=False,  # False in my case
    max_workspace_size=1,
    calibrator=None,
):
    """Takes an ONNX file and creates a TensorRT engine to run inference with"""
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
        EXPLICIT_BATCH
    ) as network, builder.create_builder_config() as config, trt.OnnxParser(
        network, TRT_LOGGER
    ) as parser, trt.Runtime(
        TRT_LOGGER
    ) as runtime:
        config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED
        # workspace limit in multiples of 4 GiB (1 << 32 bytes)
        config.set_memory_pool_limit(
            trt.MemoryPoolType.WORKSPACE, (1 << 32) * max_workspace_size
        )
        # Parse model file
        if not parser.parse(onnx_file.read()):
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None
        if int8:
            config.set_flag(trt.BuilderFlag.INT8)
            config.int8_calibrator = calibrator
        if fp16:
            config.set_flag(trt.BuilderFlag.FP16)
        plan = builder.build_serialized_network(network, config)
        engine = runtime.deserialize_cuda_engine(plan)
        return engine
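For completeness, this is roughly how the plugin library is loaded and build_engine is called (the library path, model path, and calibrator name below are placeholders, not my exact code):

import ctypes
import tensorrt as trt

# Load the compiled plugin library so its creator registers with TensorRT,
# then register all plugins with the global registry before parsing.
ctypes.CDLL("./libgroup_conv2d_plugin.so")  # placeholder path
trt.init_libnvinfer_plugins(TRT_LOGGER, "")

with open("/tmp/model.onnx", "rb") as onnx_file:  # placeholder path
    engine = build_engine(onnx_file, int8=True, calibrator=my_calibrator)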
However, builder.build_serialized_network returns None, with the following errors.
From the TRT log:
[06/18/2024-07:38:42] [TRT] [I] Timing cache disabled. Turning it on will improve builder speed.
[06/18/2024-07:38:42] [TRT] [V] Constructing optimization profile number 0 [1/1].
[06/18/2024-07:38:42] [TRT] [E] 9: [pluginV2Builder.cpp::reportPluginError::23] Error Code 9: Internal Error (/$MYPLUGIN: could not find any supported formats consistent with input/output data types)
[06/18/2024-07:38:42] [TRT] [E] 2: [builder.cpp::buildSerializedNetwork::751] Error Code 2: Internal Error (Assertion engine != nullptr failed. )
Then the Python error:
onnx_file = <_io.BufferedReader name='/tmp/tmp50t98b9f'>, int8 = True, fp16 = False,
max_workspace_size = 1, calibrator = <tests.common.utils.Calibrator object at 0x7f078d419770>

    def build_engine(
        onnx_file,
        int8=False,
        fp16=False,
        int8_fp16=False,
        max_workspace_size=1,
        calibrator=None,
    ):
        """Takes an ONNX file and creates a TensorRT engine to run inference with"""
        with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
            EXPLICIT_BATCH
        ) as network, builder.create_builder_config() as config, trt.OnnxParser(
            network, TRT_LOGGER
        ) as parser, trt.Runtime(
            TRT_LOGGER
        ) as runtime:
            config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED
            # max_workspace_size GB
            config.set_memory_pool_limit(
                trt.MemoryPoolType.WORKSPACE, (1 << 32) * max_workspace_size
            )
            # Parse model file
            if not parser.parse(onnx_file.read()):
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None
            if int8:
                config.set_flag(trt.BuilderFlag.INT8)
                config.int8_calibrator = calibrator
            if fp16:
                config.set_flag(trt.BuilderFlag.FP16)
            plan = builder.build_serialized_network(network, config)
>           engine = runtime.deserialize_cuda_engine(plan)
E           TypeError: deserialize_cuda_engine(): incompatible function arguments. The following argument types are supported:
E               1. (self: tensorrt.tensorrt.Runtime, serialized_engine: buffer) -> tensorrt.tensorrt.ICudaEngine
E
E           Invoked with: <tensorrt.tensorrt.Runtime object at 0x7f07707848b0>, None
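(The TypeError itself is only a consequence of plan being None; a guard like the sketch below would surface the real build failure, but the underlying question is still why the build fails.)

plan = builder.build_serialized_network(network, config)
if plan is None:
    # build failed; the actual cause is in the TRT log above
    raise RuntimeError("build_serialized_network returned None")
engine = runtime.deserialize_cuda_engine(plan)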
Environment
TensorRT Version: 8.5.3
GPU Type: RTX 4090
Nvidia Driver Version: 545.23.08
CUDA Version: 11.8
Operating System + Version: Ubuntu 22.04
Python Version (if applicable): 3.8.10
PyTorch Version (if applicable): 2.3
Question
When I add a float implementation, those errors go away (see the sketch below for roughly what I added). So I am wondering: is it possible to write a plugin that only supports INT8, or am I making a mistake when building the TRT engine?
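A sketch of the variant that builds successfully (simplified from what I actually tried; it accepts an all-FP32/kLINEAR combination in addition to the INT8/kCHW32 one):

bool GroupConv2dPlugin::supportsFormatCombination(
    int32_t pos, const nvinfer1::PluginTensorDesc* inOut, int32_t nbInputs,
    int32_t nbOutputs) noexcept {
  bool const fp32Linear = inOut[pos].type == nvinfer1::DataType::kFLOAT &&
                          inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
  // input: either FP32/kLINEAR or INT8/kCHW32
  if (pos == 0) {
    return fp32Linear ||
           (inOut[pos].type == nvinfer1::DataType::kINT8 &&
            inOut[pos].format == nvinfer1::TensorFormat::kCHW32);
  }
  // bias: FP32/kLINEAR in both combinations
  if (pos == 2) {
    return fp32Linear;
  }
  // filter and output follow the input's type and format
  return inOut[pos].type == inOut[0].type &&
         inOut[pos].format == inOut[0].format;
}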