Description
I am trying to test the “CustomSkipLayerNormPluginDynamic” plugin with INT8. I can build the engine with FP16/FP32, but with INT8 the build fails with: [PluginV2DynamicExt]: could not find any supported formats consistent with input/output data types.
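For reference, a minimal diagnostic sketch that lists which versions of the plugin the loaded libnvinfer_plugin.so actually registers (the printed versions depend on the plugin build; different versions advertise different supported type/format combinations, which may matter for INT8):

import ctypes
import tensorrt as trt

ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
trt.init_libnvinfer_plugins(TRT_LOGGER, "")

# Print every registered creator named CustomSkipLayerNormPluginDynamic
# together with its version string.
for creator in trt.get_plugin_registry().plugin_creator_list:
    if creator.name == "CustomSkipLayerNormPluginDynamic":
        print(creator.name, creator.plugin_version)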
Environment
docker image: tensorrt-ubuntu20.04-cuda12.0:latest
GPU Type: A30
Steps To Reproduce
Log:
trtuser@2107802eea6a:/workspace/TensorRT/demo/plugin_test$ python builder.py
[04/11/2023-07:43:06] [TRT] [I] [MemUsageChange] Init CUDA: CPU +455, GPU +0, now: CPU 478, GPU 15798 (MiB)
[04/11/2023-07:43:35] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +1368, GPU +302, now: CPU 1923, GPU 16100 (MiB)
[04/11/2023-07:43:35] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading
builder.py:95: DeprecationWarning: Use build_serialized_network instead.
engine = builder.build_engine(network, builder_config)
[04/11/2023-07:43:37] [TRT] [I] Graph optimization time: 7.6554e-05 seconds.
[04/11/2023-07:43:44] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +1711, GPU +370, now: CPU 3681, GPU 16482 (MiB)
[04/11/2023-07:43:46] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +251, GPU +58, now: CPU 3932, GPU 16540 (MiB)
[04/11/2023-07:43:46] [TRT] [I] Timing cache disabled. Turning it on will improve builder speed.
[04/11/2023-07:43:46] [TRT] [E] 9: (Unnamed Layer* 0) [PluginV2DynamicExt]: could not find any supported formats consistent with input/output data types
[04/11/2023-07:43:46] [TRT] [E] 9: [pluginV2Builder.cpp::reportPluginError::24] Error Code 9: Internal Error ((Unnamed Layer* 0) [PluginV2DynamicExt]: could not find any supported formats consistent with input/output data types)
Traceback (most recent call last):
File "builder.py", line 114, in <module>
main()
File "builder.py", line 106, in main
serialized_engine = engine.serialize()
AttributeError: 'NoneType' object has no attribute 'serialize'
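(The AttributeError itself is just fallout from the failed build: builder.build_engine returns None when the build fails, so engine.serialize() is called on None.)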
Code:
import argparse
import ctypes
import json
import os
import os.path
import pdb
import re
import sys
import time

import numpy as np
import onnx

# Select the GPU before any CUDA context is created: CUDA_VISIBLE_DEVICES has
# no effect once pycuda.autoinit has initialized the driver.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import pycuda.autoinit

# TensorRT
import tensorrt as trt

sys.path.insert(0, '/workspace/TensorRT/demo/BERT')
from helpers.calibrator import BertCalibrator as BertCalibrator

"""
TensorRT Initialization
"""
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
trt_version = [int(n) for n in trt.__version__.split('.')]
# Import necessary plugins for demoBERT
plugin_lib_name = "nvinfer_plugin.dll" if sys.platform == "win32" else "libnvinfer_plugin.so"
env_name_to_add_path = "PATH" if sys.platform == "win32" else "LD_LIBRARY_PATH"
handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
if not handle:
    raise RuntimeError("Could not load plugin library. Is `{}` on your {}?".format(plugin_lib_name, env_name_to_add_path))
trt.init_libnvinfer_plugins(TRT_LOGGER, "")
plg_registry = trt.get_plugin_registry()
skln_plg_creator = plg_registry.get_plugin_creator("CustomSkipLayerNormPluginDynamic", "1", "")
bs = 10
seq = 512
h = 768
# dtype = trt.float32
# dtype = trt.float16
dtype = trt.DataType.INT8
def skipln(init_dict, network, input_tensor, skip, bias=None):
    """
    Add the skip + layer-norm plugin layer.
    """
    idims = input_tensor.shape
    hidden_size = idims[2]
    pf_ld = trt.PluginField("ld", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32)
    wbeta = init_dict["beta"]
    pf_beta = trt.PluginField("beta", wbeta.numpy(), trt.PluginFieldType.FLOAT32)
    wgamma = init_dict["gamma"]
    pf_gamma = trt.PluginField("gamma", wgamma.numpy(), trt.PluginFieldType.FLOAT32)
    # type_id selects the plugin's compute type: 0 = FP32, 1 = FP16, 2 = INT8.
    pf_type = trt.PluginField("type_id", np.array([int(dtype)], np.int32), trt.PluginFieldType.INT32)
    fields = [pf_ld, pf_beta, pf_gamma, pf_type]
    if bias is not None:
        pf_bias = trt.PluginField("bias", bias.numpy(), trt.PluginFieldType.FLOAT32)
        fields.append(pf_bias)
    pfc = trt.PluginFieldCollection(fields)
    skipln_plug = skln_plg_creator.create_plugin("skipln", pfc)
    skipln_inputs = [input_tensor, skip]
    layer = network.add_plugin_v2(skipln_inputs, skipln_plug)
    return layer
def build_engine(init_dict):
    explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(explicit_batch_flag) as network, builder.create_builder_config() as builder_config:
        if dtype == trt.float16:
            builder_config.set_flag(trt.BuilderFlag.FP16)
        if dtype == trt.DataType.INT8:
            builder_config.set_flag(trt.BuilderFlag.INT8)
            calibrationCacheFile = "BertSquadL{}H{}A{}S{}CalibCache".format(1, h, 1, seq)
            calibrator = BertCalibrator('/workspace/TensorRT/demo/BERT/squad/dev-v1.1.json',
                                        '/workspace/TensorRT/demo/BERT/models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt',
                                        calibrationCacheFile, 1, seq, 100)
            builder_config.set_quantization_flag(trt.QuantizationFlag.CALIBRATE_BEFORE_FUSION)
            builder_config.int8_calibrator = calibrator
        builder_config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED
        input_tensor0 = network.add_input(name="input_tensor0", dtype=dtype, shape=(bs, seq, h))
        input_tensor1 = network.add_input(name="input_tensor1", dtype=dtype, shape=(bs, seq, h))
        skiplayer = skipln(init_dict, network, input_tensor0, input_tensor1, None)
        # skiplayer.set_output_type(0, trt.DataType.FLOAT)
        out = skiplayer.get_output(0)
        network.mark_output(out)
        engine = builder.build_engine(network, builder_config)
        if dtype == trt.DataType.INT8:
            calibrator.free()
        return engine
def main():
    init_dict = {
        'beta': trt.Weights(np.ascontiguousarray(np.float32(np.random.rand(1, 1, h)))),
        'gamma': trt.Weights(np.ascontiguousarray(np.float32(np.random.rand(1, 1, h))))
    }
    engine = build_engine(init_dict)
    TRT_LOGGER.log(TRT_LOGGER.VERBOSE, "Serializing Engine...")
    serialized_engine = engine.serialize()
    TRT_LOGGER.log(TRT_LOGGER.INFO, "Saving Engine to {:}".format('./ln{0}_{1}_{2}.engine'.format(bs, seq, h)))
    with open('./ln{0}_{1}_{2}.engine'.format(bs, seq, h), "wb") as fout:
        fout.write(serialized_engine)
    TRT_LOGGER.log(TRT_LOGGER.INFO, "Done.")

if __name__ == "__main__":
    main()
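One possibly related detail: the network inputs and output here are declared INT8 but never given explicit dynamic ranges. A minimal sketch of setting them inside build_engine, before builder.build_engine is called, in case that is what the builder requires for INT8 I/O (the ±1.0 ranges are arbitrary placeholders, not calibrated values):

# Assumption: INT8 network inputs/outputs may need explicit dynamic ranges
# when no calibration scales are available for them.
if dtype == trt.DataType.INT8:
    input_tensor0.set_dynamic_range(-1.0, 1.0)
    input_tensor1.set_dynamic_range(-1.0, 1.0)
    out.set_dynamic_range(-1.0, 1.0)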