Description
First, I converted the PyTorch model to ONNX with the following command in the terminal:
python -m transformers.onnx --model=monologg/bert-base-cased-goemotions-original bertbase-cased-goemotion-original/ --feature causal-lm
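For reference, the input names and dynamic axes of the exported graph can be checked with a small script like the one below (just a sketch using the onnx package; the path is the one the build script below uses):

import onnx

# Load and sanity-check the exported model
model_path = "/yazdani/bert-base-cased-goemotion-original/model.onnx"
onnx_model = onnx.load(model_path)
onnx.checker.check_model(onnx_model)

# Print each graph input with its (possibly dynamic) dimensions
for inp in onnx_model.graph.input:
    dims = [d.dim_param or d.dim_value for d in inp.type.tensor_type.shape.dim]
    print(inp.name, dims)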
Then I built a TensorRT engine with the following Python script:
import pycuda.driver as cuda
import pycuda.autoinit
import onnx
import argparse
import tensorrt as trt
import os
import sys


def build_engine(model_file, shapes, max_ws=512 * 1024 * 1024, fp16=True):
    TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
    builder = trt.Builder(TRT_LOGGER)
    # builder.fp16_mode = fp16
    config = builder.create_builder_config()
    # config.max_workspace_size = max_ws
    config.max_workspace_size = 1 << 31
    if fp16:
        config.set_flag(trt.BuilderFlag.FP16)
        # config.flags |= 1 << int(trt.BuilderFlag.FP16)

    # Optimization profile covering the dynamic input shapes
    profile = builder.create_optimization_profile()
    for s in shapes:
        profile.set_shape(s['name'], min=s['min'], opt=s['opt'], max=s['max'])
    config.add_optimization_profile(profile)

    # Added for further optimization
    config.set_tactic_sources(1 << int(trt.TacticSource.CUBLAS))

    explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(explicit_batch)
    with trt.OnnxParser(network, TRT_LOGGER) as parser:
        with open(model_file, 'rb') as model:
            parsed = parser.parse(model.read())
            last_layer = network.get_layer(network.num_layers - 1)
            # Check whether the last layer has its output marked
            if not last_layer.get_output(0):
                # If not, mark the output using the TensorRT API
                network.mark_output(last_layer.get_output(0))
            for i in range(parser.num_errors):
                print("TensorRT ONNX parser error:", parser.get_error(i))
            engine = builder.build_serialized_network(network, config=config)
            return engine


sys.path.append('./')
# from trt_utils import build_engine

model_name = "bert-base-cased-goemotions-ekman_model"
fp16 = False
engine_prec = "_fp16" if fp16 else "_fp32"
out_folder = "/3" if fp16 else "/1"
model = "/yazdani/bert-base-cased-goemotion-original/model.onnx"
output = "/yazdani/bert-base-cased-goemotion-original/trt"

shape = [{"name": "input_ids", "min": (1, 3), "opt": (1, 20), "max": (1, 80)},
         {"name": "attention_mask", "min": (1, 3), "opt": (1, 20), "max": (1, 80)},
         {"name": "token_type_ids", "min": (1, 3), "opt": (1, 20), "max": (1, 80)}]

if model != "":
    print("Building model ...")
    model_engine = build_engine(model, shapes=shape, fp16=fp16)
    if model_engine is not None:
        engine_path = os.path.join(output, "model" + engine_prec + ".plan")
        with open(engine_path, 'wb') as f:
            f.write(model_engine)
            # f.write(model_engine.serialize())
    else:
        print("Failed to build engine from", model)

sys.exit()
Finally, I tried to run inference with the engine using the following script:
import tensorrt as trt
from transformers import BertTokenizer
import torch
import pycuda.driver as cuda
import pycuda.autoinit
import time
import data_processing as dp
import numpy as np
import ctypes
import os

ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)


def is_dimension_dynamic(dim):
    return dim is None or dim <= 0


def is_shape_dynamic(shape):
    return any([is_dimension_dynamic(dim) for dim in shape])


def run_trt_engine(context, engine, tensors):
    bindings = [None] * engine.num_bindings
    for name, tensor in tensors['inputs'].items():
        idx = engine.get_binding_index(name)
        bindings[idx] = tensor.data_ptr()
        if engine.is_shape_binding(idx) and is_shape_dynamic(context.get_shape(idx)):
            context.set_shape_input(idx, tensor)
        elif is_shape_dynamic(engine.get_binding_shape(idx)):
            context.set_binding_shape(idx, tensor.shape)
    for name, tensor in tensors['outputs'].items():
        idx = engine.get_binding_index(name)
        bindings[idx] = tensor.data_ptr()
    stream = cuda.Stream()
    context.execute_v2(bindings=bindings)


def load_engine(engine_filepath, trt_logger):
    with open(engine_filepath, "rb") as f, trt.Runtime(trt_logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    return engine


def engine_info(engine_filepath):
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    engine = load_engine(engine_filepath, TRT_LOGGER)

    binding_template = r"""{btype} {{
    name: "{bname}"
    data_type: {dtype}
    dims: {dims}}}"""
    type_mapping = {"DataType.HALF": "TYPE_FP16",
                    "DataType.FLOAT": "TYPE_FP32",
                    "DataType.INT32": "TYPE_INT32",
                    "DataType.BOOL": "TYPE_BOOL"}

    print("engine name", engine.name)
    print("has_implicit_batch_dimension", engine.has_implicit_batch_dimension)
    start_dim = 0 if engine.has_implicit_batch_dimension else 1
    print("num_optimization_profiles", engine.num_optimization_profiles)
    print("max_batch_size:", engine.max_batch_size)
    print("device_memory_size:", engine.device_memory_size)
    print("num_layers:", engine.num_layers)

    for i in range(engine.num_bindings):
        btype = "input" if engine.binding_is_input(i) else "output"
        bname = engine.get_binding_name(i)
        dtype = engine.get_binding_dtype(i)
        bdims = engine.get_binding_shape(i)
        config_values = {
            "btype": btype,
            "bname": bname,
            "dtype": type_mapping[str(dtype)],
            "dims": list(bdims[start_dim:])
        }
        final_binding_str = binding_template.format_map(config_values)
    return engine


engine_filepath = "/yazdani/bert-base-cased-goemotion-original/trt/model_fp32.plan"
engine = engine_info(engine_filepath)
context = engine.create_execution_context()

batch_size = 1
tokenizer = BertTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-original")
inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
seq_len = len(inputs.input_ids[0])

bert_output = torch.zeros((batch_size, seq_len, 28996)).cpu().detach().numpy()

d_input_ids = cuda.mem_alloc(batch_size * inputs.input_ids.nbytes)
d_token_type_ids = cuda.mem_alloc(batch_size * inputs.token_type_ids.nbytes)
d_attention_mask = cuda.mem_alloc(batch_size * inputs.attention_mask.nbytes)
d_output = cuda.mem_alloc(batch_size * bert_output.nbytes)

bindings = [int(d_input_ids), int(d_token_type_ids), int(d_attention_mask), int(d_output)]

stream = cuda.Stream()
cuda.memcpy_htod_async(d_input_ids, inputs.input_ids, stream)
cuda.memcpy_htod_async(d_token_type_ids, inputs.token_type_ids, stream)
cuda.memcpy_htod_async(d_attention_mask, inputs.attention_mask, stream)
context.execute_async(batch_size, bindings, stream.handle, None)
cuda.memcpy_dtoh_async(bert_output, d_output, stream)
stream.synchronize()

pred = torch.tensor(bert_output)
print("##########done#########")
However, I encountered the error below:
[04/19/2022-12:18:02] [TRT] [E] 3: [executionContext.cpp::resolveSlots::1480] Error Code 3: API Usage Error (Parameter check failed at: runtime/api/executionContext.cpp::resolveSlots::1480, condition: allInputDimensionsSpecified(routine)
)
[04/19/2022-12:18:02] [TRT] [E] 2: [executionContext.cpp::enqueueInternal::366] Error Code 2: Internal Error (Could not resolve slots: )
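From the message, my guess is that the execution context never receives the actual input shapes: the engine was built with dynamic shapes (the optimization profile above), but the inference script calls execute_async without first calling set_binding_shape for each input. Below is a minimal sketch of what I think is missing right before the enqueue (assuming set_binding_shape and execute_async_v2 are the right calls for an explicit-batch engine); I am not sure this is the correct fix:

# Sketch: give the context this request's concrete input shapes before enqueueing
context.set_binding_shape(engine.get_binding_index("input_ids"), inputs.input_ids.shape)
context.set_binding_shape(engine.get_binding_index("attention_mask"), inputs.attention_mask.shape)
context.set_binding_shape(engine.get_binding_index("token_type_ids"), inputs.token_type_ids.shape)
assert context.all_binding_shapes_specified

# Explicit-batch engines use the v2 enqueue, which takes no batch_size argument
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)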
Environment
TensorRT Version: 8.2
GPU Type: V100
Nvidia Driver Version: 495.29.05
CUDA Version: 11.5
Operating System + Version: Linux x86
Python Version (if applicable): 3.8