Description
I am trying to export an ONNX BERT model to a TensorRT engine.
The ONNX model works well, but the converted TensorRT engine behaves like an untrained model.
I think either the export process or the TensorRT inference code caused this problem.
My export & inference code is below.
Is there something wrong with my code?
Environment
TensorRT Version: 8.4.3.1
GPU Type: NVIDIA A100
Nvidia Driver Version: 470.141.03
CUDA Version: 11.4
CUDNN Version:
Operating System + Version: Ubuntu 20.04.3 LTS
Python Version (if applicable): 3.8.10
TensorFlow Version (if applicable):
PyTorch Version (if applicable):
Baremetal or Container (if container which image + tag):
Relevant Files
# ---- Build a TensorRT engine from the ONNX BERT model ----
TRT_LOGGER = trt.Logger(trt.Logger.INFO)

# ONNX parsing requires an explicit-batch network definition.
explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

with trt.Builder(TRT_LOGGER) as builder, \
        builder.create_network(explicit_batch_flag) as network, \
        builder.create_builder_config() as builder_config:
    parser = trt.OnnxParser(network, TRT_LOGGER)
    onnx_path = "./bert.onnx"
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            print(f"Failed to load ONNX file: {onnx_path}")
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            # Bug fix: the original fell through and built an engine from an
            # incomplete network, which silently produces garbage output.
            raise RuntimeError(f"ONNX parse failed: {onnx_path}")

    inputs = [network.get_input(i) for i in range(network.num_inputs)]

    # One optimization profile: batch 1..16, fixed sequence length 128.
    # NOTE(review): this assumes the network has exactly three inputs
    # (ids / segment / mask) all shaped (batch, seq) -- confirm against the
    # ONNX model's declared inputs.
    profile = builder.create_optimization_profile()
    min_shape = (1, 128)
    opt_shape = (8, 128)
    max_shape = (16, 128)
    for i in range(3):
        profile.set_shape(inputs[i].name, min_shape, opt_shape, max_shape)
    builder_config.add_optimization_profile(profile)

    # build_engine returns None on failure; fail loudly instead of crashing
    # later on engine.serialize().
    engine = builder.build_engine(network, builder_config)
    if engine is None:
        raise RuntimeError("TensorRT engine build failed")
    serialized_engine = engine.serialize()
    with open("./bert.engine", "wb") as fout:
        fout.write(serialized_engine)
# ---- Deserialize the engine and run inference over `features` ----
with open("./bert.engine", "rb") as f, \
        trt.Runtime(TRT_LOGGER) as runtime, \
        runtime.deserialize_cuda_engine(f.read()) as engine, \
        engine.create_execution_context() as context:

    # Three int32 input buffers of shape (1, max_seq_length).
    # NOTE(review): if the ONNX model was exported with int64 inputs,
    # feeding int32 host data here silently corrupts every token id and the
    # engine behaves like an untrained model -- verify with
    # engine.get_binding_dtype() for each input binding.
    input_shape = (1, max_seq_length)
    input_nbytes = trt.volume(input_shape) * trt.int32.itemsize
    d_inputs = [cuda.mem_alloc(input_nbytes) for _ in range(3)]
    stream = cuda.Stream()

    # Bindings 0..2 are the inputs; pin their dynamic shapes for this run.
    for binding in range(3):
        context.set_binding_shape(binding, input_shape)
    assert context.all_binding_shapes_specified

    # Bindings 3 and 4 are the two float32 outputs.
    h_output1 = cuda.pagelocked_empty(tuple(context.get_binding_shape(3)), dtype=np.float32)
    h_output2 = cuda.pagelocked_empty(tuple(context.get_binding_shape(4)), dtype=np.float32)
    d_output1 = cuda.mem_alloc(h_output1.nbytes)
    d_output2 = cuda.mem_alloc(h_output2.nbytes)

    # Bug fix: accumulator was used with += below but never initialized,
    # raising NameError on the first feature.
    eval_time_elapsed = 0.0

    for feature_index, feature in enumerate(features):
        # Page-lock the host arrays so the async H2D copies can overlap.
        input_ids = cuda.register_host_memory(np.ascontiguousarray(feature.input_ids.ravel()))
        segment_ids = cuda.register_host_memory(np.ascontiguousarray(feature.segment_ids.ravel()))
        input_mask = cuda.register_host_memory(np.ascontiguousarray(feature.input_mask.ravel()))

        eval_start_time = time.time()
        # NOTE(review): assumes binding order is ids, segment, mask --
        # confirm against the ONNX model's input order; swapped segment/mask
        # bindings are another common cause of garbage BERT output.
        cuda.memcpy_htod_async(d_inputs[0], input_ids, stream)
        cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream)
        cuda.memcpy_htod_async(d_inputs[2], input_mask, stream)
        context.execute_async_v2(
            bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output1), int(d_output2)],
            stream_handle=stream.handle,
        )
        stream.synchronize()
        eval_time_elapsed += (time.time() - eval_start_time)

        # Copy results back to the pinned host buffers after timing compute.
        cuda.memcpy_dtoh_async(h_output1, d_output1, stream)
        cuda.memcpy_dtoh_async(h_output2, d_output2, stream)
        stream.synchronize()