ERROR when trying to convert PyTorch model to TensorRT
Hi,
I am trying to convert a segmentation model made in PyTorch to ONNX and then to TensorRT.
The segmentation model consists of an ‘efficientnet-b2’ encoder and an ‘FPN’ decoder (built with the qubvel/segmentation_models.pytorch repo: https://github.com/qubvel/segmentation_models.pytorch).
If I run this line of code (see the steps to reproduce below), I get the following error:
latency = compute_latency_ms_tensorrt('./model.onnx', (1, 3, 704, 1280))
ERROR:
[TensorRT] ERROR: Network must have at least one output
[TensorRT] ERROR: Network validation failed.
Traceback (most recent call last):
  File "onnxTrial.py", line 100, in <module>
    latency = compute_latency_ms_tensorrt('./model.onnx', (1, 3, 704, 1280))
  File "onnxTrial.py", line 89, in compute_latency_ms_tensorrt
    with build_engine("model.onnx") as engine:
AttributeError: __enter__
I visualized the ONNX model with Netron, which confirms that the network has an input and an output.
I tried to work around the error above by marking the output manually with the following lines after parsing the ONNX model:
with open(model_file, 'rb') as model:
    parser.parse(model.read())
last_layer = network.get_layer(network.num_layers - 1)
network.mark_output(last_layer.get_output(0))
return builder.build_cuda_engine(network)
The ERROR now becomes:
python: ../builder/Network.cpp:1205: virtual nvinfer1::ILayer* nvinfer1::Network::getLayer(int) const: Assertion `layerIndex >= 0' failed.
Aborted (core dumped)
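Looking at that assertion, network.num_layers seems to be 0, i.e. nothing was actually added to the network, which would mean parser.parse() failed silently. A sketch of how the parser errors could be surfaced inside build_engine (using the OnnxParser error-reporting API; this check is not in my current script) would be:

with open(model_file, 'rb') as model:
    if not parser.parse(model.read()):
        # parse() returns False on failure; print every error it recorded
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        return None
return builder.build_cuda_engine(network)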
Any suggestions on how I can fix my TensorRT pipeline?
Thanks in advance,
Michiel
Environment
TensorRT Version: 7.2.1
GPU Type: 2070 Super
Nvidia Driver Version: 455.23.04
CUDA Version: 11.1
CUDNN Version: 8.0.4
Operating System + Version: Ubuntu 20.04
Python Version (if applicable): 3.6.9
PyTorch Version (if applicable): 1.6.0
Baremetal or Container (if container which image + tag): docker pull nvcr.io/nvidia/tensorrt:20.10-py3
Steps To Reproduce
ONNX conversion:
import torch

model = torch.load('./model.pth')
model.encoder.set_swish(memory_efficient=False)
model = model.cuda()
model.eval()
dummy_input = torch.randn(1, 3, 704, 1280, device='cuda')
torch.onnx.export(model, dummy_input, "model.onnx", input_names=["input"], output_names=["output"], verbose=True, opset_version=11)
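As an additional sanity check on the exported file (not part of my original pipeline; it assumes the onnx Python package is installed), the graph can also be validated directly:

import onnx

onnx_model = onnx.load("model.onnx")
onnx.checker.check_model(onnx_model)  # raises if the graph is structurally invalid
print(onnx_model.graph.output)        # should show the "output" tensor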
TensorRT conversion:
import time

import numpy as np
import pycuda.autoinit  # initializes the CUDA context for pycuda
import pycuda.driver as cuda
import tensorrt as trt
from tqdm import tqdm

MAX_BATCH_SIZE = 1
MAX_WORKSPACE_SIZE = 1 << 30

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
DTYPE = trt.float32

# Model
INPUT_NAME = 'input'
OUTPUT_NAME = 'output'

def allocate_buffers(engine):
    h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(DTYPE))
    h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(DTYPE))
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    return h_input, d_input, h_output, d_output

def build_engine(model_file):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = MAX_WORKSPACE_SIZE
        builder.max_batch_size = MAX_BATCH_SIZE
        with open(model_file, 'rb') as model:
            parser.parse(model.read())
        return builder.build_cuda_engine(network)

def load_input(input_size, host_buffer):
    assert len(input_size) == 4
    b, c, h, w = input_size
    dtype = trt.nptype(DTYPE)
    img_array = np.random.randn(c, h, w).astype(dtype).ravel()
    np.copyto(host_buffer, img_array)

def do_inference(context, h_input, d_input, h_output, d_output, iterations=None):
    # Transfer input data to the GPU.
    cuda.memcpy_htod(d_input, h_input)
    # warm-up
    for _ in range(10):
        context.execute(batch_size=1, bindings=[int(d_input), int(d_output)])
    # test proper iterations
    if iterations is None:
        elapsed_time = 0
        iterations = 100
        while elapsed_time < 1:
            t_start = time.time()
            for _ in range(iterations):
                context.execute(batch_size=1, bindings=[int(d_input), int(d_output)])
            elapsed_time = time.time() - t_start
            iterations *= 2
        FPS = iterations / elapsed_time
        iterations = int(FPS * 3)
    # Run inference.
    t_start = time.time()
    for _ in tqdm(range(iterations)):
        context.execute(batch_size=1, bindings=[int(d_input), int(d_output)])
    elapsed_time = time.time() - t_start
    latency = elapsed_time / iterations * 1000
    return latency

def compute_latency_ms_tensorrt(model, input_size, iterations=None):
    with build_engine("model.onnx") as engine:
        h_input, d_input, h_output, d_output = allocate_buffers(engine)
        load_input(input_size, h_input)
        with engine.create_execution_context() as context:
            latency = do_inference(context, h_input, d_input, h_output, d_output, iterations=iterations)
            # FPS = 1000 / latency (in ms)
            return latency
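For completeness: the AttributeError: __enter__ in the first traceback is consistent with build_engine() returning None, since build_cuda_engine() returns None when engine creation fails. A guard like the sketch below (an assumed addition, not in my current script) would at least turn that into a clearer failure:

engine = build_engine("model.onnx")
if engine is None:
    # build_cuda_engine() returned None, so the build (or the ONNX parse) failed
    raise RuntimeError("TensorRT engine creation failed; see the VERBOSE builder log above")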