[TensorRT] ERROR: Network must have at least one output

Hello,

I am getting the following error:

[TensorRT] ERROR: Network must have at least one output
Traceback (most recent call last):
File "/home/bplus/Desktop/UNIT_MASTER_THESIS/UNIT/unit1/UNIT_Working/sample_code.py", line 63, in <module>
    context = engine.create_execution_context()
AttributeError: 'NoneType' object has no attribute 'create_execution_context'

Solutions tried:

I tried running the model through ONNX-simplifier (see the command below for reference), but I am still getting the same issue.
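
For reference, the usual onnx-simplifier invocation looks like this (the output filename is just an example):

python3 -m onnxsim result2.onnx result2-simplified.onnx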

The version of PyTorch I am using is 1.3.0.

Please find the code below:

import tensorrt as trt
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import time

model_path = "/home/bplus/Desktop/UNIT_MASTER_THESIS/UNIT/unit1/UNIT_Working/result2.onnx"
input_size = 256
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
#EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)


def build_engine(model_path):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = 1 << 20  # 1 MiB of build workspace
        builder.max_batch_size = 1
        with open(model_path, "rb") as f:
            # Check the parse result; on failure, print the parser errors.
            if not parser.parse(f.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
        #network.mark_output(network.get_layer(network.num_layers - 1).get_output(0))
        engine = builder.build_cuda_engine(network)
    return engine

# def inference(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream):
# async version
# with engine.create_execution_context() as context:  # cost time to initialize
# cuda.memcpy_htod_async(in_gpu, inputs, stream)
# context.execute_async(1, [int(in_gpu), int(out_gpu)], stream.handle, None)
# cuda.memcpy_dtoh_async(out_cpu, out_gpu, stream)
# stream.synchronize()
def inference(engine, context, inputs, h_input, h_output, d_input, d_output, stream):
    # Copy the input data into the page-locked host buffer, then to the GPU.
    np.copyto(h_input, inputs)
    cuda.memcpy_htod_async(d_input, h_input, stream)
    # Run inference.
    context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    # Wait for the transfers and execution to finish before reading h_output.
    stream.synchronize()
    return h_output

    '''
    # sync version
    cuda.memcpy_htod(in_gpu, inputs,stream)
    context.execute(1, [int(in_gpu), int(out_gpu)])
    cuda.memcpy_dtoh(out_cpu, out_gpu,stream)
    return out_cpu'''


def alloc_buf(engine):
    # Allocate page-locked host buffers sized from the engine's input/output bindings.
    h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=np.float32)
    h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=np.float32)
    # Allocate device memory for inputs and outputs.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    stream = cuda.Stream()

    return h_input, h_output, d_input, d_output, stream


if __name__ == "__main__":
    for i in range(3):
        if (i == 0):
            inputs = np.random.random((1, 3, input_size, input_size)).astype(np.float32)
            engine = build_engine(model_path)
            print("Engine Created :", type(engine))
            context = engine.create_execution_context()
            print("Context executed ", type(context))
            serialized_engine = engine.serialize()
            t1 = time.time()
            # in_cpu, out_cpu, in_gpu, out_gpu, stream = alloc_buf(engine)
            h_input, h_output, d_input, d_output, stream = alloc_buf(engine)
            res = inference(engine, context, inputs.reshape(-1), h_input, h_output, d_input, d_output, stream)
            # print(type(res))
            print("using fp32 mode:")
            print("cost time: ", time.time() - t1)
        if (i == 1):
            inputs = np.random.random((1, 3, input_size, input_size)).astype(np.float16)
            engine = build_engine(model_path)
            print("Engine Created :", type(engine))
            context = engine.create_execution_context()
            print("Context executed ", type(context))
            serialized_engine = engine.serialize()
            t1 = time.time()
            # in_cpu, out_cpu, in_gpu, out_gpu, stream = alloc_buf(engine)
            h_input, h_output, d_input, d_output, stream = alloc_buf(engine)
            res = inference(engine, context, inputs.reshape(-1), h_input, h_output, d_input, d_output, stream)
            print(type(res))
            print("using fp16 mode:")
            print("cost time: ", time.time() - t1)
        if (i == 2):
            inputs = np.random.random((1, 3, input_size, input_size)).astype(np.int8)
            engine = build_engine(model_path)
            print("Engine Created :", type(engine))
            context = engine.create_execution_context()
            print("Context executed ", type(context))
            serialized_engine = engine.serialize()
            t1 = time.time()
            # in_cpu, out_cpu, in_gpu, out_gpu, stream = alloc_buf(engine)
            h_input, h_output, d_input, d_output, stream = alloc_buf(engine)
            res = inference(engine, context, inputs.reshape(-1), h_input, h_output, d_input, d_output, stream)
            # print(type(res))
            print("using int8 mode:")
            print("cost time: ", time.time() - t1)
    engine_path = "FLtask.trt"
    with open(engine_path, "wb") as f:
        f.write(serialized_engine)
        print("Serialized engine")

Any help would be appreciated.

Thanks!!

Hi,

This specific issue arises because the ONNX parser isn't currently compatible with ONNX models exported from PyTorch 1.3. If you downgrade to PyTorch 1.2, this issue should go away.
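
If you re-export under PyTorch 1.2, a minimal export call could look like the sketch below (the stand-in model, input shape, and opset version are assumptions based on the script above):

import torch
import torch.nn as nn

model = nn.Conv2d(3, 3, kernel_size=3, padding=1)  # stand-in; replace with the actual UNIT generator loaded under PyTorch 1.2
dummy_input = torch.randn(1, 3, 256, 256)
torch.onnx.export(model, dummy_input, "result2.onnx", opset_version=9,
                  input_names=["input"], output_names=["output"])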

Alternatively, upgrade to TRT 7; the latest TRT 7 supports PyTorch 1.3.
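
Note that with TRT 7 the ONNX parser requires the network to be created with the explicit-batch flag (the line commented out in your script). A sketch of the changed network creation, assuming the TRT 7 Python API:

EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
with trt.Builder(TRT_LOGGER) as builder, \
        builder.create_network(EXPLICIT_BATCH) as network, \
        trt.OnnxParser(network, TRT_LOGGER) as parser:
    ...  # parse the ONNX file and build the engine as before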

Thanks

Hi

Someone reproduced this issue with TRT 7 and PyTorch 1.3; here is the link:
https://github.com/NVIDIA/TensorRT/issues/286

BR,
Michael