Error when summing multiple tensors

Description

Hi.

When I add multiple tensors (>= 3) with add_elementwise(), the following error occurs:

[TensorRT] ERROR: ../rtExt/cuda/cudaPointWiseRunner.cpp (117) - Assertion Error in PointWiseRunner: 0 (defaultParams.inputs.size() == static_cast<size_t>(nbInputs))

Environment

TensorRT Version: 7.1.3.4
GPU Type: 2070super
Nvidia Driver Version: 440.31
CUDA Version: 10.2
CUDNN Version: 8
Operating System + Version: ubuntu18.04
Python Version (if applicable): 3.7
TensorFlow Version (if applicable):
PyTorch Version (if applicable): 1.6
Baremetal or Container (if container which image + tag):

Steps To Reproduce

Here is a script that reproduces the error:

import tensorrt as trt 
import torch
import numpy as np


def main() -> None:
    """Reproduce the TensorRT assertion seen when chaining two SUM elementwise layers.

    Builds an explicit-batch network with a single float32 input whose last
    three dimensions are left as -1, adds the input to itself twice through
    two chained ``add_elementwise`` SUM layers, builds an engine with one
    optimization profile, runs a single inference on a random CUDA tensor and
    prints the output shape and its first ten values.
    """

    print("create trt model")
    # Logger is constructed with ERROR severity.
    log_level=trt.Logger.ERROR
    logger = trt.Logger(log_level)
    builder = trt.Builder(logger)

    # Build the network definition in explicit-batch mode.
    EXPLICIT_BATCH = 1 << (int)(
        trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(EXPLICIT_BATCH)
    input_name = 'input'
    output_name = 'output'
    # Dimensions given as -1 are left open here; the optimization profile
    # created further down supplies their min/opt/max extents.
    input_trt = network.add_input(name=input_name, shape=[1,-1,-1,-1], dtype=trt.float32)

    # Chain two SUM layers: first input + input, then (input + input) + input.
    sum_trt = network.add_elementwise(input_trt, input_trt, trt.ElementWiseOperation.SUM).get_output(0)
    # This second layer is the one whose removal makes the script run cleanly,
    # according to the accompanying report.
    sum_trt = network.add_elementwise(sum_trt, input_trt, trt.ElementWiseOperation.SUM).get_output(0)

    # Expose the result of the second SUM as the network's only output.
    output = sum_trt
    output.name = output_name
    network.mark_output(output)

    # Builder configuration: 1 GiB workspace (1 << 30 bytes), FP16 disabled.
    max_workspace_size = 1<<30
    fp16_mode = False

    # NOTE(review): the same two settings are applied again on the builder
    # config below; these builder-level attributes look redundant and are
    # presumably deprecated in favour of the config -- confirm against the
    # TensorRT version in use.
    builder.max_workspace_size = max_workspace_size
    builder.fp16_mode = fp16_mode

    config = builder.create_builder_config()
    config.max_workspace_size = max_workspace_size
    profile = builder.create_optimization_profile()

    # Min / opt / max shapes for the input registered in the profile.
    min_shape = (1, 4, 200, 200)
    opt_shape = (1, 16, 400, 400)
    max_shape = (1, 32, 800, 800)
    profile.set_shape(
        input_name, min_shape, opt_shape, max_shape)
    config.add_optimization_profile(profile)
    # Not taken in this script because fp16_mode is False above.
    if fp16_mode:
        config.set_flag(trt.BuilderFlag.FP16)

    # NOTE(review): the build result is used without any check; presumably
    # build_engine can return None when the build fails -- confirm.
    engine = builder.build_engine(network, config)
    context = engine.create_execution_context()

    print("inference")
    # Random input with the same shape as opt_shape, placed on the GPU.
    input_torch = torch.rand(1,16,400,400).cuda().contiguous()

    # One slot per binding: the network has one input and one output.
    bindings = [None] * 2
    idx = engine.get_binding_index(input_name)
    # Set the input binding's shape to the actual tensor shape for this run.
    context.set_binding_shape(idx, tuple(input_torch.shape))
    bindings[idx] = input_torch.data_ptr()

    # Query the output shape after the input shape has been set, then
    # allocate a CUDA buffer of that shape to receive the result.
    idx = engine.get_binding_index(output_name)
    shape = tuple(context.get_binding_shape(idx))
    output_torch = torch.empty(shape,dtype=torch.float32).cuda()
    bindings[idx] = output_torch.data_ptr()

    # Enqueue inference on PyTorch's current CUDA stream, passing raw device
    # pointers of the torch tensors as bindings.
    context.execute_async_v2(bindings, torch.cuda.current_stream().cuda_stream)

    # Show the output shape and the first ten elements of the flattened output.
    print(output_torch.shape)
    print(output_torch.view(-1)[:10])



# Run the reproduction only when this file is executed as a script.
if __name__ == "__main__":
    main()

If I remove the second add_elementwise() call, the code works fine. Is there any limit on the elementwise op?

Thanks.

Hi @TJWindows,

In this case, one possibility is a dimension mismatch between sum_trt and input_trt.
Please refer to the link below:
https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Graph/Network.html#tensorrt.INetworkDefinition.add_elementwise

Thanks!

But sum_trt = input_trt + input_trt, so it should have the same dimensions, right?

The output tensor has the same number of dimensions as the inputs. For each dimension, its length is the maximum of the lengths of the corresponding input dimension.