Description
Hi.
When I add multiple tensors (>= 3) with add_elementwise(), the following error occurs:
[TensorRT] ERROR: ../rtExt/cuda/cudaPointWiseRunner.cpp (117) - Assertion Error in PointWiseRunner: 0 (defaultParams.inputs.size() == static_cast<size_t>(nbInputs))
Environment
TensorRT Version: 7.1.3.4
GPU Type: 2070super
Nvidia Driver Version: 440.31
CUDA Version: 10.2
CUDNN Version: 8
Operating System + Version: ubuntu18.04
Python Version (if applicable): 3.7
TensorFlow Version (if applicable):
PyTorch Version (if applicable): 1.6
Baremetal or Container (if container which image + tag):
Steps To Reproduce
Here is the script to reproduce the error:
import tensorrt as trt
import torch
import numpy as np
def main():
    """Build a dynamic-shape TensorRT engine computing input + input + input and run it once.

    The network has a single float32 input named 'input' with shape
    [1, -1, -1, -1] and a single output named 'output' produced by two
    chained element-wise SUM layers. After building, one inference is run
    on a random CUDA tensor of shape (1, 16, 400, 400), and the output
    shape and its first ten values are printed.

    Raises:
        RuntimeError: if TensorRT fails to build the engine or to enqueue
            the inference.
    """
    print("create trt model")
    logger = trt.Logger(trt.Logger.ERROR)
    builder = trt.Builder(logger)

    ## build network
    explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(explicit_batch)
    input_name = 'input'
    output_name = 'output'
    input_trt = network.add_input(
        name=input_name, shape=[1, -1, -1, -1], dtype=trt.float32)

    # sum: (input + input) + input
    sum_trt = network.add_elementwise(
        input_trt, input_trt, trt.ElementWiseOperation.SUM).get_output(0)
    # NOTE(review): per the report, the script runs fine when this second
    # element-wise layer is removed; the assertion appears only with it.
    sum_trt = network.add_elementwise(
        sum_trt, input_trt, trt.ElementWiseOperation.SUM).get_output(0)

    output = sum_trt
    output.name = output_name
    network.mark_output(output)

    ## builder config
    # Build settings go on the builder config, which is what build_engine()
    # receives below; the builder-level duplicates were dropped.
    max_workspace_size = 1 << 30
    fp16_mode = False
    config = builder.create_builder_config()
    config.max_workspace_size = max_workspace_size

    # The input has dynamic dimensions, so the allowed shape range is
    # declared through an optimization profile.
    profile = builder.create_optimization_profile()
    min_shape = (1, 4, 200, 200)
    opt_shape = (1, 16, 400, 400)
    max_shape = (1, 32, 800, 800)
    profile.set_shape(input_name, min_shape, opt_shape, max_shape)
    config.add_optimization_profile(profile)

    if fp16_mode:
        config.set_flag(trt.BuilderFlag.FP16)

    engine = builder.build_engine(network, config)
    if engine is None:
        # Fail here with a clear message instead of an AttributeError on
        # the next line; the TensorRT logger output holds the actual cause.
        raise RuntimeError("TensorRT failed to build the engine")
    context = engine.create_execution_context()

    print("inference")
    input_torch = torch.rand(1, 16, 400, 400).cuda().contiguous()

    # One slot per binding: the single input and the single output.
    bindings = [None] * 2

    idx = engine.get_binding_index(input_name)
    # The concrete input shape must be set before the output shape is queried.
    context.set_binding_shape(idx, tuple(input_torch.shape))
    bindings[idx] = input_torch.data_ptr()

    idx = engine.get_binding_index(output_name)
    shape = tuple(context.get_binding_shape(idx))
    # Allocate the output directly on the GPU rather than on the host first.
    output_torch = torch.empty(shape, dtype=torch.float32, device="cuda")
    bindings[idx] = output_torch.data_ptr()

    # Enqueue on torch's current stream so that the tensor reads below are
    # ordered after the inference.
    stream = torch.cuda.current_stream().cuda_stream
    if not context.execute_async_v2(bindings, stream):
        raise RuntimeError("TensorRT failed to enqueue inference")

    print(output_torch.shape)
    print(output_torch.view(-1)[:10])


if __name__ == "__main__":
    main()
If I remove the second add_elementwise() call, the code works fine. Is there any limit on the elementwise op?
Thanks.