Serious bug in residual block in TensorRT 7

Provide details on the platforms you are using:

Linux distro and version: 16.04
GPU type: GTX 1080 Ti
Nvidia driver version: 418.39
CUDA version: 10.0
CUDNN version: 7.6.5
Python version [if using python]: 3.5.2
Tensorflow version: N/A
TensorRT version: 7.0.0.11

Describe the problem

A residual block in TensorRT 7 produces wrong results compared to other deep learning frameworks. If I use a custom plugin instead of the elementwise add, or use TensorRT 6.0.1.5, the result is correct.

Files

Log

[TensorRT] WARNING: Current optimization profile is: 0. Please ensure there are no enqueued operations pending in this context prior to switching profiles
[TensorRT] WARNING: Current optimization profile is: 0. Please ensure there are no enqueued operations pending in this context prior to switching profiles
not close where =  (array([0, 0, 0, ..., 0, 0, 0]), array([ 0,  0,  0, ..., 63, 63, 63]), array([  0,   0,   0, ..., 221, 221, 221]), array([  3,   4,   8, ..., 210, 216, 218]))
not close lhs =  [25.611376  8.741138  0.       ...  0.       11.562347 66.4682  ]
not close rhs =  [44.448063  28.066727  37.52344   ...  7.8848243 14.366389  87.094795 ]
not close dif =  [18.836687  19.325588  37.52344   ...  7.8848243  2.8040419 20.626595 ]
not close tol =  [4.5448061e-05 2.9066727e-05 3.8523442e-05 ... 8.8848246e-06 1.5366390e-05
 8.8094792e-05]
dtype = float32, shape = (1, 64, 222, 222)
Traceback (most recent call last):
  File "/root/TensorRT-Perception/test/v2_tensor/test_basic.py", line 193, in <module>
    test_add()
  File "/root/TensorRT-Perception/test/v2_tensor/test_basic.py", line 190, in test_add
    TestCase().assertAllClose(res, a0 + a1)
  File "/usr/local/lib/python3.5/dist-packages/codeai/utils/debug.py", line 72, in assertAllClose
    self._assertArrayLikeAllClose(a, b, rtol=rtol, atol=atol)
  File "/usr/local/lib/python3.5/dist-packages/codeai/utils/debug.py", line 104, in _assertArrayLikeAllClose
    np.testing.assert_allclose(a, b, rtol=rtol, atol=atol, err_msg=msg)
  File "/usr/local/lib/python3.5/dist-packages/numpy/testing/_private/utils.py", line 1452, in assert_allclose
    verbose=verbose, header=header, equal_nan=equal_nan)
  File "/usr/local/lib/python3.5/dist-packages/numpy/testing/_private/utils.py", line 789, in assert_array_compare
    raise AssertionError(msg)
AssertionError: 
Not equal to tolerance rtol=1e-06, atol=1e-06
None
(mismatch 24.995815071828588%)
 x: array([  0.      ,  30.879768, 106.13895 , ...,  46.144886,  50.4092  ,
        37.847828], dtype=float32)
 y: array([  0.      ,  30.879768, 106.13895 , ...,  46.144886,  50.4092  ,
        37.847828], dtype=float32)
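
A note on the tolerances in the log: numpy's assert_allclose checks |actual - desired| <= atol + rtol * |desired| element-wise, so the "not close tol" values above are just atol + rtol * |rhs| with rtol = atol = 1e-6. The observed differences (roughly 3 to 38) exceed those tolerances by about six orders of magnitude, so this is a genuinely wrong result rather than float32 rounding noise. A quick check against the values in the log:

import numpy as np

rtol = atol = 1e-6
rhs = np.array([44.448063, 28.066727, 37.52344], dtype=np.float32)
# per-element tolerance used by np.testing.assert_allclose
print(atol + rtol * np.abs(rhs))
# -> approx. [4.5448e-05 2.9067e-05 3.8523e-05], matching "not close tol" above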

Sample Code

class Conv2dRelu(trtplus.Module):
    def __init__(self, weight):
        super().__init__()
        self.weight = weight

    def build(self):
        x = T.placeholder(trt.float32, (1, 3, 224, 224), name="x")
        x = T.nn.conv2d(x, self.weight)
        return T.relu(x)

class AddDev(trtplus.Module):
    def __init__(self, weight):
        super().__init__()
        self.weight = weight

    def build(self):
        x = T.placeholder(trt.float32, (1, 3, 224, 224), name="x")
        y = T.placeholder(trt.float32, (1, 3, 224, 224), name="y")
        x = T.nn.conv2d(x, self.weight)
        y = T.nn.conv2d(y, self.weight)
        x = T.relu(x)
        y = T.relu(y)
        net = T.get_trt_network()
        layer = net.add_elementwise(x.wrapped, y.wrapped, trt.ElementWiseOperation.SUM)
        return layer.get_output(0)
        # return x + y  # equivalent to the elementwise layer above
        # return T.plugin("Binary", x, y, type=1)  # correct if the custom plugin add is used


def test_add():
    from codeai.utils.debug import TestCase
    wconv = np.random.uniform(-5, 5, size=[64, 3, 3, 3]).astype(np.float32)
    mod_conv = Conv2dRelu(wconv).build_engine(workspace=2**16)
    mod = AddDev(wconv).build_engine(workspace=2**16)
    x = np.random.uniform(-5, 5, size=[1, 3, 224, 224]).astype(np.float32)
    y = np.random.uniform(-5, 5, size=[1, 3, 224, 224]).astype(np.float32)
    res = mod(x, y)
    a0 = mod_conv(x).copy()  # engine output is zero-copy; copy before the buffer is reused
    a1 = mod_conv(y)
    TestCase().assertAllClose(res, a0 + a1)

if __name__ == "__main__":
    test_add()
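
For an independent reference outside TensorRT, the expected values can also be computed with SciPy. This is a minimal sketch, assuming T.nn.conv2d performs cross-correlation with no padding and stride 1 (consistent with the (1, 64, 222, 222) shape in the log) and batch size 1; conv2d_relu_ref is a hypothetical helper, not part of trtplus:

import numpy as np
from scipy.signal import correlate

def conv2d_relu_ref(x, w):
    # x: (1, C_in, H, W), w: (C_out, C_in, kH, kW); VALID padding, stride 1
    _, _, h, wd = x.shape
    cout, _, kh, kw = w.shape
    out = np.empty((1, cout, h - kh + 1, wd - kw + 1), dtype=x.dtype)
    for co in range(cout):
        # "valid" cross-correlation over (C, H, W) collapses the channel axis
        out[0, co] = correlate(x[0], w[co], mode="valid")[0]
    return np.maximum(out, 0.0)

# ref = conv2d_relu_ref(x, wconv) + conv2d_relu_ref(y, wconv)  # should match a0 + a1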

Another network example:

class ResidualBlock(trtplus.Module):
    def __init__(self, weight, bug=False):
        super().__init__()
        self.weight = weight
        self.bug = bug 

    def build(self):
        x = T.placeholder(trt.float32, (1, 3, 224, 224), name="x")
        y = T.nn.conv2d(x, self.weight, padding=1)
        y = T.relu(y)
        if not self.bug:
            return T.plugin("Binary", x, y, type=1)  # correct if the custom plugin add is used
        else:
            return x + y
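
The padded variant can be checked the same way; a sketch assuming padding=1 zero-pads H and W by one pixel, reusing the hypothetical conv2d_relu_ref from above:

def residual_ref(x, w):
    # zero-pad H and W by 1 so the conv output keeps the input shape
    xp = np.pad(x, ((0, 0), (0, 0), (1, 1), (1, 1)))
    return x + conv2d_relu_ref(xp, w)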

I think this bug is related to graph optimization around the elementwise add.

Hi,

Could you please share the full repro code so that we can debug the issue and help better?

Thanks

Put the following code into the TensorRT samples/python/network_api_pytorch_mnist directory and run it.
In TensorRT 7 the printed norm is ~784; in 6.0.1.5 it is 0.0.

import numpy as np

import pycuda.driver as cuda
import pycuda.autoinit

import tensorrt as trt

import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))
import common

# You can set the logger severity higher to suppress messages (or lower to display more messages).
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class ModelData(object):
    INPUT_NAME = "data"
    INPUT_SHAPE = (3, 224, 224)
    DTYPE = trt.float32

def residual_bug(network, weights):
    # Configure the network layers based on the weights provided.
    input_tensor = network.add_input(name=ModelData.INPUT_NAME, dtype=ModelData.DTYPE, shape=ModelData.INPUT_SHAPE)

    conv1_w = trt.Weights(weights['conv1.weight'])
    conv1_b = trt.Weights()

    conv1 = network.add_convolution(input=input_tensor, num_output_maps=3, kernel_shape=(3, 3), kernel=conv1_w, bias=conv1_b)
    conv1.padding = (1, 1)
    relu1 = network.add_activation(input=conv1.get_output(0), type=trt.ActivationType.RELU)
    
    add1 = network.add_elementwise(relu1.get_output(0), input_tensor, trt.ElementWiseOperation.SUM)
    network.mark_output(tensor=add1.get_output(0))

def no_residual(network, weights):
    # Configure the network layers based on the weights provided.
    input_tensor = network.add_input(name=ModelData.INPUT_NAME, dtype=ModelData.DTYPE, shape=ModelData.INPUT_SHAPE)

    conv1_w = trt.Weights(weights['conv1.weight'])
    conv1_b = trt.Weights()

    conv1 = network.add_convolution(input=input_tensor, num_output_maps=3, kernel_shape=(3, 3), kernel=conv1_w, bias=conv1_b)
    conv1.padding = (1, 1)
    relu1 = network.add_activation(input=conv1.get_output(0), type=trt.ActivationType.RELU)
    network.mark_output(tensor=relu1.get_output(0))


def build_bug_engine(weights):
    # For more information on TRT basics, refer to the introductory samples.
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network:
        builder.max_workspace_size = common.GiB(1)
        # Populate the network using weights from the PyTorch model.
        residual_bug(network, weights)
        # Build and return an engine.
        return builder.build_cuda_engine(network)

def build_engine(weights):
    # For more information on TRT basics, refer to the introductory samples.
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network:
        builder.max_workspace_size = common.GiB(1)
        # Populate the network using weights from the PyTorch model.
        no_residual(network, weights)
        # Build and return an engine.
        return builder.build_cuda_engine(network)

def main():
    common.add_help(description="Repro for the TensorRT 7 residual elementwise add bug")
    # Use random weights instead of a trained model.
    wconv = np.random.uniform(-5, 5, size=[3, 3, 3, 3]).astype(np.float32)
    x = np.random.uniform(-5, 5, size=[1, 3, 224, 224]).astype(np.float32)
    weights = {"conv1.weight": wconv}
    # Do inference with TensorRT.
    with build_engine(weights) as engine:
        # Build an engine, allocate buffers and create a stream.
        # For more information on buffer allocation, refer to the introductory samples.
        inputs, outputs, bindings, stream = common.allocate_buffers(engine)
        with engine.create_execution_context() as context:
            inputs[0].host[:] = x.reshape(-1)
            # For more information on performing inference, refer to the introductory samples.
            # The common.do_inference function will return a list of outputs - we only have one in this case.
            [output] = common.do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
    output_ref = output.reshape(1, 3, 224, 224) + x # residual add outside

    with build_bug_engine(weights) as engine:
        # Build an engine, allocate buffers and create a stream.
        # For more information on buffer allocation, refer to the introductory samples.
        inputs, outputs, bindings, stream = common.allocate_buffers(engine)
        with engine.create_execution_context() as context:
            inputs[0].host[:] = x.reshape(-1)
            # For more information on performing inference, refer to the introductory samples.
            # The common.do_inference function will return a list of outputs - we only have one in this case.
            [output] = common.do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)

    print(np.linalg.norm(output_ref - output.reshape(1, 3, 224, 224)))

if __name__ == '__main__':
    print(trt.__version__)
    main()

Thanks for sharing the script; we will look into it and update you accordingly.

This issue has been fixed and should be included in the next release.
Please stay tuned for the next TRT release.

Thanks

Could you tell me what the next release version will be: 7.1.x, 7.0.1.x, or 8.0? We would like to get the fixed version as soon as possible.