Issues with dynamic shapes Try increasing the workspace size with IBuilderConfig::setMaxWorkspaceSize()

Description

I trying to deal with dynamic shapes and create small net (lenet5 like) to understand how it works

Environment

TensorRT Version: 7.1.3
GPU Type: GeForce RTX 2080 Ti
Nvidia Driver Version: 470.103.01
CUDA Version: 10.2
CUDNN Version: 8.0.5
Operating System + Version: Ubuntu 18.04
Python Version (if applicable): 3.6.9
TensorFlow Version (if applicable):
PyTorch Version (if applicable): 1.8.1
Baremetal or Container (if container which image + tag):

Steps To Reproduce

Here is code to reproduce:

import torch
import torch.nn as nn

import common
import numpy as np

import time

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)


global BATCH_SIZE 
BATCH_SIZE = 8

class LeNet5(nn.Module):
    def __init__(self):
        super().__init__()
        self._body = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5), 
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
            
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
        )
        
        self._head = nn.Sequential(
            nn.Linear(in_features=16 * 5 * 5, out_features=120), 
            nn.ReLU(inplace=True),
            nn.Linear(in_features=120, out_features=84), 
            nn.ReLU(inplace=True),
            nn.Linear(in_features=84, out_features=10)
        )

    def forward(self, x):
        x = self._body(x)
        x = x.view(x.size()[0], -1)
        x = self._head(x)
        return x

# initialize the model
lenet5_model = LeNet5()


class ModelData(object):
    INPUT_NAME = "data"
    MODEL_PATH = 'models/lenet5_mnist.pt'
    INPUT_SHAPE = (-1, 1, -1, -1)
    OUTPUT_NAME = "prob"
    DTYPE = trt.float32


class LeNet5TRT(object):
    def __init__(self, weights) -> None:
        super().__init__()
        self.weights = weights
        self.engine = self.build_engine()

    def populate_network(self):
        # Configure the network layers based on the self.weights provided.
        input_tensor = self.network.add_input(name=ModelData.INPUT_NAME, dtype=ModelData.DTYPE, shape=ModelData.INPUT_SHAPE)

        # body
        _body_conv1_w = self.weights['_body.0.weight'].numpy()
        _body_conv1_b = self.weights['_body.0.bias'].numpy()
        _body_conv1 = self.network.add_convolution(input=input_tensor,
                                             num_output_maps=6,
                                             kernel_shape=(5, 5),
                                             kernel=_body_conv1_w ,
                                             bias=_body_conv1_b)
        _body_conv1.stride = (1, 1)
        _body_conv1.padding = (0, 0)

        _body_relu1 = self.network.add_activation(
                                            input=_body_conv1.get_output(0), 
                                            type=trt.ActivationType.RELU)

        _body_maxpool1 = self.network.add_pooling(input=_body_relu1.get_output(0), 
                                        type=trt.PoolingType.MAX, 
                                        window_size=(2, 2))

        _body_conv2_w = self.weights['_body.3.weight'].numpy()
        _body_conv2_b = self.weights['_body.3.bias'].numpy()
        _body_conv2 = self.network.add_convolution(input=_body_maxpool1.get_output(0),
                                             num_output_maps=16,
                                             kernel_shape=(5, 5),
                                             kernel=_body_conv2_w,
                                             bias=_body_conv2_b)
        _body_conv2.stride = (1, 1)
        _body_conv2.padding = (0, 0)

        _body_relu2 = self.network.add_activation(
                                            input=_body_conv2.get_output(0), 
                                            type=trt.ActivationType.RELU)
        _body_maxpool2 = self.network.add_pooling(input=_body_relu2.get_output(0), 
                                        type=trt.PoolingType.MAX, 
                                        window_size=(2, 2))

        # head
        _head_linear1_w = self.weights['_head.0.weight'].numpy()
        _head_linear1_b = self.weights['_head.0.bias'].numpy()
        _head_linear1 = self.network.add_fully_connected(
                                                         input=_body_maxpool2.get_output(0),
                                                         num_outputs=120,
                                                         kernel=_head_linear1_w,
                                                         bias=_head_linear1_b)
        _head_relu1 = self.network.add_activation(
                                                  input=_head_linear1 .get_output(0), 
                                                  type=trt.ActivationType.RELU)

        _head_linear2_w = self.weights['_head.2.weight'].numpy()
        _head_linear2_b = self.weights['_head.2.bias'].numpy()
        _head_linear2 = self.network.add_fully_connected(
                                                         input=_head_relu1.get_output(0),
                                                         num_outputs=84,
                                                         kernel=_head_linear2_w,
                                                         bias=_head_linear2_b)

        _head_relu2 = self.network.add_activation(
                                                  input=_head_linear2 .get_output(0), 
                                                  type=trt.ActivationType.RELU)

        _head_linear3_w = self.weights['_head.4.weight'].numpy()
        _head_linear3_b = self.weights['_head.4.bias'].numpy()
        _head_linear3 = self.network.add_fully_connected(
                                                         input=_head_relu2.get_output(0),
                                                         num_outputs=10,
                                                         kernel=_head_linear3_w,
                                                         bias=_head_linear3_b)
        
        _head_linear3.get_output(0).name = "prob"
        self.network.mark_output(tensor=_head_linear3 .get_output(0))

    def GiB(self, val):
        return val * 1 << 30


    def build_engine(self):
        logger= trt.Logger(trt.Logger.INFO)

        with trt.Builder(logger) as builder:
                builder.max_batch_size = BATCH_SIZE

                network_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
            
                with builder.create_network(network_flag) as net:
                    self.network = net
                    self.populate_network()
                    self.network.get_input(0).dtype=trt.DataType.HALF
                    self.network.get_output(0).dtype=trt.DataType.HALF
                    # we set the inputs and outputs to be float16 type to enable
                    # maximum fp16 acceleration. Also helps for int8

                    config=builder.create_builder_config()
                    # we specify all the important parameters like precision, 
                    # device type, fallback in config object

                    config.max_workspace_size =  1 << 30 # 10 * (2 ** 30) # 1 gb

                    config.set_flag(trt.BuilderFlag.GPU_FALLBACK)
                    config.set_flag(trt.BuilderFlag.FP16)

                    config.profiling_verbosity = trt.ProfilingVerbosity.VERBOSE
                    #building with verbose profiling helps debug the engine if there are
                    #errors in inference output. Does not impact throughput.
                    profile = builder.create_optimization_profile();
                    profile.set_shape(ModelData.INPUT_NAME, 
                                     (BATCH_SIZE, 1, 16, 16), 
                                     (BATCH_SIZE, 1, 32, 32), 
                                     (BATCH_SIZE, 1, 64, 64)) 
                    config.add_optimization_profile(profile)
            
                    return builder.build_engine(net, config)


def load_random_test_case(pagelocked_buffer):
    # Select an image at random to be the test case.
    img = np.random.rand(BATCH_SIZE,1,32,32).astype(np.float32)
    # Copy to the pagelocked input buffer
    np.copyto(pagelocked_buffer, img.ravel())
    return img

def main():
    common.add_help(description="Yeah!")
    # Get the PyTorch weights
    lenet5_model = LeNet5()
    lenet5_model.eval()
    lenet5_model.load_state_dict(torch.load(ModelData.MODEL_PATH))
    weights = lenet5_model.state_dict()

    # Do inference with TensorRT.
    with LeNet5TRT(weights).engine as engine:
        # Build an engine, allocate buffers and create a stream.
        # For more information on buffer allocation, refer to the introductory samples.
        with open('models/lenet5_mnist.trt', "wb") as f:
            f.write(engine.serialize())

        with open('models/lenet5_mnist.trt', "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())
            inputs, outputs, bindings, stream = common.allocate_buffers(engine, batch_size=1)
            with engine.create_execution_context() as context:
                t = 0
                for _ in range(1):
                    img = load_random_test_case(pagelocked_buffer=inputs[0].host)
                    # For more information on performing inference, refer to the introductory samples.
                    # The common.do_inference function will return a list of outputs 
                    a = time.time()
                    context.set_binding_shape(0, (BATCH_SIZE, 1, 32, 32))
                    context.active_optimization_profile = 0
                    pred_trt = common.do_inference_v2(context, bindings=bindings, inputs=inputs, 
                                                      outputs=outputs, stream=stream)


                    t += time.time() - a

        with torch.no_grad():   
            pred_torch = lenet5_model.cuda()(torch.from_numpy(img).cuda())
            print('baseline: ', pred_torch.cpu().numpy())
        print(np.asarray(pred_trt, dtype=np.float32).shape)
        print('output:   ', np.asarray(pred_trt, dtype=np.float32).reshape((BATCH_SIZE, 10)))
        print('diff:    ', torch.max(torch.abs(pred_torch.cpu() - 
                                     torch.as_tensor(np.asarray(pred_trt, dtype=np.float32).reshape((BATCH_SIZE, 10))))))
    print('Time: ', t)

if __name__ == '__main__':
    main()

When run get the next error:

[TensorRT] ERROR: Try increasing the workspace size with IBuilderConfig::setMaxWorkspaceSize() if using IBuilder::buildEngineWithConfig, or IBuilder::setMaxWorkspaceSize() if using IBuilder::buildCudaEngine.
[TensorRT] ERROR: ../builder/tacticOptimizer.cpp (1715) - TRTInternal Error in computeCosts: 0 (Could not find any implementation for node (Unnamed Layer* 6) [Fully Connected] + (Unnamed Layer* 7) [Activation].)
[TensorRT] ERROR: ../builder/tacticOptimizer.cpp (1715) - TRTInternal Error in computeCosts: 0 (Could not find any implementation for node (Unnamed Layer* 6) [Fully Connected] + (Unnamed Layer* 7) [Activation].)
Traceback (most recent call last):
  File "torch_inf.py", line 261, in <module>
    main()
  File "torch_inf.py", line 224, in main
    with LeNet5TRT(weights).engine as engine:
AttributeError: __enter__

I’ve tried to increase max_workspace_size up to 11 gb (all my gpu memory) but got the same error.

 config.max_workspace_size =  11 

I tried different things and when I set

INPUT_SHAPE = (-1, 1, 32, 32)

and

                    profile.set_shape(ModelData.INPUT_NAME, 
                                     (BATCH_SIZE, 1, 32, 32), 
                                     (BATCH_SIZE, 1, 32, 32), 
                                     (BATCH_SIZE, 1, 32, 32)) 

It works properly.
I wonder what is the reason of that behavior?

Hi, Please refer to the below links to perform inference in INT8

Thanks!

Hi,

There were known similar issues in older TRT version. And those are fixed in the later versions. We recommend you to please use the latest Jetpack version, which comes with TRT 8.

Thank you.

So I suggest to use int8 for inference? not fp16 or fp32?

We recommend you try with latest TRT version. And if you still face this issue, please share with us the issue repro script and model to try from our end for better debugging.

Ok thanks. I replied to @NVES post about int8. Unfortunately for my project I have to use tensorrt 7.1.3