Why doesn't the convolution layer fuse with the Clip activation plugin?

Description

Hello,
I would like to know how to make sure that the supported fusion patterns, e.g., a convolution followed by a Clip activation, actually happen when I build an engine. Is there more detailed documentation on the exact requirements for fusion in TensorRT?

Here are the details: following the plugin section of the TensorRT tutorial, I replaced the Relu activation that follows a convolution layer with the Clip plugin shipped with TensorRT, and it worked. However, according to the profiling results (I used Nsight Systems to profile), the convolution layer was not fused with the Clip activation, even though this pattern is listed among the supported fusion types: https://docs.nvidia.com/deeplearning/tensorrt/best-practices/index.html#fusion-types.

I would like to understand why this supported fusion did not happen. Thanks.
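
If it helps, here is a rough sketch of how I think a fusion could also be double-checked from within TensorRT itself, by attaching the built-in per-layer profiler to the execution context: a fused convolution + activation should show up as a single entry with a combined name, while an unfused plugin keeps its own entry. This reuses names from my script below (engine, bindings, ModelData) and assumes trt.Profiler and the profiler hook behave this way in this release; I have not verified the exact output on 5.1.5.

import tensorrt as trt

# Sketch: attach TensorRT's default per-layer profiler to the execution
# context. It prints each layer's name and time after execution; a fused
# convolution + activation pair should appear as one entry, an unfused
# plugin layer as its own entry. Per-layer profiling needs the synchronous
# execute() path rather than execute_async().
with engine.create_execution_context() as context:
    context.profiler = trt.Profiler()
    context.execute(batch_size=ModelData.BATCH_SIZE, bindings=bindings)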

Environment

TensorRT Version: 5.1.5.0
GPU Type: Tesla V100
Nvidia Driver Version: 418.67
CUDA Version: 10.1
CUDNN Version: 7.5
Operating System + Version: Ubuntu 16.04
Python Version (if applicable): 3.6.10
TensorFlow Version (if applicable): 1.12.0
PyTorch Version (if applicable): None
Baremetal or Container (if container which image + tag): None

Relevant Files

Please attach or include links to any models, data, files, or scripts necessary to reproduce your issue. (Github repo, Google Drive, Dropbox, etc.)

Here is the python script:

import sys
import os
import ctypes
import tensorflow as tf
import tensorrt as trt
import graphsurgeon as gs
import numpy as np
import uff
import common

WORKING_DIR = os.path.dirname(os.path.realpath(__file__))

CLIP_PLUGIN_LIBRARY = os.path.join(
    WORKING_DIR,
    'build/libclipplugin.so'
)

MODEL_PATH = os.path.join(
    WORKING_DIR,
    'models/freeze_model.pb'
)

ENGINE_PATH = os.path.join(
    WORKING_DIR,
    'models/TRT_MODEL'
)

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class ModelData(object):
    INPUT_NAME = "audio_feature"
    INPUT_SHAPE = (1, 1000, 201)
    REPLACE_NAME = "net/act1_1/Relu"
    OUTPUT_NAME = "net/output_node_logits"
    OUTPUT_SHAPE = (2,)
    DATA_TYPE = trt.float32
    BATCH_SIZE = 16

def model_path_to_uff_path(model_path):
    uff_path = os.path.splitext(model_path)[0] + ".uff"
    return uff_path

def model_to_uff(model_path):
    # Replace the Relu node with the Clip_TRT plugin node and collapse the
    # surrounding namespace so the UFF parser maps it to the plugin.
    trt_clip = gs.create_plugin_node(name="trt_clip", op="Clip_TRT", clipMin=0.0, clipMax=6.0)
    tmp = gs.create_node(name='tmp', op=None)

    namespace_plugin_map = {
        ModelData.REPLACE_NAME: trt_clip,
        "net/add": tmp
    }

    dynamic_graph = gs.DynamicGraph(model_path)
    dynamic_graph.collapse_namespaces(namespace_plugin_map)
    removed_node_list = [
        "net/act1_1/Abs",
        "net/act1_1/sub",
        "net/act1_1/mul",
        "net/act1_1/mul_1"
    ]
    dynamic_graph.remove(removed_node_list, remove_exclusive_dependencies=False)
    forward_node_list = [
        tmp,
    ]
    dynamic_graph.forward_inputs(forward_node_list)

    output_uff_path = model_path_to_uff_path(model_path)
    uff.from_tensorflow(
        dynamic_graph.as_graph_def(),
        [ModelData.OUTPUT_NAME],
        output_filename=output_uff_path,
        text=True
    )

    return output_uff_path

def build_engine(model_path):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.UffParser() as parser:
        builder.max_batch_size = 48
        builder.max_workspace_size = 1 << 30
        builder.average_find_iterations = 1
        builder.min_find_iterations = 1
        builder.debug_sync = True
        builder.int8_mode = False
        builder.fp16_mode = False
        builder.int8_calibrator = None

        uff_path = model_to_uff(model_path)
        parser.register_input(ModelData.INPUT_NAME, ModelData.INPUT_SHAPE)
        parser.register_output(ModelData.OUTPUT_NAME)
        parser.parse(uff_path, network)

        return builder.build_cuda_engine(network)

def main():

    # Make sure the Clip plugin library has been built.
    if not os.path.isfile(CLIP_PLUGIN_LIBRARY):
        raise IOError("\n{}\n{}\n{}\n".format(
            "Failed to load library ({}).".format(CLIP_PLUGIN_LIBRARY),
            "Please build the Clip sample plugin.",
            "For more information, see the included README.md"
        ))

    # Load pretrained model
    if not os.path.isfile(MODEL_PATH):
        raise IOError("\n{}\n{}\n{}\n".format(
            "Failed to load model file ({}).".format(MODEL_PATH),
            "Please use 'python lenet5.py' to train and save the model.",
            "For more information, see the included README.md"
        ))

    # Load the plugin library so the Clip_TRT op is registered, then
    # initialize the built-in TensorRT plugins.
    ctypes.CDLL(CLIP_PLUGIN_LIBRARY)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    # Build an engine, allocate buffers, and run inference on dummy data.
    with build_engine(MODEL_PATH) as engine:
        inputs, outputs, bindings, stream = common.allocate_buffers(engine, ModelData.BATCH_SIZE)
        dummy_data = np.random.randn(ModelData.BATCH_SIZE * 201000)
        np.copyto(inputs[0].host, dummy_data)
        with engine.create_execution_context() as context:
            # The common do_inference function returns a list of outputs - we only have one in this case.
            pred = common.do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=ModelData.BATCH_SIZE)
            print("Prediction: ", pred)

if __name__ == "__main__":
    main()

Steps To Reproduce

Please include:

  • Exact steps/commands to build your repro
  • Exact steps/commands to run your repro
  • Full traceback of errors encountered

Hi @daizt2017,
Could you please share the verbose build logs and the model file, so that I can help you better?
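
For the verbose logs, raising the logger severity used when building the engine is usually enough; roughly along these lines (if trt.Logger.VERBOSE is not available in TensorRT 5.1, trt.Logger.INFO is the closest level):

# Hypothetical one-line change to the script above: build with a more
# detailed logger so TensorRT reports per-layer and fusion decisions
# during engine construction. Use trt.Logger.VERBOSE instead on releases
# that provide it.
TRT_LOGGER = trt.Logger(trt.Logger.INFO)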

Thanks!