TensorRT 5.X / 6.X Batch Size Problem

Hi all,
I want to optimize our detection model with TensorRT. However, when I set max_batch_size > 1, the inference time increases proportionally to the batch size.

Basically:

batch size=1 fp16 infTime is about 10ms
batch size=4 fp16 infTime is about 39ms
batch size=1 fp32 infTime is about 34.4ms
batch size=4 fp32 infTime is about 129ms

How can I solve this problem?

Hi,

I recommend that you use TRT 7 with optimization profiles to optimize for multiple different batch sizes.
https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#opt_profiles

Thanks

Thanks.
Are optimization profiles only supported in version 7?
Sorry, I'm new to this work; if possible, could you give me a bit more guidance on adding an optimization profile to my code?
I used the code below for the conversion.

Please refer below link:
https://docs.nvidia.com/deeplearning/sdk/tensorrt-archived/tensorrt-710-ea/tensorrt-developer-guide/index.html#work_dynamic_shapes
You can also refer to sample:

Thanks

Hi all,
I watched the "Deeper Dive into TensorRT and Triton" NVIDIA webinar, which said that TensorRT 7 supports both dynamic input shapes and dynamic batch sizes, while TRT 5/6 only support dynamic batch sizes. I want to know whether TRT 5/6 really do support dynamic batch sizes, or whether I am doing something wrong.

I used this for converting the model with UFF parser

# Build a TensorRT engine from the UFF model and serialize it to disk.
with trt.Builder(TRT_LOGGER) as builder, \
        builder.create_network() as network, \
        trt.UffParser() as parser:
    # Builder configuration: FP16 kernels, up to 5 images per batch,
    # and a 1 GiB workspace for layer scratch memory.
    builder.fp16_mode = True
    builder.max_batch_size = 5
    builder.max_workspace_size = 1 << 30

    # Declare the graph inputs/outputs for the UFF parser, then parse.
    parser.register_input('Input', INPUT_DIMS)
    parser.register_output('MarkOutput_0')
    parser.parse(spec['tmp_uff'], network)

    # Build the optimized engine and persist its serialized form.
    engine = builder.build_cuda_engine(network)
    serialized_engine = engine.serialize()
    with open(spec['output_bin'], 'wb') as out_file:
        out_file.write(serialized_engine)

Then I used the below code for inference:

class TrtSSD(object):
    """SSD object detector backed by a serialized TensorRT engine.

    Loads the required plugins, deserializes the engine from
    ``ssd/TRT_<model>.bin``, allocates page-locked host buffers and
    device buffers sized for the engine's max batch size, and exposes
    :meth:`detect` for inference on a single image.

    Fix over the original paste: the method bodies were flush-left
    (outside the class), which is a Python IndentationError; they are
    now properly indented as methods.
    """

    def _load_plugins(self):
        """Load the FlattenConcat plugin (needed before TRT 7) and
        initialize TensorRT's standard plugin registry."""
        # NOTE(review): lexicographic compare of the first version char
        # works for TRT 5/6/7 but would misclassify versions >= 10.
        if trt.__version__[0] < '7':
            ctypes.CDLL("ssd/libflattenconcat.so")
        trt.init_libnvinfer_plugins(self.trt_logger, '')

    def _load_engine(self):
        """Deserialize and return the CUDA engine from its .bin file."""
        TRTbin = 'ssd/TRT_%s.bin' % self.model
        with open(TRTbin, 'rb') as f, trt.Runtime(self.trt_logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def _allocate_buffers(self):
        """Allocate one host/device buffer pair per engine binding.

        Each buffer is sized for ``engine.max_batch_size`` elements so a
        single allocation serves any batch size up to the maximum.

        Returns:
            Tuple of (host_inputs, host_outputs, cuda_inputs,
            cuda_outputs, bindings); ``bindings`` holds the raw device
            pointers in engine-binding order.
        """
        host_inputs, host_outputs, cuda_inputs, cuda_outputs, bindings = \
            [], [], [], [], []
        for binding in self.engine:
            size = trt.volume(self.engine.get_binding_shape(binding)) * \
                   self.engine.max_batch_size
            # Page-locked (pinned) host memory enables async H2D/D2H copies.
            host_mem = cuda.pagelocked_empty(size, np.float32)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(cuda_mem))
            if self.engine.binding_is_input(binding):
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)
        return host_inputs, host_outputs, cuda_inputs, cuda_outputs, bindings

    def __init__(self, model, input_shape, cuda_ctx=None):
        """Initialize TensorRT plugins, engine and context.

        Args:
            model: model name; selects the engine file ``ssd/TRT_<model>.bin``.
            input_shape: target shape handed to the preprocessing helper.
            cuda_ctx: optional PyCUDA context pushed around CUDA calls.

        Raises:
            RuntimeError: if CUDA resources cannot be allocated.
        """
        self.model = model
        self.input_shape = input_shape
        self.cuda_ctx = cuda_ctx
        if self.cuda_ctx:
            self.cuda_ctx.push()

        self.trt_logger = trt.Logger(trt.Logger.INFO)
        self._load_plugins()
        self.engine = self._load_engine()

        try:
            self.context = self.engine.create_execution_context()
            self.stream = cuda.Stream()
            self.host_inputs, self.host_outputs, self.cuda_inputs, \
                self.cuda_outputs, self.bindings = self._allocate_buffers()
        except Exception as e:
            raise RuntimeError('fail to allocate CUDA resources') from e
        finally:
            # Always pop the context, even when allocation failed.
            if self.cuda_ctx:
                self.cuda_ctx.pop()

    def __del__(self):
        """Free CUDA memories and context."""
        # Release device buffers before the stream; guard with getattr-style
        # checks so a half-constructed instance does not raise here.
        for attr in ('cuda_outputs', 'cuda_inputs', 'stream'):
            if hasattr(self, attr):
                delattr(self, attr)

    def detect(self, img, conf_th=0.3):
        """Detect objects in the input image.

        Args:
            img: input image (format expected by ``_preprocess_trt``).
            conf_th: confidence threshold forwarded to postprocessing.

        Returns:
            Whatever ``_postprocess_trt`` produces for this image.
        """
        img_resized = _preprocess_trt(img, self.input_shape)
        np.copyto(self.host_inputs[0], img_resized.ravel())

        if self.cuda_ctx:
            self.cuda_ctx.push()
        cuda.memcpy_htod_async(
            self.cuda_inputs[0], self.host_inputs[0], self.stream)
        # NOTE(review): only ONE image was copied into the input buffer,
        # yet the engine executes a batch of 5 — batch indices 1-4 run on
        # uninitialized data and waste time. For single-image latency,
        # batch_size=1 is almost certainly what was intended; confirm.
        self.context.execute_async(
            batch_size=5,
            bindings=self.bindings,
            stream_handle=self.stream.handle)
        cuda.memcpy_dtoh_async(
            self.host_outputs[1], self.cuda_outputs[1], self.stream)
        cuda.memcpy_dtoh_async(
            self.host_outputs[0], self.cuda_outputs[0], self.stream)
        self.stream.synchronize()
        if self.cuda_ctx:
            self.cuda_ctx.pop()

        output = self.host_outputs[0]
        return _postprocess_trt(img, output, conf_th)