Hi all,
I watched the NVIDIA webinar "Deeper dive into TensorRT and Triton", which says that TensorRT 7 supports both dynamic input shapes and dynamic batch size, while TRT 5/6 only support dynamic batch size. I want to know whether TRT 5/6 really don't support dynamic batch size, or whether I am doing something incorrectly.
I used the following code to convert the model with the UFF parser:
# Build a TensorRT engine from a UFF model and write the serialized engine to
# disk. TRT_LOGGER, INPUT_DIMS and spec are defined outside this snippet.
with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.UffParser() as parser:
    builder.max_workspace_size = 1 << 30
    # Upper bound on the batch size the engine will accept at execution time.
    builder.max_batch_size = 5
    builder.fp16_mode = True

    parser.register_input('Input', INPUT_DIMS)
    parser.register_output('MarkOutput_0')
    # parse() reports failure through its return value; stop here instead of
    # building an engine from an incompletely populated network.
    if not parser.parse(spec['tmp_uff'], network):
        raise RuntimeError('failed to parse UFF model: %s' % spec['tmp_uff'])

    engine = builder.build_cuda_engine(network)
    # A failed build yields None; fail with a clear message rather than an
    # AttributeError on engine.serialize().
    if engine is None:
        raise RuntimeError('failed to build the TensorRT engine')

    buf = engine.serialize()
    with open(spec['output_bin'], 'wb') as f:
        f.write(buf)
Then I used the below code for inference:
class TrtSSD(object):
    """Run SSD inference through a serialized TensorRT engine.

    The engine is read from 'ssd/TRT_<model>.bin' and executed through the
    batch-size based ``execute_async`` call. ``_preprocess_trt`` and
    ``_postprocess_trt`` are helpers defined outside this class.
    """

    @staticmethod
    def _trt_major_version(version):
        """Return the major component of a dotted version string as an int.

        Comparing only the first character of the version string would
        classify '10.x' as older than '7', so the whole leading component
        is parsed numerically.
        """
        return int(str(version).split('.', 1)[0])

    def _load_plugins(self):
        """Load the FlattenConcat plugin (TensorRT < 7) and init TRT plugins."""
        if self._trt_major_version(trt.__version__) < 7:
            ctypes.CDLL("ssd/libflattenconcat.so")
        trt.init_libnvinfer_plugins(self.trt_logger, '')

    def _load_engine(self):
        """Deserialize the engine file for ``self.model``.

        Raises:
            RuntimeError: if TensorRT cannot deserialize the file.
        """
        TRTbin = 'ssd/TRT_%s.bin' % self.model
        with open(TRTbin, 'rb') as f, trt.Runtime(self.trt_logger) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())
        if engine is None:
            raise RuntimeError('fail to deserialize TensorRT engine: %s' % TRTbin)
        return engine

    def _allocate_buffers(self):
        """Allocate host and device buffers for every engine binding.

        Each buffer holds ``max_batch_size`` samples so that any batch size up
        to that maximum can be executed without reallocating.

        Returns:
            Tuple ``(host_inputs, host_outputs, cuda_inputs, cuda_outputs,
            bindings)``; ``bindings`` holds the device addresses in binding
            order.
        """
        host_inputs, host_outputs, cuda_inputs, cuda_outputs, bindings = \
            [], [], [], [], []
        for binding in self.engine:
            size = trt.volume(self.engine.get_binding_shape(binding)) * \
                self.engine.max_batch_size
            # NOTE(review): buffers are always float32 here; confirm that all
            # engine bindings really use that dtype.
            host_mem = cuda.pagelocked_empty(size, np.float32)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(cuda_mem))
            if self.engine.binding_is_input(binding):
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)
        return host_inputs, host_outputs, cuda_inputs, cuda_outputs, bindings

    def __init__(self, model, input_shape, cuda_ctx=None):
        """Initialize TensorRT plugins, engine and context.

        Args:
            model: model name used to build the engine file path.
            input_shape: shape forwarded to ``_preprocess_trt``.
            cuda_ctx: optional CUDA context; when given it is pushed for the
                duration of the setup and popped again afterwards.

        Raises:
            RuntimeError: if the engine cannot be deserialized or the
                execution context, stream or buffers cannot be created.
        """
        self.model = model
        self.input_shape = input_shape
        self.cuda_ctx = cuda_ctx
        if self.cuda_ctx:
            self.cuda_ctx.push()
        # Everything after the push sits inside try/finally so the context is
        # popped even when plugin or engine loading fails.
        try:
            self.trt_logger = trt.Logger(trt.Logger.INFO)
            self._load_plugins()
            self.engine = self._load_engine()
            try:
                self.context = self.engine.create_execution_context()
                self.stream = cuda.Stream()
                (self.host_inputs, self.host_outputs, self.cuda_inputs,
                 self.cuda_outputs, self.bindings) = self._allocate_buffers()
            except Exception as e:
                raise RuntimeError('fail to allocate CUDA resources') from e
        finally:
            if self.cuda_ctx:
                self.cuda_ctx.pop()

    def __del__(self):
        """Free CUDA memories and context."""
        # __init__ may have failed before these attributes were set; deleting
        # only what exists avoids an AttributeError during finalization.
        for name in ('cuda_outputs', 'cuda_inputs', 'stream'):
            if hasattr(self, name):
                delattr(self, name)

    def detect(self, img, conf_th=0.3, *, batch_size=5):
        """Detect objects in the input image.

        Args:
            img: input passed to ``_preprocess_trt`` and ``_postprocess_trt``.
            conf_th: confidence threshold forwarded to ``_postprocess_trt``.
            batch_size: batch size handed to ``execute_async``; must lie in
                ``[1, engine.max_batch_size]``. Defaults to 5, the value that
                used to be hard-coded.

        Returns:
            Whatever ``_postprocess_trt`` returns for the first output buffer.

        Raises:
            ValueError: if ``batch_size`` is outside the supported range, or
                the preprocessed input is larger than the input buffer.
        """
        max_batch = self.engine.max_batch_size
        if not 1 <= batch_size <= max_batch:
            raise ValueError(
                'batch_size must be between 1 and %d, got %r'
                % (max_batch, batch_size))

        img_resized = _preprocess_trt(img, self.input_shape)
        flat = img_resized.ravel()
        # The host buffer is sized for max_batch_size samples; fill only the
        # leading part so inputs smaller than a full batch are accepted.
        np.copyto(self.host_inputs[0][:flat.size], flat)

        if self.cuda_ctx:
            self.cuda_ctx.push()
        try:
            cuda.memcpy_htod_async(
                self.cuda_inputs[0], self.host_inputs[0], self.stream)
            self.context.execute_async(
                batch_size=batch_size,
                bindings=self.bindings,
                stream_handle=self.stream.handle)
            # Both output bindings are copied back; only index 0 is used below.
            cuda.memcpy_dtoh_async(
                self.host_outputs[1], self.cuda_outputs[1], self.stream)
            cuda.memcpy_dtoh_async(
                self.host_outputs[0], self.cuda_outputs[0], self.stream)
            self.stream.synchronize()
        finally:
            # Pop even when a CUDA/TensorRT call raises so the context stack
            # stays balanced.
            if self.cuda_ctx:
                self.cuda_ctx.pop()

        output = self.host_outputs[0]
        return _postprocess_trt(img, output, conf_th)