I have a TF model saved as a .pb file. I converted it to ONNX with input shape (1, 112, 112, 3), then used onnx2trt to generate a model.trt engine file. When I run inference with the engine through the Python API, it seems to work fine as long as I pass only a single image. But when I set batch_size > 1 (e.g. 5), the outputs for every sample except the first are all zeros.
By the way, if I block (comment out) the line inputs[0].host = images, which means no input is passed at all, there is still output for the first sample.
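In case it is relevant, the engine was generated with onnx2trt from the exported ONNX file. Roughly, the equivalent build step in the TensorRT Python API would look like the sketch below; the file name 'model.onnx', the workspace size, and the explicit-batch flag are my assumptions about what onnx2trt does, not the exact command I ran:

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Sketch only: build model.trt from the ONNX export (TensorRT 7.x-style API).
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, TRT_LOGGER)
with open('model.onnx', 'rb') as f:  # 'model.onnx' is a placeholder name
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
builder.max_workspace_size = 1 << 28  # arbitrary workspace size
engine = builder.build_cuda_engine(network)
with open('model.trt', 'wb') as f:
    f.write(engine.serialize())

Here is the inference script: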
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # must be imported after the line above
trt_engine_path = 'model.trt'
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
def allocate_buffers(engine):
    """
    Allocates all buffers required for the specified engine
    """
    inputs = []
    outputs = []
    bindings = []
    # Iterate over binding names in the engine
    for binding in engine:
        # Get binding (tensor/buffer) size
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        # Get binding (tensor/buffer) data type (numpy-equivalent)
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate page-locked memory (i.e., pinned memory) buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        # Allocate a linear piece of device memory
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings
        bindings.append(int(device_mem))
        # Append to the inputs/outputs list
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    # Create a stream (to eventually copy inputs/outputs and run inference)
    stream = cuda.Stream()
    return inputs, outputs, bindings, stream
def infer(context, bindings, inputs, outputs, stream, batch_size=1):
    """
    Infer outputs on the IExecutionContext for the specified inputs
    """
    # Transfer input data to the GPU
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference
    flag = context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    if flag:
        print('executed successfully.')
    # Transfer predictions back from the GPU
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return the host outputs
    return [out.host for out in outputs]
# Read the serialized ICudaEngine
with open(trt_engine_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
    # Deserialize the ICudaEngine
    engine = runtime.deserialize_cuda_engine(f.read())
    print('engine.has_implicit_batch_dimension:', engine.has_implicit_batch_dimension)
    # Now just as with the onnx2trt samples...
    # Create an IExecutionContext (context for executing inference)
    with engine.create_execution_context() as context:
        # Allocate memory for inputs/outputs
        inputs, outputs, bindings, stream = allocate_buffers(engine)
        # Set the host input to the batch of images
        images = np.random.rand(5, 112, 112, 3).astype(np.float32)
        inputs[0].host = images  # blocking (commenting out) this line still gives output for the first sample
        # Inference
        trt_outputs = infer(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=5)
        # Prediction
        #print(len(trt_outputs))
        print(trt_outputs[0].shape)
        #print(trt_outputs[0][:200])
        rt = np.reshape(trt_outputs[0], (32, -1))
        print(rt.shape)
        print(rt[:, :5])
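For comparison, the single-image case that works fine looks roughly like the sketch below. It runs inside the same with-blocks as above; single_image is a placeholder for the real preprocessed (1, 112, 112, 3) array, and the assignment is illustrative rather than my exact code:

# Single-image run (works fine): pass one preprocessed image and request batch_size=1.
single_image = np.random.rand(1, 112, 112, 3).astype(np.float32)
inputs[0].host = single_image
trt_outputs = infer(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=1)
print(trt_outputs[0][:5])  # non-zero values come back here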