Description
Hi, I am trying to run inference on multiple batches in tensorrt. However, I can not get the right output. My desired output shape for one image is [14,] and I want to run the model with batches of 32 images. When running the code below, the out of the trt_outputs is an array with shape [448] (14 * 32), but only the 14 first elements have been updated. The rest are all zero. Would very much appreciate help! I do not belive there is anything wrong with the hardware or a bug with tensorrt.
To begin I exported a tflite model that takes a 80x80 image input to onnx and specified batch size to 32 (in the onnx model). The input shape for the onnx model is therefore [32x3x80x80]. Then, I created a trt engine with the following commands:
./trtexec --onnx=/model.onnx --maxBatch=32 --fp16 --saveEngine=/engine.trt
I have also tried engines with the following command:
./trtexec --onnx=/model.onnx --fp16 --saveEngine=/engine.trt
./trtexec --onnx=/model.onnx --minShapes=input:32x3x80x80 --optShapes=input:32x3x80x80 --maxShapes=input:32x3x80x80 --fp16 --saveEngine=/engine.trt
I then use the following script to run inference:
class HostDeviceMem(object):
""" Helper data class. """
def __init__(self, host_mem, device_mem):
self.host = host_mem
self.device = device_mem
def __str__(self):
return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
def __repr__(self):
return self.__str__()
class TrtDetector(object):
COLOUR_CHANNELS = 3 # RGB
def __init__(self, engine_path, img_size):
""" Init function
@param engine_path: Path to the TensorRT serialised engine
@param img_size: Size of each image dimension
"""
self.TRT_LOGGER = trt.Logger()
self.img_size = img_size
self.engine = self.get_engine(engine_path)
self.context = self.engine.create_execution_context()
self.buffers = self.allocate_buffers(batch_size=32)
self.context.set_binding_shape(0, (1, self.COLOUR_CHANNELS, img_size, img_size))
def get_engine(self, engine_path):
""" Load serialised engine from file """
with open(engine_path, "rb") as f, trt.Runtime(self.TRT_LOGGER) as runtime:
return runtime.deserialize_cuda_engine(f.read())
def allocate_buffers(self, batch_size):
""" Allocate necessary buffers for inference on the GPU
@param batch_size: Size of the batches
@return
- inputs: buffer for inputs
- outputs: buffer for outputs
- bindings: device bindings
- stream: GPU stream, sequence of operations
"""
inputs = []
outputs = []
bindings = []
stream = cuda.Stream()
for binding in self.engine:
size = trt.volume(self.engine.get_binding_shape(binding)) * batch_size
dtype = trt.nptype(self.engine.get_binding_dtype(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings
bindings.append(int(device_mem))
# Append to the appropriate list
if self.engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
return inputs, outputs, bindings, stream
def do_inference(self, bindings, inputs, outputs, stream):
""" Inference on the GPU """
# Transfer input data to the GPU
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
# Run inference
self.context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
# Transfer predictions back from the GPU
[cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
# Synchronize the stream
stream.synchronize()
# Return only the host outputs
return [out.host for out in outputs]
def detect(self, imgs):
""" Run detections on batch of images
@param imgs: Input images
"""
# Pre-processing
images_input = pre_process(imgs)
# images_input.shape = 32x3x80x80
# Do inference on GPU
inputs, outputs, bindings, stream = self.buffers
inputs[0].host = images_input
trt_outputs = self.do_inference(
bindings=bindings, inputs=inputs, outputs=outputs, stream=stream
)
return trt_outputs
The preprocessing function mentioned in detect is as follows:
def pre_process(imgs):
""" Perform pre processing on a batch of images
@param imgs: Images to pre process
"""
imgs_for_inference = []
for img in imgs:
img = cv2.resize(img, (80, 80))
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = np.transpose(img, (2, 0, 1)).astype(np.float32)
img /= 255.0
img = np.ascontiguousarray(img)
imgs_for_inference.append(img)
imgs_for_inference = np.array(imgs_for_inference).astype('float16')
return imgs_for_inference
Then to run everything I do:
if __name__ == "__main__":
input_img_path = '/my_img.jpg'
img = cv2.imread(input_img_path)
# array of 32 images
imgs = np.array([img, img, img, img, img, img, img, img, img, img, img, img, img, img, img, img, img, img, img, img, img, img, img, img, img, img, img, img, img, img, img, img])
model = TrtDetector("/engine.trt", 80)
model.detect(imgs)