Thanks for the tip, I did the following.
- Downloaded and unzipped one of the COCO sets
http://images.cocodataset.org/zips/val2017.zip
. Sorry for the hardcoded path; I am just playing around for now — it's not production code.
- I modified the
onnx_to_tensorrt.py
- diff below. I am now measuring inference time including pre-processing and post-processing. The changes I made make the most sense to me: I am not interested in the bounding boxes being stored to an output file; I am interested in the inference time. I made sure that I instantiate all objects once, and then perform inference on an already-initialized engine.
@@ -49,7 +49,7 @@
#
from __future__ import print_function
-
+import time
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
@@ -137,49 +137,58 @@ def main():
onnx_file_path = 'yolov3.onnx'
engine_file_path = "yolov3.trt"
# Download a dog image and save it to the following file path:
- input_image_path = download_file('dog.jpg',
- 'https://github.com/pjreddie/darknet/raw/f86901f6177dfc6116360a13cc06ab680e0c86b0/data/dog.jpg', checksum_reference=None)
+ #input_image_path = download_file('dog.jpg',
+ #'https://github.com/pjreddie/darknet/raw/f86901f6177dfc6116360a13cc06ab680e0c86b0/data/dog.jpg', checksum_reference=None)
# Two-dimensional tuple with the target network's (spatial) input resolution in HW ordered
input_resolution_yolov3_HW = (608, 608)
# Create a pre-processor object by specifying the required input resolution for YOLOv3
preprocessor = PreprocessYOLO(input_resolution_yolov3_HW)
- # Load an image from the specified input path, and return it together with a pre-processed version
- image_raw, image = preprocessor.process(input_image_path)
# Store the shape of the original input image in WH format, we will need it for later
- shape_orig_WH = image_raw.size
+
+ postprocessor_args = {"yolo_masks": [(6, 7, 8), (3, 4, 5), (0, 1, 2)], # A list of 3 three-dimensional tuples for the YOLO masks
+ "yolo_anchors": [(10, 13), (16, 30), (33, 23), (30, 61), (62, 45), # A list of 9 two-dimensional tuples for the YOLO anchors
+ (59, 119), (116, 90), (156, 198), (373, 326)],
+ "obj_threshold": 0.6, # Threshold for object coverage, float value between 0 and 1
+ "nms_threshold": 0.5, # Threshold for non-max suppression algorithm, float value between 0 and 1
+ "yolo_input_resolution": input_resolution_yolov3_HW}
+
+ postprocessor = PostprocessYOLO(**postprocessor_args)
# Output shapes expected by the post-processor
output_shapes = [(1, 255, 19, 19), (1, 255, 38, 38), (1, 255, 76, 76)]
# Do inference with TensorRT
+ images = [os.path.join("/home/nhd/cocoimgs/val2017/", x) for x in os.listdir("/home/nhd/cocoimgs/val2017/") if x.endswith(".jpg")]
trt_outputs = []
+
with get_engine(onnx_file_path, engine_file_path) as engine, engine.create_execution_context() as context:
inputs, outputs, bindings, stream = common.allocate_buffers(engine)
# Do inference
- print('Running inference on image {}...'.format(input_image_path))
- # Set host input to the image. The common.do_inference function will copy the input to the GPU before executing.
- inputs[0].host = image
- trt_outputs = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
+
+ for input_image_path in images:
+ print('Running inference on image {}...'.format(input_image_path))
+ try:
+ # Load an image from the specified input path, and return it together with a pre-processed version
+ image_raw, image = preprocessor.process(input_image_path)
+ shape_orig_WH = image_raw.size
+ # Set host input to the image. The common.do_inference function will copy the input to the GPU before executing
+ time_start = time.time()
+ inputs[0].host = image
+ trt_outputs = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
- # Before doing post-processing, we need to reshape the outputs as the common.do_inference will give us flat arrays.
- trt_outputs = [output.reshape(shape) for output, shape in zip(trt_outputs, output_shapes)]
+ # Before doing post-processing, we need to reshape the outputs as the common.do_inference will give us flat arrays.
+ trt_outputs = [output.reshape(shape) for output, shape in zip(trt_outputs, output_shapes)]
- postprocessor_args = {"yolo_masks": [(6, 7, 8), (3, 4, 5), (0, 1, 2)], # A list of 3 three-dimensional tuples for the YOLO masks
- "yolo_anchors": [(10, 13), (16, 30), (33, 23), (30, 61), (62, 45), # A list of 9 two-dimensional tuples for the YOLO anchors
- (59, 119), (116, 90), (156, 198), (373, 326)],
- "obj_threshold": 0.6, # Threshold for object coverage, float value between 0 and 1
- "nms_threshold": 0.5, # Threshold for non-max suppression algorithm, float value between 0 and 1
- "yolo_input_resolution": input_resolution_yolov3_HW}
- postprocessor = PostprocessYOLO(**postprocessor_args)
+ # Run the post-processing algorithms on the TensorRT outputs and get the bounding box details of detected objects
+ boxes, classes, scores = postprocessor.process(trt_outputs, (shape_orig_WH))
+
+ print("Took {} seconds to process {}".format(time.time() - time_start, input_image_path))
+ except Exception:
+ print("Failed to process {}".format(input_image_path))
- # Run the post-processing algorithms on the TensorRT outputs and get the bounding box details of detected objects
- boxes, classes, scores = postprocessor.process(trt_outputs, (shape_orig_WH))
# Draw the bounding boxes onto the original input image and save it as a PNG file
obj_detected_img = draw_bboxes(image_raw, boxes, scores, classes, ALL_CATEGORIES)
- output_image_path = 'dog_bboxes.png'
- obj_detected_img.save(output_image_path, 'PNG')
- print('Saved image with bounding boxes of detected objects to {}.'.format(output_image_path))
if __name__ == '__main__':
main()
Here’s the output of the script:
$ python3 onnx_to_tensorrt.py
Reading engine from file yolov3.trt
Running inference on image /home/nhd/cocoimgs/val2017/000000482436.jpg...
Took 2.227811574935913 seconds to process /home/nhd/cocoimgs/val2017/000000482436.jpg
Running inference on image /home/nhd/cocoimgs/val2017/000000379453.jpg...
Took 2.041841506958008 seconds to process /home/nhd/cocoimgs/val2017/000000379453.jpg
Running inference on image /home/nhd/cocoimgs/val2017/000000474028.jpg...
Took 1.9844512939453125 seconds to process /home/nhd/cocoimgs/val2017/000000474028.jpg
Running inference on image /home/nhd/cocoimgs/val2017/000000158227.jpg...
Took 1.9523558616638184 seconds to process /home/nhd/cocoimgs/val2017/000000158227.jpg
Running inference on image /home/nhd/cocoimgs/val2017/000000114049.jpg...
Took 1.9245216846466064 seconds to process /home/nhd/cocoimgs/val2017/000000114049.jpg
Running inference on image /home/nhd/cocoimgs/val2017/000000124975.jpg...
Took 1.9166526794433594 seconds to process /home/nhd/cocoimgs/val2017/000000124975.jpg
Running inference on image /home/nhd/cocoimgs/val2017/000000068765.jpg...
Took 1.9090898036956787 seconds to process /home/nhd/cocoimgs/val2017/000000068765.jpg
Running inference on image /home/nhd/cocoimgs/val2017/000000370486.jpg...
Took 1.9432857036590576 seconds to process /home/nhd/cocoimgs/val2017/000000370486.jpg
Running inference on image /home/nhd/cocoimgs/val2017/000000372307.jpg...
Took 1.9385781288146973 seconds to process /home/nhd/cocoimgs/val2017/000000372307.jpg
Now, I need just under 2 seconds per single JPEG — in other words, I'm getting slightly more than 0.5 FPS. Is this the expected performance?
Here’s the output of tegrastats
while running this example
RAM 3854/7764MB (lfb 66x4MB) SWAP 0/3882MB (cached 0MB) CPU [1%@1420,0%@1420,82%@1420,0%@1420,off,off] EMC_FREQ 7%@1600 GR3D_FREQ 0%@1109 APE 75 MTS fg 0% bg 1% AO@43C GPU@43C PMIC@100C AUX@43.5C CPU@43.5C thermal@44.1C VDD_IN 5063/2714 VDD_CPU_GPU_CV 1878/509 VDD_SOC 1143/536
RAM 3813/7764MB (lfb 69x4MB) SWAP 0/3882MB (cached 0MB) CPU [0%@596,0%@1190,95%@1190,0%@1083,off,off] EMC_FREQ 7%@1600 GR3D_FREQ 99%@1109 APE 75 MTS fg 0% bg 2% AO@43C GPU@44.5C PMIC@100C AUX@43.5C CPU@44C thermal@43.35C VDD_IN 6043/2714 VDD_CPU_GPU_CV 2572/509 VDD_SOC 1265/536
RAM 3845/7764MB (lfb 69x4MB) SWAP 0/3882MB (cached 0MB) CPU [1%@1420,0%@1420,87%@1420,0%@1420,off,off] EMC_FREQ 7%@1600 GR3D_FREQ 0%@1109 APE 75 MTS fg 0% bg 0% AO@42.5C GPU@43C PMIC@100C AUX@43.5C CPU@43.5C thermal@43.5C VDD_IN 4940/2714 VDD_CPU_GPU_CV 1796/509 VDD_SOC 1143/536
RAM 3813/7764MB (lfb 69x4MB) SWAP 0/3882MB (cached 0MB) CPU [0%@1190,0%@1190,88%@1190,0%@1190,off,off] EMC_FREQ 10%@1600 GR3D_FREQ 99%@1109 APE 75 MTS fg 0% bg 2% AO@43C GPU@45C PMIC@100C AUX@43C CPU@43.5C thermal@43.15C VDD_IN 7676/2714 VDD_CPU_GPU_CV 3756/509 VDD_SOC 1426/536
RAM 3861/7764MB (lfb 65x4MB) SWAP 0/3882MB (cached 0MB) CPU [0%@1420,0%@1420,94%@1420,0%@1420,off,off] EMC_FREQ 6%@1600 GR3D_FREQ 0%@1109 APE 75 MTS fg 0% bg 0% AO@42.5C GPU@43C PMIC@100C AUX@43C CPU@43.5C thermal@43.15C VDD_IN 4818/2714 VDD_CPU_GPU_CV 1714/509 VDD_SOC 1143/536
RAM 3808/7764MB (lfb 67x4MB) SWAP 0/3882MB (cached 0MB) CPU [1%@1420,1%@1420,83%@1420,0%@1420,off,off] EMC_FREQ 13%@1600 GR3D_FREQ 0%@1109 APE 75 MTS fg 0% bg 2% AO@43C GPU@44.5C PMIC@100C AUX@43C CPU@43C thermal@42.85C VDD_IN 8288/2714 VDD_CPU_GPU_CV 4246/509 VDD_SOC 1467/536
RAM 3861/7764MB (lfb 66x4MB) SWAP 0/3882MB (cached 0MB) CPU [0%@1420,0%@1420,100%@1420,0%@1420,off,off] EMC_FREQ 6%@1600 GR3D_FREQ 0%@1109 APE 75 MTS fg 0% bg 3% AO@42.5C GPU@43C PMIC@100C AUX@43C CPU@43C thermal@42.85C VDD_IN 4777/2714 VDD_CPU_GPU_CV 1674/509 VDD_SOC 1143/536
RAM 3814/7764MB (lfb 67x4MB) SWAP 0/3882MB (cached 0MB) CPU [2%@1420,0%@1420,82%@1420,0%@1420,off,off] EMC_FREQ 12%@1600 GR3D_FREQ 0%@1109 APE 75 MTS fg 0% bg 1% AO@43C GPU@43.5C PMIC@100C AUX@43C CPU@43C thermal@43.15C VDD_IN 7513/2714 VDD_CPU_GPU_CV 3674/509 VDD_SOC 1386/536
RAM 3866/7764MB (lfb 66x4MB) SWAP 0/3882MB (cached 0MB) CPU [0%@1420,0%@1420,100%@1420,0%@1420,off,off] EMC_FREQ 6%@1600 GR3D_FREQ 0%@1109 APE 75 MTS fg 0% bg 7% AO@42C GPU@42.5C PMIC@100C AUX@42.5C CPU@43C thermal@43.15C VDD_IN 4703/2714 VDD_CPU_GPU_CV 1633/509 VDD_SOC 1102/536
RAM 3810/7764MB (lfb 68x4MB) SWAP 0/3882MB (cached 0MB) CPU [1%@1420,0%@1420,82%@1420,0%@1420,off,off] EMC_FREQ 11%@1600 GR3D_FREQ 0%@1109 APE 75 MTS fg 0% bg 1% AO@42.5C GPU@43C PMIC@100C AUX@42.5C CPU@43C thermal@42.5C VDD_IN 6900/2714 VDD_CPU_GPU_CV 3225/509 VDD_SOC 1306/536
RAM 3861/7764MB (lfb 67x4MB) SWAP 0/3882MB (cached 0MB) CPU [0%@1420,0%@1420,100%@1420,0%@1420,off,off] EMC_FREQ 5%@1600 GR3D_FREQ 0%@1109 APE 75 MTS fg 0% bg 0% AO@42C GPU@42.5C PMIC@100C AUX@42.5C CPU@43C thermal@42.65C VDD_IN 4614/2714 VDD_CPU_GPU_CV 1554/509 VDD_SOC 1102/536
RAM 3814/7764MB (lfb 67x4MB) SWAP 0/3882MB (cached 0MB) CPU [1%@1420,0%@1420,82%@1420,0%@1420,off,off] EMC_FREQ 10%@1600 GR3D_FREQ 0%@1109 APE 75 MTS fg 0% bg 3% AO@42C GPU@42.5C PMIC@100C AUX@42.5C CPU@43C thermal@42.65C VDD_IN 6410/2714 VDD_CPU_GPU_CV 2899/509 VDD_SOC 1265/536
RAM 3878/7764MB (lfb 63x4MB) SWAP 0/3882MB (cached 0MB) CPU [1%@1420,0%@1420,100%@1420,0%@1420,off,off] EMC_FREQ 5%@1600 GR3D_FREQ 0%@1109 APE 75 MTS fg 0% bg 0% AO@42C GPU@42.5C PMIC@100C AUX@42.5C CPU@42.5C thermal@42.65C VDD_IN 4580/2714 VDD_CPU_GPU_CV 1554/509 VDD_SOC 1102/536
RAM 3803/7764MB (lfb 67x4MB) SWAP 0/3882MB (cached 0MB) CPU [0%@1420,0%@1420,82%@1420,0%@1420,off,off] EMC_FREQ 10%@1600 GR3D_FREQ 0%@1109 APE 75 MTS fg 0% bg 4% AO@42C GPU@42.5C PMIC@100C AUX@42.5C CPU@42.5C thermal@43.1C VDD_IN 5961/2714 VDD_CPU_GPU_CV 2531/509 VDD_SOC 1224/536
RAM 3870/7764MB (lfb 66x4MB) SWAP 0/3882MB (cached 0MB) CPU [3%@1420,0%@1420,100%@1420,0%@1420,off,off] EMC_FREQ 5%@1600 GR3D_FREQ 0%@1109 APE 75 MTS fg 0% bg 1% AO@41.5C GPU@42C PMIC@100C AUX@42C CPU@42.5C thermal@42.35C VDD_IN 4539/2714 VDD_CPU_GPU_CV 1513/509 VDD_SOC 1102/536
I was hoping to get a few FPS out of this.