Hi,
I ran the dense optical flow example in the VPI library, using
/opt/nvidia/vpi2/samples/assets/dashcam.mp4
as input. However, VPI's performance does not seem very good: each optical flow calculation takes about 23 ms, which is far slower than the 3.18±0.01 ms claimed in the documentation. I have set my board to its maximum power mode and the fan to its maximum speed. I wonder why this is happening, as 23 ms is unacceptable for my use case.
Here is my test code:
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import cv2
import sys
import vpi
import numpy as np
from os import path
from argparse import ArgumentParser
from contextlib import contextmanager
import time
# ----------------------------
# Some utility functions
def process_motion_vectors(mv):
    """Render a motion-vector field as a BGR8 color image.

    The vector angle (in degrees) becomes the hue, the saturation is
    fixed at 1, and the vector magnitude, capped at 5.0 and normalized
    by that cap, becomes the value channel.

    Args:
        mv: Object whose ``rlock_cpu()`` context manager yields the
            two-channel motion vectors in S10.5 fixed-point format.

    Returns:
        ``uint8`` array of shape (rows, cols, 3) holding the BGR image.
    """
    max_magnitude = 5.0

    # S10.5 carries 5 fractional bits, so scaling by 1/32 (a power of
    # two, hence exact) gives the vector components as float32.
    with mv.rlock_cpu() as raw:
        vectors = np.asarray(raw, dtype=np.float32) * (1.0 / 32)

    # Polar form of each vector: its length and its direction in degrees.
    dx, dy = vectors[:, :, 0], vectors[:, :, 1]
    magnitude, hue = cv2.cartToPolar(dx, dy, angleInDegrees=True)

    # Cap the magnitude at max_magnitude (values above it are truncated).
    _, magnitude = cv2.threshold(magnitude, max_magnitude, max_magnitude,
                                 cv2.THRESH_TRUNC)

    # Stack hue / full saturation / normalized magnitude into one HSV image.
    hsv = np.dstack((hue, np.ones_like(hue), magnitude / max_magnitude))

    # Convert HSV to BGR and rescale the [0, 1] floats to 8 bits.
    bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
    return (bgr * 255).astype(np.uint8)
# ----------------------------
# Parse command line arguments

parser = ArgumentParser()
parser.add_argument('backend', choices=['nvenc'],
                    help='Backend to be used for processing')
parser.add_argument('input',
                    help='Input video to be processed')
parser.add_argument('quality', choices=['low', 'medium', 'high'],
                    help='Quality setting')
args = parser.parse_args()

# 'nvenc' is the only backend choice the parser accepts.
assert args.backend == 'nvenc'
backend = vpi.Backend.NVENC

# Translate the quality keyword into the matching VPI enumerator; the
# parser's `choices` restricts the keyword to these three values.
quality = {
    'low': vpi.OptFlowQuality.LOW,
    'medium': vpi.OptFlowQuality.MEDIUM,
    'high': vpi.OptFlowQuality.HIGH,
}[args.quality]
# -----------------------------
# Open input and output videos

inVideo = cv2.VideoCapture(args.input)
if not inVideo.isOpened():
    # Without this check an unreadable input makes the first read fail,
    # and the script ends silently with an empty output video.
    sys.exit(f"Can't open input video '{args.input}'")

# OpenCV's FFMPEG backend reports the 'MPEG' tag as unsupported for the
# mp4 container and falls back to 'mp4v'; request that tag directly.
fourcc = cv2.VideoWriter_fourcc(*'mp4v')

inSize = (int(inVideo.get(cv2.CAP_PROP_FRAME_WIDTH)),
          int(inVideo.get(cv2.CAP_PROP_FRAME_HEIGHT)))
fps = inVideo.get(cv2.CAP_PROP_FPS)

if backend == vpi.Backend.NVENC:
    # NVENC always returns 1/4th resolution
    outSize = (inSize[0] // 4, inSize[1] // 4)
else:
    outSize = inSize

# The output name encodes the Python major version and the backend name.
outVideo = cv2.VideoWriter(
    f"denseoptflow_mv_python{sys.version_info[0]}_{args.backend}.mp4",
    fourcc, fps, outSize)
#---------------------------------
# Main processing loop

prevFrame = None
idFrame = 0

try:
    while True:
        # Read one input frame; stop when no more frames can be read.
        ret, cvFrame = inVideo.read()
        if not ret:
            break

        # Convert it to the NV12_ER_BL format required by the algorithm.
        # No single backend can convert from OpenCV's BGR8 to NV12_ER_BL,
        # so it is done in two steps: CUDA first, then VIC.
        curFrame = vpi.asimage(cvFrame, vpi.Format.BGR8) \
            .convert(vpi.Format.NV12_ER, backend=vpi.Backend.CUDA) \
            .convert(vpi.Format.NV12_ER_BL, backend=vpi.Backend.VIC)

        # Need at least 2 frames to start processing
        if prevFrame is not None:
            print("Processing frame {}".format(idFrame))

            # Calculate the motion vectors from previous to current frame
            with backend:
                # perf_counter() is monotonic and has the highest
                # available resolution, unlike time.time(), which can
                # jump when the system clock is adjusted.
                # NOTE(review): this interval is the wall-clock duration
                # of the Python call only. Whether that matches the work
                # counted in the documented benchmark figure cannot be
                # told from this script -- confirm against VPI's
                # benchmark methodology before comparing numbers.
                start = time.perf_counter()
                motion_vectors = vpi.optflow_dense(prevFrame, curFrame,
                                                   quality=quality)
                end = time.perf_counter()
            print(f"infer time is :{end-start}")

            # Turn motion vectors into an image
            motion_image = process_motion_vectors(motion_vectors)

            # Save it to output video
            outVideo.write(motion_image)

        # Prepare next iteration
        prevFrame = curFrame
        idFrame += 1
finally:
    # Release the capture and finalize the output file even if
    # processing raised part-way through.
    inVideo.release()
    outVideo.release()
Output:
xavier@ubuntu:/media/xavier/xavier/projects_zxd$ cd /media/xavier/xavier/projects_zxd ; /usr/bin/env /bin/python3 /home/xavier/.vscode-server/extensions/ms-python.python-2023.4.1/pythonFiles/lib/python/debugpy/adapter/../../debugpy/launcher 41099 -- /media/xavier/xavier/projects_zxd/NVIDIA_VPI-2.2-samples/13-optflow_dense/main.py nvenc /opt/nvidia/vpi2/samples/assets/dashcam.mp4 high
OpenCV: FFMPEG: tag 0x4745504d/'MPEG' is not supported with codec id 2 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'
Processing frame 1
infer time is :0.16901803016662598
Processing frame 2
infer time is :0.13559317588806152
Processing frame 3
infer time is :0.03138113021850586
Processing frame 4
infer time is :0.023535728454589844
Processing frame 5
infer time is :0.023661375045776367
Processing frame 6
infer time is :0.02332592010498047
Processing frame 7
infer time is :0.023415327072143555
Processing frame 8
infer time is :0.023112773895263672
Processing frame 9
infer time is :0.0230710506439209
Processing frame 10
infer time is :0.024511098861694336
Processing frame 11
infer time is :0.02801513671875
Processing frame 12
infer time is :0.023573875427246094
Processing frame 13
infer time is :0.02794337272644043
Processing frame 14
infer time is :0.024110794067382812
Processing frame 15
infer time is :0.024527549743652344
Processing frame 16
infer time is :0.02358555793762207
Processing frame 17
infer time is :0.04119753837585449
Processing frame 18
infer time is :0.04546713829040527
Processing frame 19
infer time is :0.043999671936035156
Processing frame 20
infer time is :0.04671454429626465