Description
ONNX models from the model zoo produce poor results in DeepStream (low FPS, stuttering output; the actual annotations are good)
Hi, we're looking to run YOLOv4 object detection models in DeepStream, but it's not working at the moment. Our current process is:
- Download a YOLOv4 model from the ONNX Model Zoo (GitHub - onnx/models: A collection of pre-trained, state-of-the-art models in the ONNX format)
- Convert it with trtexec on the target device (Jetson NX running JetPack 4.6, DeepStream 6.0):
/usr/src/tensorrt/bin/trtexec --onnx=/data/models/yolov4_onnx.onnx --saveEngine=/data/models/yolov4_coco_dynamic_kxm.engine --explicitBatch --minShapes=input:1x3x416x416 --optShapes=input:4x3x416x416 --maxShapes=input:16x3x416x416
This works, and I can test it with:
/usr/src/tensorrt/bin/trtexec --loadEngine=/data/models/yolov4_coco_kxm.engine --batch=4 --iterations=100 --avgRuns=10 --dumpProfile --dumpOutput --useCudaGraph
All okay.
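As a sanity check, the tensor name used in the --minShapes/--optShapes/--maxShapes flags ("input") has to match the ONNX graph's actual input. A minimal sketch for printing it, assuming the onnx Python package is installed on the device:

# Sketch: print the graph inputs so the shape-flag key ("input") and the
# 3x416x416 dims can be checked against the model. Path is from the
# trtexec command above; `pip install onnx` provides the package.
import onnx

model = onnx.load("/data/models/yolov4_onnx.onnx")
for inp in model.graph.input:
    dims = [d.dim_value if d.dim_value > 0 else d.dim_param
            for d in inp.type.tensor_type.shape.dim]
    print(inp.name, dims)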
However, when I run it in DeepStream with mp4 inputs, the output stutters (it runs for maybe 0.5-1 second and then pauses for a bit), and the FPS is very low (10-15 fps when the input videos are 30 fps).
This is my config:
[application]
enable-perf-measurement=1
perf-measurement-interval-sec=5
#gie-kitti-output-dir=streamscl
# output display details
[tiled-display]
enable=1
rows=2
columns=2
width=1920
height=1080
gpu-id=0
#(0): nvbuf-mem-default - Default memory allocated, specific to particular platform
#(1): nvbuf-mem-cuda-pinned - Allocate Pinned/Host cuda memory, applicable for Tesla
#(2): nvbuf-mem-cuda-device - Allocate Device cuda memory, applicable for Tesla
#(3): nvbuf-mem-cuda-unified - Allocate Unified cuda memory, applicable for Tesla
#(4): nvbuf-mem-surface-array - Allocate Surface Array memory, applicable for Jetson
nvbuf-memory-type=0
# mp4 video source
[source0]
enable=1
#Type - 1=CameraV4L2 2=URI 3=MultiURI 4=RTSP
type=3
uri=file:///data/videos-test/RowdenCarpark2.mp4
num-sources=2
gpu-id=0
cudadec-memtype=0
source-id=0
camera-width=1280
camera-height=720
[source1]
enable=1
#Type - 1=CameraV4L2 2=URI 3=MultiURI
type=3
uri=file:///opt/nvidia/deepstream/deepstream-6.0/samples/streams/sample_1080p_h264.mp4
#uri=file:///home/tushar/sample_0_720p.mp4
num-sources=2
gpu-id=0
nvbuf-memory-type=0
# rtsp video source
[source2]
enable=0
type=4
#latency=30000
#drop-on-latency=false
#drop-frame-interval=3
buffer-size=5000000
uri=
cudadec-memtype=0
source-id=0
# rtsp video out
[sink0]
enable=1
#Type - 1=FakeSink 2=EglSink 3=File 4=RTSPStreaming
type=4
#1=h264 2=h265
codec=1
#encoder type 0=Hardware 1=Software
enc-type=0
sync=0
bitrate=10000000
#bitrate=2700000
#H264 Profile - 0=Baseline 2=Main 4=High
#H265 Profile - 0=Main 1=Main10
profile=0
# set below properties in case of RTSPStreaming
rtsp-port=8556
udp-port=5400
#source-id=0
# mp4 out
[sink1]
enable=1
type=3
#1=mp4 2=mkv
container=1
enc-type=0
#1=h264 2=h265 3=mpeg4
# only SW mpeg4 is supported right now.
codec=1
sync=1
bitrate=4000000
profile=0
output-file=/data/videos-out/21112023_093556_RowdenCarpark2.mp4
source-id=0
[sink2]
enable=0
#Type - 1=FakeSink 2=EglSink 3=File 4=UDPSink 5=nvoverlaysink 6=MsgConvBroker
type=6
msg-conv-config=redis_msg_config.txt
#(0): PAYLOAD_DEEPSTREAM - Deepstream schema payload
#(1): PAYLOAD_DEEPSTREAM_MINIMAL - Deepstream schema payload minimal
#(256): PAYLOAD_RESERVED - Reserved type
#(257): PAYLOAD_CUSTOM - Custom schema payload
msg-conv-payload-type=0
msg-conv-msg2p-new-api=1
msg-conv-frame-interval=100
#msg-broker-proto-lib=/opt/nvidia/deepstream/deepstream-6.0/lib/libnvds_kafka_proto.so
msg-broker-proto-lib=/opt/nvidia/deepstream/deepstream-6.0/lib/libnvds_redis_proto.so
#Provide your msg-broker-conn-str here
msg-broker-conn-str=localhost;6379
#topic=deepstream_detection_messages
topic=metadata
#Optional:
msg-broker-config=/opt/nvidia/deepstream/deepstream/sources/libs/redis_protocol_adaptor/cfg_redis.txt
# on screen display
[osd]
enable=1
gpu-id=0
border-width=1
text-size=15
text-color=1;1;1;1;
text-bg-color=0.3;0.3;0.3;1
font=Arial
show-clock=0
clock-x-offset=800
clock-y-offset=820
clock-text-size=12
clock-color=1;0;0;0
nvbuf-memory-type=0
# stream mux - forms batches of frames from multiple input sources
[streammux]
gpu-id=0
##Boolean property to inform muxer that sources are live
live-source=1
batch-size=4
##time out in usec, to wait after the first buffer is available
##to push the batch even if the complete batch is not formed
batched-push-timeout=33333
## Set muxer output width and height
width=1280
height=720
#enable to maintain aspect ratio wrt source, and allow black borders, works
##along with width, height properties
enable-padding=0
nvbuf-memory-type=0
## If set to TRUE, system timestamp will be attached as ntp timestamp
## If set to FALSE, ntp timestamp from rtspsrc, if available, will be attached
attach-sys-ts-as-ntp=1
# primary gpu inference engine (model)
[primary-gie]
enable=1
bbox-border-color0=1;0;0;1
bbox-border-color1=0;1;1;1
bbox-border-color2=0;1;1;1
bbox-border-color3=0;1;0;1
nvbuf-memory-type=0
config-file=detector_config.txt
[tracker]
enable=1
# For NvDCF and DeepSORT tracker, tracker-width and tracker-height must be a multiple of 32, respectively
tracker-width=320
tracker-height=256
ll-lib-file=/opt/nvidia/deepstream/deepstream-6.0/lib/libnvds_nvmultiobjecttracker.so
# ll-config-file required to set different tracker types
#ll-config-file=/opt/nvidia/deepstream/deepstream-DEEPSTREAM_VER/samples/configs/deepstream-app/config_tracker_IOU.yml
ll-config-file=/opt/nvidia/deepstream/deepstream-6.0/samples/configs/deepstream-app/config_tracker_NvDCF_perf.yml
#ll-config-file=/opt/nvidia/deepstream/deepstream-DEEPSTREAM_VER/samples/configs/deepstream-app/config_tracker_NvDCF_accuracy.yml
#ll-config-file=/opt/nvidia/deepstream/deepstream-DEEPSTREAM_VER/samples/configs/deepstream-app/config_tracker_DeepSORT.yml
gpu-id=0
enable-batch-process=1
enable-past-frame=1
display-tracking-id=1
# secondary gpu inference engine (model)
[secondary-gie]
enable=0
gpu-id=0
batch-size=1
# 0=FP32, 1=INT8, 2=FP16 mode
nvbuf-memory-type=0
config-file=classifier_config.txt
gie-unique-id=2
operate-on-gie-id=1
[tests]
file-loop=0
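For reference on the [streammux] values above: batched-push-timeout is in microseconds, and 33333 corresponds to one frame interval at the 30 fps input rate (the muxer waits at most that long before pushing an incomplete batch). A minimal sketch of the arithmetic, assuming 30 fps sources as stated above:

# Sketch: batched-push-timeout (us) as one frame interval at the source fps.
fps = 30
timeout_us = round(1_000_000 / fps)
print(timeout_us)  # 33333, matching batched-push-timeout above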
And this is my detector_config.txt:
[property]
gpu-id=0
model-engine-file=/data/models/yolov4_coco_kxm.engine
batch-size=4
gie-unique-id=1
maintain-aspect-ratio=1
symmetric-padding=0
network-mode=0
process-mode=1
network-type=0
interval=4
engine-create-func-name=NvDsInferYoloCudaEngineGet
force-implicit-batch-dim=1
# from models.json
net-scale-factor=0.003921569790691137
labelfile-path=/data/labels/coco.txt
num-detected-classes=80
cluster-mode=3
#parse-bbox-func-name=NvDsInferParseCustomYoloV3
#custom-lib-path=/opt/nvidia/deepstream/deepstream-6.0/sources/objectDetector_Yolo/nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
#infer-dims=3;544;960
#output-blob-names=BatchedNMS
#parse-bbox-func-name=NvDsInferParseCustomBatchedNMSTLT
#custom-lib-path=/opt/nvidia/deepstream/deepstream-6.0/sources/deepstream_tlt_apps/post_processor/libnvds_infercustomparser_tlt.so
parse-bbox-func-name=NvDsInferParseYolo
custom-lib-path=/opt/nvidia/deepstream/deepstream-6.0/sources/DeepStream-Yolo/nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
#parse-bbox-func-name=NvDsInferParseCustomYoloV4
#custom-lib-path=/opt/nvidia/deepstream/deepstream-6.0/sources/objectDetector_Yolo/nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
#custom-lib-path=/yolo_deepstream/deepstream_yolo/nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
model-color-format=0
[class-attrs-all]
topk=20
nms-iou-threshold=0.5
roi-top-offset=0
roi-bottom-offset=0
detected-min-w=0
detected-min-h=0
detected-max-w=0
detected-max-h=0
# from models.json
pre-cluster-threshold=0.7
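For reference on two of the values above: net-scale-factor is 1/255, the multiplier in nvinfer's y = net-scale-factor * (x - mean) preprocessing, mapping pixels from [0, 255] to [0, 1]; and interval=4 should mean inference runs on every 5th batch, with detections reused in between. A minimal sketch of the scale-factor arithmetic:

# Sketch: the net-scale-factor above is just 1/255.
scale = 1 / 255
print(scale)  # 0.00392156862745098 ~ 0.003921569790691137 in the config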
Thanks in advance!
Environment
TensorRT Version: v8.0.1
GPU Type: Jetson NX
Nvidia Driver Version:
CUDA Version: 10.2
CUDNN Version:
Operating System + Version: JetPack 4.6
Python Version (if applicable):
TensorFlow Version (if applicable):
PyTorch Version (if applicable):
Baremetal or Container (if container which image + tag):