Loading model in deepstream-triton-5.0

Please provide complete information as applicable to your setup.

• Hardware Platform: GPU
• DeepStream Version: 5.0
• TensorRT Version: 7.0
• NVIDIA GPU Driver Version: 450

I want to load a YOLOv4 model in DeepStream 5.0 with the Triton Inference Server.

I generated the YOLOv4 TensorRT engine with the NGC TensorRT container 20.03 using the jkjung repo.
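For reference, the steps were roughly the ones from that repo (I am assuming jkjung-avt/tensorrt_demos here; script names and options may differ between versions):

# build the YOLO layer plugin (produces libyolo_layer.so)
cd tensorrt_demos/plugins && make

# convert darknet weights -> ONNX -> TensorRT engine
cd ../yolo
python3 yolo_to_onnx.py -m yolov4-608
python3 onnx_to_tensorrt.py -m yolov4-608

# the resulting engine is then renamed to model.plan and placed in the Triton model repository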

config_yolov4.txt


infer_config {
  unique_id: 5
  gpu_ids: [0]
  max_batch_size: 1
  backend {
    inputs: [ {
      name: "000_net"
    }]
    outputs: [
      {name: "(Unnamed Layer* 506) [PluginV2IOExt]_output_0"},
      {name: "(Unnamed Layer* 507) [PluginV2IOExt]_output_0"},
      {name: "(Unnamed Layer* 508) [PluginV2IOExt]_output_0"}
    ]
    trt_is {
      model_name: "yolo-1"
      version: -1
      model_repo {
        root: "/yolo/sec-models"
        log_level: 3
        strict_model_config: true
      }
    }
  }

  preprocess {
    network_format: IMAGE_FORMAT_RGB
    tensor_order: TENSOR_ORDER_LINEAR
    tensor_name: "000_net"
    maintain_aspect_ratio: 0
    frame_scaling_hw: FRAME_SCALING_HW_DEFAULT
    frame_scaling_filter: 1
    normalize {
      scale_factor: 1.0
      channel_offsets: [0, 0, 0]
    }
  }

  postprocess {
    labelfile_path: "/yolo/sec-models/yolo-1/labels.txt"
    detection {
      num_detected_classes: 8
      nms {
        confidence_threshold: 0.5
        iou_threshold: 0.3
      }
    }
  }

  extra {
    copy_input_to_host_buffers: false
  }

  custom_lib {
    path: "/yolo/libyolo_layer.so"
  }
}
input_control {
  process_mode: PROCESS_MODE_FULL_FRAME
  interval: 0
}

output_control {
  detect_control {
    default_filter { bbox_filter { min_width: 32, min_height: 32 } }
  }
}

deepstream-app.txt

[application]
enable-perf-measurement=1
perf-measurement-interval-sec=5
#gie-kitti-output-dir=kitti-trtis

[tiled-display]
enable=0
rows=1
columns=1
width=1280
height=720
gpu-id=0
#(0): nvbuf-mem-default - Default memory allocated, specific to particular platform
#(1): nvbuf-mem-cuda-pinned - Allocate Pinned/Host cuda memory applicable for Tesla
#(2): nvbuf-mem-cuda-device - Allocate Device cuda memory applicable for Tesla
#(3): nvbuf-mem-cuda-unified - Allocate Unified cuda memory applicable for Tesla
#(4): nvbuf-mem-surface-array - Allocate Surface Array memory, applicable for Jetson
nvbuf-memory-type=0

[source0]
enable=1
#Type - 1=CameraV4L2 2=URI 3=MultiURI 4=RTSP
type=3
uri=file:///yolo/internal_test.mp4
num-sources=1
#drop-frame-interval=2
gpu-id=0
# (0): memtype_device   - Memory type Device
# (1): memtype_pinned   - Memory type Host Pinned
# (2): memtype_unified  - Memory type Unified
cudadec-memtype=0

[sink0]
enable=1
#Type - 1=FakeSink 2=EglSink 3=File
type=1
sync=0
source-id=0
gpu-id=0
nvbuf-memory-type=0

[sink1]
enable=1
type=3
#1=mp4 2=mkv
container=1
#1=h264 2=h265
codec=1
sync=0
#iframeinterval=10
bitrate=2000000
output-file=out.mp4
source-id=0

[sink2]
enable=0
#Type - 1=FakeSink 2=EglSink 3=File 4=RTSPStreaming
type=4
#1=h264 2=h265
codec=1
sync=0
bitrate=4000000
# set below properties in case of RTSPStreaming
rtsp-port=8554
udp-port=5400

[osd]
enable=1
gpu-id=0
border-width=1
text-size=15
text-color=1;1;1;1;
text-bg-color=0.3;0.3;0.3;1
font=Serif
show-clock=0
clock-x-offset=800
clock-y-offset=820
clock-text-size=12
clock-color=1;0;0;0
nvbuf-memory-type=0

[streammux]
gpu-id=0
##Boolean property to inform muxer that sources are live
live-source=0
batch-size=1
##time out in usec, to wait after the first buffer is available
##to push the batch even if the complete batch is not formed
batched-push-timeout=40000
## Set muxer output width and height
width=1920
height=1080
##Enable to maintain aspect ratio wrt source, and allow black borders, works
##along with width, height properties
enable-padding=0
nvbuf-memory-type=0

# config-file property is mandatory for any gie section.
# Other properties are optional and if set will override the properties set in
# the infer config file.
[primary-gie]
enable=1
#(0): nvinfer; (1): nvinferserver
plugin-type=1
#infer-raw-output-dir=trtis-output
batch-size=1
interval=0
gie-unique-id=1
config-file=config_yolov4.txt

[tests]
file-loop=0

triton-config.pbtxt


name: "yolo-1"
platform: "tensorrt_plan"
max_batch_size: 1
input [
  {
    name: "000_net"
    data_type: TYPE_FP32
    dims: [1, 3, 608, 608 ]
  }
]
output [
  {
    name: "(Unnamed Layer* 506) [PluginV2IOExt]_output_0"
    data_type: TYPE_FP32
    dims: [1,121296,1,1]
    
  },
  {
    name: "(Unnamed Layer* 507) [PluginV2IOExt]_output_0"
    data_type: TYPE_FP32
    dims: [1,30324,1,1]
  },
  {
    name: "(Unnamed Layer* 508) [PluginV2IOExt]_output_0"
    data_type: TYPE_FP32
    dims: [1,7581,1,1]
  }
]
instance_group [
  {
    count: 1
    kind: KIND_GPU
  }
]
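
For reference, the Triton model repository is laid out like this (paths taken from the configs and the startup log above):

/yolo/sec-models/
└── yolo-1/
    ├── config.pbtxt
    ├── labels.txt
    └── 1/
        └── model.plan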

error

root@aec8e9bcb8ea:/yolo/samples/configs/deepstream-app-trtis# deepstream-app -c source1_primary_detector.txt 
2021-01-29 13:30:24.897592: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.2
0:00:03.024388887  1883 0x7f7af0002390 WARN           nvinferserver gstnvinferserver_impl.cpp:248:validatePluginConfig:<primary_gie> warning: Configuration file unique-id reset to: 1
I0129 13:30:26.978691 1883 metrics.cc:164] found 1 GPUs supporting NVML metrics
I0129 13:30:26.984180 1883 metrics.cc:173]   GPU 0: Tesla P100-SXM2-16GB
I0129 13:30:26.984465 1883 server.cc:120] Initializing Triton Inference Server
I0129 13:30:27.103776 1883 pinned_memory_manager.cc:197] Pinned memory pool is created at '0x7f7acc000000' with size 268435456
I0129 13:30:27.104132 1883 graphdef_backend_factory.cc:48] Create GraphDefBackendFactory
I0129 13:30:27.104166 1883 savedmodel_backend_factory.cc:48] Create SavedModelBackendFactory
I0129 13:30:27.104198 1883 netdef_backend_factory.cc:46] Create NetDefBackendFactory
I0129 13:30:27.104211 1883 plan_backend_factory.cc:48] Create PlanBackendFactory
I0129 13:30:27.104222 1883 plan_backend_factory.cc:55] Registering TensorRT Plugins
I0129 13:30:27.104252 1883 logging.cc:52] Plugin creator registration succeeded - ::GridAnchor_TRT
I0129 13:30:27.104274 1883 logging.cc:52] Plugin creator registration succeeded - ::NMS_TRT
I0129 13:30:27.104295 1883 logging.cc:52] Plugin creator registration succeeded - ::Reorg_TRT
I0129 13:30:27.104310 1883 logging.cc:52] Plugin creator registration succeeded - ::Region_TRT
I0129 13:30:27.104331 1883 logging.cc:52] Plugin creator registration succeeded - ::Clip_TRT
I0129 13:30:27.104346 1883 logging.cc:52] Plugin creator registration succeeded - ::LReLU_TRT
I0129 13:30:27.104359 1883 logging.cc:52] Plugin creator registration succeeded - ::PriorBox_TRT
I0129 13:30:27.104373 1883 logging.cc:52] Plugin creator registration succeeded - ::Normalize_TRT
I0129 13:30:27.104395 1883 logging.cc:52] Plugin creator registration succeeded - ::RPROI_TRT
I0129 13:30:27.104413 1883 logging.cc:52] Plugin creator registration succeeded - ::BatchedNMS_TRT
I0129 13:30:27.104424 1883 logging.cc:52] Plugin creator registration succeeded - ::FlattenConcat_TRT
I0129 13:30:27.104435 1883 logging.cc:52] Plugin creator registration succeeded - ::CropAndResize
I0129 13:30:27.104444 1883 logging.cc:52] Plugin creator registration succeeded - ::DetectionLayer_TRT
I0129 13:30:27.104452 1883 logging.cc:52] Plugin creator registration succeeded - ::Proposal
I0129 13:30:27.104466 1883 logging.cc:52] Plugin creator registration succeeded - ::ProposalLayer_TRT
I0129 13:30:27.104476 1883 logging.cc:52] Plugin creator registration succeeded - ::PyramidROIAlign_TRT
I0129 13:30:27.104492 1883 logging.cc:52] Plugin creator registration succeeded - ::ResizeNearest_TRT
I0129 13:30:27.104502 1883 logging.cc:52] Plugin creator registration succeeded - ::Split
I0129 13:30:27.104514 1883 logging.cc:52] Plugin creator registration succeeded - ::SpecialSlice_TRT
I0129 13:30:27.104530 1883 logging.cc:52] Plugin creator registration succeeded - ::InstanceNormalization_TRT
I0129 13:30:27.104542 1883 onnx_backend_factory.cc:53] Create OnnxBackendFactory
I0129 13:30:27.114051 1883 libtorch_backend_factory.cc:46] Create LibTorchBackendFactory
I0129 13:30:27.114091 1883 custom_backend_factory.cc:46] Create CustomBackendFactory
I0129 13:30:27.114103 1883 ensemble_backend_factory.cc:47] Create EnsembleBackendFactory
I0129 13:30:27.115278 1883 server_status.cc:55] New status tracking for model 'yolo-1'
I0129 13:30:27.115373 1883 model_repository_manager.cc:568] AsyncLoad() 'yolo-1'
I0129 13:30:27.115399 1883 model_repository_manager.cc:622] TriggerNextAction() 'yolo-1' version 1: 1
I0129 13:30:27.115411 1883 model_repository_manager.cc:660] Load() 'yolo-1' version 1
I0129 13:30:27.115421 1883 model_repository_manager.cc:680] loading: yolo-1:1
I0129 13:30:27.115545 1883 model_repository_manager.cc:735] CreateInferenceBackend() 'yolo-1' version 1
W0129 13:30:28.986633 1883 metrics.cc:276] failed to get energy consumption for GPU 0, NVML_ERROR 3
I0129 13:30:30.576018 1883 logging.cc:52] Deserialize required 2948784 microseconds.
I0129 13:30:30.576063 1883 plan_backend.cc:267] Creating instance yolo-1_0_0_gpu0 on GPU 0 (6.0) using model.plan
W0129 13:30:30.579901 1883 logging.cc:46] Current optimization profile is: 0. Please ensure there are no enqueued operations pending in this context prior to switching profiles
I0129 13:30:30.579956 1883 plan_backend.cc:453] Detected 000_net as execution binding for yolo-1
I0129 13:30:30.579978 1883 plan_backend.cc:453] Detected (Unnamed Layer* 506) [PluginV2IOExt]_output_0 as execution binding for yolo-1
I0129 13:30:30.580005 1883 plan_backend.cc:453] Detected (Unnamed Layer* 507) [PluginV2IOExt]_output_0 as execution binding for yolo-1
I0129 13:30:30.580018 1883 plan_backend.cc:453] Detected (Unnamed Layer* 508) [PluginV2IOExt]_output_0 as execution binding for yolo-1
I0129 13:30:30.580380 1883 plan_backend.cc:599] Created instance yolo-1_0_0_gpu0 on GPU 0 with stream priority 0
I0129 13:30:30.580506 1883 dynamic_batch_scheduler.cc:233] Starting dynamic-batch scheduler thread 0 at nice 5...
I0129 13:30:30.580601 1883 plan_backend.cc:298] plan backend for yolo-1
name=yolo-1
contexts:
  name=yolo-1_0_0_gpu0, gpu=0, max_batch_size=1
  bindings:
    0: max possible byte_size=4435968, buffer=0x7f7a77000000 ]
    1: max possible byte_size=485184, buffer=0x7f7aa7c4c000 ]
    2: max possible byte_size=121296, buffer=0x7f7aa7cc2800 ]
    3: max possible byte_size=30324, buffer=0x7f7aa7ce0200 ]

I0129 13:30:30.592899 1883 model_repository_manager.cc:837] successfully loaded 'yolo-1' version 1
I0129 13:30:30.592927 1883 model_repository_manager.cc:622] TriggerNextAction() 'yolo-1' version 1: 0
I0129 13:30:30.592943 1883 model_repository_manager.cc:637] no next action, trigger OnComplete()
I0129 13:30:30.592988 1883 model_repository_manager.cc:492] GetVersionStates() 'yolo-1'
INFO: infer_trtis_backend.cpp:206 TrtISBackend id:1 initialized model: yolo-1
I0129 13:30:30.595602 1883 model_repository_manager.cc:511] GetInferenceBackend() 'yolo-1' version -1
I0129 13:30:30.595713 1883 plan_backend.cc:1499] Running yolo-1_0_0_gpu0 with 1 request payloads
I0129 13:30:30.595742 1883 plan_backend.cc:2131] Optimization profile default [0] is selected for yolo-1_0_0_gpu0
I0129 13:30:30.595800 1883 plan_backend.cc:1744] Context with profile default [0] is being executed for yolo-1_0_0_gpu0

Runtime commands:
	h: Print this help
	q: Quit

	p: Pause
	r: Resume


**PERF:  FPS 0 (Avg)	
**PERF:  0.00 (0.00)	
** INFO: <bus_callback:181>: Pipeline ready

WARNING from primary_gie: Configuration file unique-id reset to: 1
Debug info: gstnvinferserver_impl.cpp(248): validatePluginConfig (): /GstPipeline:pipeline/GstBin:primary_gie_bin/GstNvInferServer:primary_gie
** INFO: <bus_callback:167>: Pipeline running

I0129 13:30:30.836391 1883 model_repository_manager.cc:511] GetInferenceBackend() 'yolo-1' version -1
ERROR: infer_trtis_server.cpp:111 TRTIS: TrtServerRequest failed to create inference request providerV2, trtis_err_str:INVALID_ARG, err_msg:unexpected shape for input '000_net' for model 'yolo-1'. Expected [1,3,608,608], got [3,608,608]
ERROR: infer_trtis_server.cpp:725 TRTIS failed to create request for model: yolo-1 version:-1
ERROR: infer_trtis_backend.cpp:498 TRT-IS run failed to create request for model: yolo-1
ERROR: infer_trtis_backend.cpp:478 TRT-IS failed to run inference on model yolo-1, nvinfer error:NVDSINFER_TRTIS_ERROR
0:00:06.886390008  1883 0x7f7a5c011950 WARN           nvinferserver gstnvinferserver.cpp:519:gst_nvinfer_server_push_buffer:<primary_gie> error: inference failed with unique-id:1
ERROR from primary_gie: inference failed with unique-id:1
Debug info: gstnvinferserver.cpp(519): gst_nvinfer_server_push_buffer (): /GstPipeline:pipeline/GstBin:primary_gie_bin/GstNvInferServer:primary_gie
I0129 13:30:30.839594 1883 model_repository_manager.cc:511] GetInferenceBackend() 'yolo-1' version -1
ERROR: infer_trtis_server.cpp:111 TRTIS: TrtServerRequest failed to create inference request providerV2, trtis_err_str:INVALID_ARG, err_msg:unexpected shape for input '000_net' for model 'yolo-1'. Expected [1,3,608,608], got [3,608,608]
ERROR: infer_trtis_server.cpp:725 TRTIS failed to create request for model: yolo-1 version:-1
ERROR: infer_trtis_backend.cpp:498 TRT-IS run failed to create request for model: yolo-1
ERROR: infer_trtis_backend.cpp:478 TRT-IS failed to run inference on model yolo-1, nvinfer error:NVDSINFER_TRTIS_ERROR
0:00:06.889326410  1883 0x7f7a5c011950 WARN           nvinferserver gstnvinferserver.cpp:519:gst_nvinfer_server_push_buffer:<primary_gie> error: inference failed with unique-id:1
ERROR from primary_gie: inference failed with unique-id:1
Debug info: gstnvinferserver.cpp(519): gst_nvinfer_server_push_buffer (): /GstPipeline:pipeline/GstBin:primary_gie_bin/GstNvInferServer:primary_gie
ERROR from qtdemux0: Internal data stream error.
Debug info: qtdemux.c(6073): gst_qtdemux_loop (): /GstPipeline:pipeline/GstBin:multi_src_bin/GstBin:src_sub_bin0/GstURIDecodeBin:src_elem/GstDecodeBin:decodebin0/GstQTDemux:qtdemux0:
streaming stopped, reason custom-error (-112)
Quitting
I0129 13:30:30.971644 1883 model_repository_manager.cc:568] AsyncLoad() 'yolo-1'
I0129 13:30:30.971675 1883 model_repository_manager.cc:622] TriggerNextAction() 'yolo-1' version 1: 2
I0129 13:30:30.971701 1883 model_repository_manager.cc:701] Unload() 'yolo-1' version 1
I0129 13:30:30.971711 1883 model_repository_manager.cc:708] unloading: yolo-1:1
I0129 13:30:30.971720 1883 plan_backend.cc:88] ~PlanBackend::Context 
I0129 13:30:30.980867 1883 dynamic_batch_scheduler.cc:443] Stopping dynamic-batch scheduler thread 0...
I0129 13:30:30.981041 1883 model_repository_manager.cc:814] OnDestroy callback() 'yolo-1' version 1
I0129 13:30:30.981089 1883 model_repository_manager.cc:816] successfully unloaded 'yolo-1' version 1
I0129 13:30:30.981109 1883 model_repository_manager.cc:622] TriggerNextAction() 'yolo-1' version 1: 0
I0129 13:30:30.981134 1883 model_repository_manager.cc:492] GetVersionStates() 'yolo-1'
I0129 13:30:30.989231 1883 server.cc:179] Waiting for in-flight inferences to complete.
I0129 13:30:30.989278 1883 model_repository_manager.cc:463] GetLiveBackendStates()
I0129 13:30:30.989297 1883 server.cc:194] Timeout 30: Found 0 live models and 0 in-flight requests
W0129 13:30:30.989354 1883 metrics.cc:276] failed to get energy consumption for GPU 0, NVML_ERROR 3
App run failed
W0129 13:30:32.991676 1883 metrics.cc:276] failed to get energy consumption for GPU 0, NVML_ERROR 3

Hi @ronakchhatbar ,
We have a YOLOv4 DeepStream sample - GitHub - NVIDIA-AI-IOT/yolov4_deepstream
TensorRT inference performance is much better than Triton's; could you use TensorRT instead of Triton?

Thanks!
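
Switching the primary GIE from Triton (nvinferserver) back to TensorRT (nvinfer) is mainly a change in the app config; a minimal sketch, assuming an nvinfer-style config file (the file name below is just a placeholder):

[primary-gie]
enable=1
#(0): nvinfer; (1): nvinferserver
plugin-type=0
batch-size=1
interval=0
gie-unique-id=1
# placeholder: an nvinfer config for the YOLOv4 engine
config-file=config_infer_primary_yoloV4.txt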

Hello @mchi ,

I was able to load the model with both TensorRT and Triton. And yes, as suggested, the throughput of TensorRT is higher than Triton's.

I have another question:

For a standalone video, is it recommended to use batch-size > 1? If so, is that the optimal approach for a single stream?
Or is batch-size > 1 meant only for multiple streams, so that we can combine frames from the various streams and pass them as a single batch?

Which batch size to use depends on your use case.
But if the batch size is too small, the GPU's compute capability can't be fully utilized. What batch size fully utilizes the GPU depends on the model and the GPU.

Or is batch-size > 1 meant only for multiple streams, so that we can combine frames from the various streams and pass them as a single batch?

YES.
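
For example, with four streams the batch sizes would be aligned across the app config, something like this (illustrative values only; max_batch_size in the Triton config.pbtxt would also have to be at least 4):

[source0]
enable=1
type=3
uri=file:///yolo/internal_test.mp4
num-sources=4

[streammux]
batch-size=4

[primary-gie]
batch-size=4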