My goal is to run a detection pipeline on a batch of 6 camera feeds using dynamic batch and Triton. My engine is built with a dynamic batch dimension supporting a maximum batch size of 12 concurrent inferences:
trtexec --onnx=end2end_dynamic.onnx --saveEngine=end2end_dynamic-b12.engine --fp16 --minShapes='input':1x3x1088x1920 --optShapes='input':12x3x1088x1920 --maxShapes='input':12x3x1088x1920
Everything seems to work as expected, however the labels do not appear beside bounding boxes when I set more than one source camera feed. The bounding boxes still appear but the labels don’t.
I will try to give as much context as possible. My pipeline is set via a docker compose file using the nvcr.io/nvidia/deepstream:7.1-gc-triton-devel image:
services:
deepstream:
image: nvcr.io/nvidia/deepstream:7.1-gc-triton-devel
runtime: nvidia
network_mode: host
privileged: true
volumes:
- "./deep-stream-7.1-tests:/deep-stream-7.1-tests"
- "./parser:/project"
- "/tmp/.X11-unix:/tmp/.X11-unix"
environment:
- DISPLAY=:1
- GST_DEBUG=nvinferserver:5
tty: true
stdin_open: true
command: ["deepstream-app", "-c", "/deep-stream-7.1-tests/raai_triton_test.txt"]
This is the DeepStream pipeline configuration file with the six source camera feeds, a sink to the display and another to a file:
[application]
enable-perf-measurement=1
perf-measurement-interval-sec=5
[source0]
enable=1
type=4
uri=OMITTED
num-sources=1
gpu-id=0
cudadec-memtype=0
[source1]
enable=1
type=4
uri=OMITTED
num-sources=1
gpu-id=0
cudadec-memtype=0
[source2]
enable=1
type=4
uri=OMITTED
num-sources=1
gpu-id=0
cudadec-memtype=0
[source3]
enable=1
type=4
uri=OMITTED
num-sources=1
gpu-id=0
cudadec-memtype=0
[source4]
enable=1
type=4
uri=OMITTED
num-sources=1
gpu-id=0
cudadec-memtype=0
[source5]
enable=1
type=4
uri=OMITTED
num-sources=1
gpu-id=0
cudadec-memtype=0
[streammux]
gpu-id=0
live-source=1
batch-size=6
batched-push-timeout=40000
width=1920
height=1080
enable-padding=0
nvbuf-memory-type=0
[primary-gie]
enable=1
plugin-type=1
gpu-id=0
config-file=/deep-stream-7.1-tests/config_infer_triton_raai.txt
interval=0
gie-unique-id=1
nvbuf-memory-type=0
[osd]
enable=1
gpu-id=0
border-width=2
text-size=15
text-color=1;1;1;1;
text-bg-color=0.3;0.3;0.3;1
font=Serif
show-clock=1
clock-x-offset=0
clock-y-offset=0
clock-text-size=12
clock-color=1;0;0;0
nvbuf-memory-type=0
[tiled-display]
enable=1
rows=2
columns=3
width=1920
height=1080
gpu-id=0
nvbuf-memory-type=0
[sink0]
enable=1
type=2
sync=1
[sink1]
enable=1
type=3
sync=1
#source-id=0
container=2
output-file=/deep-stream-7.1-tests/results/output.mp4
This is the triton configuration .pbtxt file setting the max batch size:
name: "raai"
platform: "tensorrt_plan"
max_batch_size: 12
#default_model_filename: "end2end_tensorrt.onnx_b1_gpu0_fp32.engine"
default_model_filename: "end2end_dynamic-b12.engine"
# Enable dynamic batching
dynamic_batching {
max_queue_delay_microseconds: 5000
preferred_batch_size: [1, 6, 12]
}
input [
{
name: "input"
data_type: TYPE_FP32
format: FORMAT_NCHW
dims: [ 3, 1088, 1920 ]
}
]
output [
{
name: "dets"
data_type: TYPE_FP32
dims: [ 300, 5 ]
},
{
name: "labels"
data_type: TYPE_INT32
dims: [ 300 ]
label_filename: "labels.txt"
}
]
instance_group [
{
kind: KIND_GPU
count: 1
gpus: 0
}
]
I also run a custom model parser:
#include "nvdsinfer_custom_impl.h"

#include <cassert>
#include <cstring>
#include <iostream>
#include <vector>
/**
* Function expected by DeepStream for decoding the MMYOLO output.
*
* C-linkage [extern "C"] is used to prevent name-mangling. This function must return true after
* adding all bounding boxes to the objectList vector.
*
* @param [outputLayersInfo] Vector of NvDsInferLayerInfo objects with information about the output layer.
* @param [networkInfo] NvDsInferNetworkInfo object with information about the MMYOLO network.
* @param [detectionParams] NvDsInferParseDetectionParams with information about some config params.
* @param [objectList] Vector of NvDsInferParseObjectInfo objects to which bounding box information must be stored.
*
* @return true on success, false otherwise.
*/
extern "C" bool NvDsInferParseCustomMMYOLO(
std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
NvDsInferNetworkInfo const &networkInfo,
NvDsInferParseDetectionParams const &detectionParams,
std::vector<NvDsInferParseObjectInfo> &objectList);
/**
 * Clamp @p val to the inclusive range [lo, hi].
 *
 * The value is taken by copy: the original non-const reference implied an
 * in-place mutation that never happened (the result is only returned), and
 * it also rejected rvalue arguments. Call sites are unaffected.
 *
 * @param [val] Value to clamp.
 * @param [lo]  Lower bound.
 * @param [hi]  Upper bound.
 * @return val limited to [lo, hi].
 */
static inline float clamp(float val, float lo, float hi)
{
    return val > lo ? (val < hi ? val : hi) : lo;
}
static std::vector<NvDsInferParseObjectInfo> decodeMMYoloTensor(
const int num_dets,
const float *bboxes_and_scores,
const int *labels,
const float &conf_thres,
const unsigned int &img_w,
const unsigned int &img_h)
{
std::vector<NvDsInferParseObjectInfo> bboxInfo;
for (int i = 0; i < num_dets; i++)
{
float score = bboxes_and_scores[i * 5 + 4];
if (score < conf_thres)
continue;
float x0 = bboxes_and_scores[i * 5];
float y0 = bboxes_and_scores[i * 5 + 1];
float x1 = bboxes_and_scores[i * 5 + 2];
float y1 = bboxes_and_scores[i * 5 + 3];
x0 = clamp(x0, 0.f, img_w);
y0 = clamp(y0, 0.f, img_h);
x1 = clamp(x1, 0.f, img_w);
y1 = clamp(y1, 0.f, img_h);
NvDsInferParseObjectInfo obj;
obj.left = x0;
obj.top = y0;
obj.width = x1 - x0;
obj.height = y1 - y0;
obj.detectionConfidence = score;
obj.classId = labels[i];
bboxInfo.push_back(obj);
// Print bounding box, label, and detection confidence for debugging.
std::cout << "Detection " << i << ": Label = " << obj.classId
<< ", Confidence = " << obj.detectionConfidence
<< ", BBox = (" << obj.left << ", " << obj.top
<< ", " << obj.width << ", " << obj.height << ")" << std::endl;
}
return bboxInfo;
}
/* C-linkage to prevent name-mangling */
extern "C" bool NvDsInferParseCustomMMYOLO(
std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
NvDsInferNetworkInfo const &networkInfo,
NvDsInferParseDetectionParams const &detectionParams,
std::vector<NvDsInferParseObjectInfo> &objectList)
{
// Log the network input size (for one sample) from networkInfo.
std::cerr << "Network sample input size: width = " << networkInfo.width
<< ", height = " << networkInfo.height << std::endl;
// Log dimensions of each output layer.
for (size_t i = 0; i < outputLayersInfo.size(); i++)
{
const NvDsInferLayerInfo &elem = outputLayersInfo[i];
std::cerr << "Output layer " << i << " dimensions:" << std::endl;
for (size_t j = 0; j < elem.inferDims.numDims; j++)
{
std::cerr << " dim[" << j << "] = " << elem.inferDims.d[j] << std::endl;
}
// If this is a 4D tensor, try to compute the batch size.
if (elem.inferDims.numDims == 4)
{
// Compute the product of the last three dimensions.
int prod = 1;
for (size_t j = 1; j < elem.inferDims.numDims; j++)
{
prod *= elem.inferDims.d[j];
}
int batch_size = elem.inferDims.numElements / prod;
std::cerr << " T Computed batch size for layer " << i << ": " << batch_size << std::endl;
}
}
if (outputLayersInfo.empty() || outputLayersInfo.size() != 2)
{
std::cerr << "ERROR: Expected 2 output layers but got " << outputLayersInfo.size() << std::endl;
return false;
}
const float conf_thres = detectionParams.perClassThreshold[0];
const int num_dets = outputLayersInfo[1].inferDims.numElements;
const float *bboxes_and_scores = (const float *)outputLayersInfo[0].buffer;
const int *labels = (const int *)outputLayersInfo[1].buffer;
// Print total number of detections in the batch.
std::cerr << "Total number of detections (from layer 1): " << num_dets << std::endl;
// Decode the output tensor of MMYOLO to the object list.
std::vector<NvDsInferParseObjectInfo> objects = decodeMMYoloTensor(
num_dets, bboxes_and_scores, labels, conf_thres, networkInfo.width, networkInfo.height);
objectList.clear();
objectList = objects;
return true;
}
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomMMYOLO);
I’m running this pipeline on a RTX 3080 Ti Mobile laptop with Ubuntu 22.04, driver version 535.183.01.