Please provide complete information as applicable to your setup.
• Hardware Platform (Jetson / GPU) → NVIDIA GeForce GTX 1650
• DeepStream Version → 6.1
• JetPack Version (valid for Jetson only) → NA
• TensorRT Version → TensorRT 8.2.5.1
• NVIDIA GPU Driver Version (valid for GPU only) → NVIDIA driver 515
• Issue Type (questions, new requirements, bugs) → Question
• How to reproduce the issue? (This is for bugs. Include which sample app is used, the configuration file contents, the command line used, and other details for reproducing.)
• Requirement details (This is for new requirements. Include the module name, i.e. which plugin or sample application, and the function description.)
Hello, I’m running a RetinaFace primary detector, using the DeepStream Python sample applications as a reference.
My problem is that objectList in the C++ postprocessing function contains 40+ detected objects, while the number of objects reported back in the Python code is capped at 20:
std::vector<NvDsInferObjectDetectionInfo> &objectList
The video I’m using contains at least 50 faces, which is confirmed by the list size in the C++ NvDsInferParseCustomRetinaFace function. But the Python code only ever reports 20, and while DeepStream is running only 20 bounding boxes are drawn. I know the model is detecting more, but for some reason DS only shows a maximum of 20. (To help localize where the cap appears, I also sketched an extra probe after the Python code below.)
Here are the inputs that may help you identify the problem.
config_Retinaface.txt file:
[property]
gpu-id=0
# model-engine-file=../../data/models/Retina_Mobilenet_-1_3_640_640_dynamic.engine
# model-engine-file=../../data/models/RetinaFace_-1_3_1920_1080_dynamic.engine
# model-engine-file=../../data/models/RetinaFace_res50_640_640_dynamic.engine
model-engine-file=../../data/models/RetinaFace_res50_1920_1080_dynamic.engine
# model-engine-file=../../data/models/Pytorch_RetinaFace_resnet50-720-1080.engine
# model-engine-file=../../data/models/Pytorch_RetinaFace_resnet50.engine
# model-engine-file=../../data/models/RetinaFace_-1_3_512_512_dynamic.engine
# model-engine-file=../../data/models/RetinaFace_-1_3_720_720_dynamic.engine
labelfile-path=../../data/models/Retinaface_labels.txt
batch-size=4
net-scale-factor=1.0
offsets=104.0;117.0;123.0
force-implicit-batch-dim=1
model-color-format=1
# Data format to be used by inference. Integer: 0=FP32, 1=INT8, 2=FP16
network-mode=0
process-mode=1
num-detected-classes=1
interval=0
gie-unique-id=1
parse-bbox-func-name=NvDsInferParseCustomRetinaFace
custom-lib-path=../nvdsinfer_customparser/libnvds_RetinaFaceParser.so
maintain-aspect-ratio=1
[class-attrs-all]
# bbox threshold [Resnet .80]
pre-cluster-threshold=0.1
# nms threshold
post-cluster-threshold=0.2
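One thing I am unsure about: my custom parser already runs its own NMS, but I have not set cluster-mode under [property], so gst-nvinfer's default clustering presumably still runs on top of the parser output, and I also do not know the default value of topk under [class-attrs-all]. If it helps, this is the kind of variation I was planning to test (cluster-mode and topk are documented gst-nvinfer keys; the values below are just an experiment, not what I currently run):
[property]
# 4 = no clustering; the parser's own NMS would be the only filtering step
cluster-mode=4
[class-attrs-all]
pre-cluster-threshold=0.1
# set an explicit per-class cap to rule out whatever the default is
topk=100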
nvdsinfer_RetinaFaceParser.cpp file:
/*
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "nvdsinfer_custom_impl.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <iostream>
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define CLIP(a, min, max) (MAX(MIN(a, max), min))
extern "C" bool NvDsInferParseCustomRetinaFace(std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
NvDsInferNetworkInfo const &networkInfo,
NvDsInferParseDetectionParams const &detectionParams,
std::vector<NvDsInferObjectDetectionInfo> &objectList);
struct Bbox {
int x1, y1, x2, y2;
float score;
};
struct anchorBox {
float cx;
float cy;
float sx;
float sy;
};
void postprocessing(float *bbox, float *conf, float bbox_threshold, float nms_threshold, unsigned int topk, int width,
int height, std::vector<NvDsInferObjectDetectionInfo> &objectList);
void create_anchor_retinaface(std::vector<anchorBox> &anchor, int w, int h);
bool cmp(const NvDsInferObjectDetectionInfo &a, const NvDsInferObjectDetectionInfo &b);
void nms(std::vector<NvDsInferObjectDetectionInfo> &input_boxes, float NMS_THRESH);
void postprocessing(float *bbox, float *conf, float bbox_threshold, float nms_threshold, unsigned int topk, int width,
int height, std::vector<NvDsInferObjectDetectionInfo> &objectList) {
std::vector<anchorBox> anchor;
create_anchor_retinaface(anchor, width, height);
for (unsigned int i = 0; i < anchor.size(); ++i) {
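// two scores per anchor: conf[0] = background, conf[1] = face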
if (*(conf + 1) > bbox_threshold) {
anchorBox tmp = anchor[i];
anchorBox tmp1;
NvDsInferObjectDetectionInfo result;
result.classId = 0;
// decode bbox
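// 0.1 / 0.2 below are the RetinaFace box variances used during training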
tmp1.cx = tmp.cx + *(bbox + 0) * 0.1 * tmp.sx;
tmp1.cy = tmp.cy + *(bbox + 1) * 0.1 * tmp.sy;
tmp1.sx = tmp.sx * exp(*(bbox + 2) * 0.2);
tmp1.sy = tmp.sy * exp(*(bbox + 3) * 0.2);
result.left = (tmp1.cx - tmp1.sx / 2) * width;
result.top = (tmp1.cy - tmp1.sy / 2) * height;
result.width = (tmp1.cx + tmp1.sx / 2) * width - result.left;
result.height = (tmp1.cy + tmp1.sy / 2) * height - result.top;
// Clip object box coordinates to network resolution
result.left = CLIP(result.left, 0, width - 1);
result.top = CLIP(result.top, 0, height - 1);
result.width = CLIP(result.width, 0, width - 1);
result.height = CLIP(result.height, 0, height - 1);
result.detectionConfidence = *(conf + 1);
objectList.push_back(result);
}
bbox += 4;
conf += 2;
}
std::sort(objectList.begin(), objectList.end(), cmp);
std::cerr << "Before NMS objectList.size()" << objectList.size() << std::endl;
nms(objectList, nms_threshold);
std::cerr << "After NMS objectList.size()" << objectList.size() << std::endl;
// TODO: Setup a limiter according to recognition model batch size.
// if (objectList.size() > topk)
// objectList.resize(topk);
}
void create_anchor_retinaface(std::vector<anchorBox> &anchor, int w, int h) {
anchor.clear();
std::vector<std::vector<int>> feature_map(3), min_sizes(3);
float steps[] = {8, 16, 32};
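// one anchor grid per FPN level; grid size is ceil(h/stride) x ceil(w/stride)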
for (unsigned int i = 0; i < feature_map.size(); ++i) {
feature_map[i].push_back(ceil(h / steps[i]));
feature_map[i].push_back(ceil(w / steps[i]));
}
//std::vector<int> minsize1 = {10, 20};
std::vector<int> minsize1 = {16, 32};
min_sizes[0] = minsize1;
//std::vector<int> minsize2 = {32, 64};
std::vector<int> minsize2 = {64, 128};
min_sizes[1] = minsize2;
//std::vector<int> minsize3 = {128, 256};
std::vector<int> minsize3 = {256, 512};
min_sizes[2] = minsize3;
for (unsigned int k = 0; k < feature_map.size(); ++k) {
std::vector<int> min_size = min_sizes[k];
for (int i = 0; i < feature_map[k][0]; ++i) {
for (int j = 0; j < feature_map[k][1]; ++j) {
for (unsigned int l = 0; l < min_size.size(); ++l) {
float s_kx = min_size[l] * 1.0 / w;
float s_ky = min_size[l] * 1.0 / h;
float cx = (j + 0.5) * steps[k] / w;
float cy = (i + 0.5) * steps[k] / h;
anchorBox axil = {cx, cy, s_kx, s_ky};
anchor.push_back(axil);
}
}
}
}
}
bool cmp(const NvDsInferObjectDetectionInfo &a, const NvDsInferObjectDetectionInfo &b) {
return a.detectionConfidence > b.detectionConfidence;
}
void nms(std::vector<NvDsInferObjectDetectionInfo> &input_boxes, float NMS_THRESH) {
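// Greedy IoU NMS; boxes are assumed sorted by confidence (done in postprocessing above)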
std::vector<float> vArea(input_boxes.size());
for (int i = 0; i < int(input_boxes.size()); ++i) {
vArea[i] = (input_boxes.at(i).width + 1) * (input_boxes.at(i).height + 1);
}
for (int i = 0; i < int(input_boxes.size()); ++i) {
for (int j = i + 1; j < int(input_boxes.size());) {
float xx1 = std::max(input_boxes[i].left, input_boxes[j].left);
float yy1 = std::max(input_boxes[i].top, input_boxes[j].top);
float xx2 =
std::min(input_boxes[i].left + input_boxes[i].width, input_boxes[j].left + input_boxes[j].width);
float yy2 =
std::min(input_boxes[i].top + input_boxes[i].height, input_boxes[j].top + input_boxes[j].height);
float w = std::max(float(0), xx2 - xx1 + 1);
float h = std::max(float(0), yy2 - yy1 + 1);
float inter = w * h;
float ovr = inter / (vArea[i] + vArea[j] - inter);
if (ovr >= NMS_THRESH) {
input_boxes.erase(input_boxes.begin() + j);
vArea.erase(vArea.begin() + j);
} else {
j++;
}
}
}
}
bool NvDsInferParseCustomRetinaFace(std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
NvDsInferNetworkInfo const &networkInfo,
NvDsInferParseDetectionParams const &detectionParams,
std::vector<NvDsInferObjectDetectionInfo> &objectList) {
// Get output indexes
static int bboxLayerIndex = -1;
static int confLayerIndex = -1;
for (unsigned int i = 0; i < outputLayersInfo.size(); i++) {
if (strcmp(outputLayersInfo[i].layerName, "bbox") == 0) {
bboxLayerIndex = i;
} else if (strcmp(outputLayersInfo[i].layerName, "conf") == 0) {
confLayerIndex = i;
}
}
if ((bboxLayerIndex == -1) || (confLayerIndex == -1)) {
std::cerr << "Could not find output layer buffer while parsing" << std::endl;
return false;
}
// Host memory for "decode"
float *bbox = (float *)outputLayersInfo[bboxLayerIndex].buffer;
float *conf = (float *)outputLayersInfo[confLayerIndex].buffer;
// Get thresholds and topk value
const float bbox_threshold = detectionParams.perClassPreclusterThreshold[0];
const float nms_threshold = detectionParams.perClassPostclusterThreshold[0];
const unsigned int rec_batch_size = 50; // TODO: Get rec_batch_size from configurations, and use it as limiter
// Do post processing
postprocessing(bbox, conf, bbox_threshold, nms_threshold, rec_batch_size, networkInfo.width, networkInfo.height, objectList);
std::cerr << "After postprocessing objectList.size()" << objectList.size() << std::endl;
return true;
}
/* Check that the custom function has been defined correctly */
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomRetinaFace);
tiler_sink_pad_buffer_probe, the function that prints the number of objects:
def tiler_sink_pad_buffer_probe(pad, info, u_data):
frame_number = 0
num_rects = 0
gst_buffer = info.get_buffer()
if not gst_buffer:
print("Unable to get GstBuffer ")
return Gst.PadProbeReturn.OK
# Retrieve batch metadata from the gst_buffer
# Note that pyds.gst_buffer_get_nvds_batch_meta() expects the
# C address of gst_buffer as input, which is obtained with hash(gst_buffer)
batch_meta = pyds.gst_buffer_get_nvds_batch_meta(hash(gst_buffer))
l_frame = batch_meta.frame_meta_list
while l_frame is not None:
try:
# Note that l_frame.data needs a cast to pyds.NvDsFrameMeta
# The casting is done by pyds.NvDsFrameMeta.cast()
# The casting also keeps ownership of the underlying memory
# in the C code, so the Python garbage collector will leave
# it alone.
frame_meta = pyds.NvDsFrameMeta.cast(l_frame.data)
except StopIteration:
break
frame_number = frame_meta.frame_num
l_obj = frame_meta.obj_meta_list
num_rects = frame_meta.num_obj_meta
is_first_obj = True
save_image = False
obj_counter = {
PGIE_CLASS_ID_FACE: 0
}
while l_obj is not None:
try:
# Casting l_obj.data to pyds.NvDsObjectMeta
obj_meta = pyds.NvDsObjectMeta.cast(l_obj.data)
# Define an analyze_meta function to manipulate metadata
except StopIteration:
break
obj_counter[obj_meta.class_id] += 1
faces_count["stream_{}".format(frame_meta.pad_index)] += 1
try:
l_obj = l_obj.next
except StopIteration:
break
print("Frame Number=", frame_number, "Number of Objects=", num_rects, "Face_count=",
obj_counter[PGIE_CLASS_ID_FACE])
# update frame rate through this probe
stream_index = "stream{0}".format(frame_meta.pad_index)
global perf_data
perf_data.update_fps(stream_index)
saved_count["stream_{}".format(frame_meta.pad_index)] += 1
try:
l_frame = l_frame.next
except StopIteration:
break
return Gst.PadProbeReturn.OK
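To narrow down where the 20-object cap is introduced, here is a minimal extra probe I can attach directly on the nvinfer element's src pad, i.e. upstream of the tiler, and compare its counts with the tiler probe above. It reuses the same pyds calls as the probe above; pgie is assumed to be my nvinfer element instance from pipeline setup:
def pgie_src_pad_buffer_probe(pad, info, u_data):
    # Count objects right after nvinfer, before the tiler/OSD touch the metadata
    gst_buffer = info.get_buffer()
    if not gst_buffer:
        return Gst.PadProbeReturn.OK
    batch_meta = pyds.gst_buffer_get_nvds_batch_meta(hash(gst_buffer))
    l_frame = batch_meta.frame_meta_list
    while l_frame is not None:
        try:
            frame_meta = pyds.NvDsFrameMeta.cast(l_frame.data)
        except StopIteration:
            break
        print("pgie src: Frame Number=", frame_meta.frame_num,
              "num_obj_meta=", frame_meta.num_obj_meta)
        try:
            l_frame = l_frame.next
        except StopIteration:
            break
    return Gst.PadProbeReturn.OK

# Attach the probe; pgie is the nvinfer element created during pipeline setup
pgie.get_static_pad("src").add_probe(Gst.PadProbeType.BUFFER, pgie_src_pad_buffer_probe, 0)
If this probe already prints 20 per frame, the cap is introduced inside (or before) nvinfer; if it prints 40+, something between nvinfer and the tiler is dropping objects.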
Sample Output from DS run:
**PERF: {'stream0': 7.2}
Before NMS objectList.size() 82
After NMS objectList.size() 41
After postprocessing objectList.size() 41
Frame Number= 4 Number of Objects= 20 Face_count= 20
Before NMS objectList.size() 72
After NMS objectList.size() 35
After postprocessing objectList.size() 35
Frame Number= 5 Number of Objects= 20 Face_count= 20
Before NMS objectList.size() 81
After NMS objectList.size() 38
After postprocessing objectList.size() 38
Frame Number= 6 Number of Objects= 20 Face_count= 20
Before NMS objectList.size() 78
After NMS objectList.size() 40
After postprocessing objectList.size() 40
Frame Number= 7 Number of Objects= 20 Face_count= 20
Before NMS objectList.size() 83
After NMS objectList.size() 44
After postprocessing objectList.size() 44
Frame Number= 8 Number of Objects= 20 Face_count= 20
Before NMS objectList.size() 77
After NMS objectList.size() 40
After postprocessing objectList.size() 40
Frame Number= 9 Number of Objects= 20 Face_count= 20
Before NMS objectList.size() 91
After NMS objectList.size() 41
After postprocessing objectList.size() 41
Frame Number= 10 Number of Objects= 20 Face_count= 20
If you need any more input from my side, please let me know. Thank you.