This isn’t an issue with nvosd; your nvinfer post-processing and configuration files have several errors. Please try the following code and configuration file.
# SCRFD Face Detection Model Configuration for DeepStream nvinfer
[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
# ONNX model path
onnx-file=../scrfd.onnx
model-engine-file=../scrfd.onnx_b1_gpu0_fp16.engine
# Input dimensions
batch-size=1
network-mode=2
# Supported network modes:
# 0 - FP32
# 1 - INT8
# 2 - FP16
# Network input shape
infer-dims=3;640;640
maintain-aspect-ratio=0
network-type=3
# Custom instance mask parser (for face detection with keypoints)
parse-bbox-instance-mask-func-name=NvDsInferParseCustomSCRFD
custom-lib-path=libnvdsinfer_custom_impl_scrfd.so
# Number of classes (face detection has 1 class)
num-detected-classes=1
output-instance-mask=1
interval=0
gie-unique-id=1
process-mode=1
# 1 - Primary GIE, 2 - Secondary GIE
# Clustering and grouping
cluster-mode=4
# 0 - OpenCV groupRectangles
# 1 - DBSCAN
# 2 - NMS (non-maximum suppression)
# 3 - DBSCAN + NMS hybrid
# 4 - No clustering
# Network input configuration
# Model expects RGB input
model-color-format=0
# 0 - RGB
# 1 - BGR
# 2 - GRAY
# Performance tuning
# Increase for better throughput on high-resolution streams
network-input-order=0
# 0 - NCHW
# 1 - NHWC (TensorRT prefers NCHW for conv layers)
[class-attrs-all]
pre-cluster-threshold=0.25
/**
* DeepStream NvInfer Custom Parser for SCRFD Face Detection Model
*
* Model Outputs:
* - score_8, score_16, score_32: [batch, num_anchors, 1] - face classification scores (sigmoid)
* - bbox_8, bbox_16, bbox_32: [batch, num_anchors, 4] - distance predictions (left, top, right,
* bottom)
* - kps_8, kps_16, kps_32: [batch, num_anchors, 10] - keypoint offsets (dx, dy) * 5
*
* Strides: 8, 16, 32
* Anchors per location: 2
*/
#include <algorithm>
#include <cmath>
#include <cstring>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include "nvdsinfer_custom_impl.h"
// C-style min/max/clamp helpers (arguments parenthesized to stay
// expansion-safe when passed expressions).
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define CLIP(a, min, max) (MAX(MIN(a, max), min))
// SCRFD places 2 anchors at every feature-map location, at every stride.
static const int kNumAnchorsPerLocation = 2;
// Each face carries 5 landmarks, predicted as (x, y) pairs.
static const int kNumKeypoints = 5;
static const int kNumKeypointsCoords = kNumKeypoints * 2; // 10 coordinates
// Tunable thresholds plus the network input geometry used during decoding.
struct FaceDetectionParams {
  float scoreThreshold{0.5f};  // minimum face confidence to keep a candidate
  float nmsThreshold{0.45f};   // IoU threshold used by greedy NMS
  int inputWidth{640};         // network input width in pixels
  int inputHeight{640};        // network input height in pixels
};
struct AnchorGenerator {
int stride;
int width;
int height;
void generateAnchors(std::vector<std::pair<float, float>>& anchors) const {
anchors.clear();
int fmapW = width / stride;
int fmapH = height / stride;
// Generate anchor centers: matches official np.mgrid[:height, :width][::-1]
// This creates grid points at integer positions: [0, 1, 2, ..., H-1]
for (int i = 0; i < fmapH; i++) {
for (int j = 0; j < fmapW; j++) {
for (int k = 0; k < kNumAnchorsPerLocation; k++) {
// Official: anchor_centers = (anchor_centers * stride)
// No +0.5 offset - use corner-based grid
float cx = j * stride;
float cy = i * stride;
anchors.push_back({cx, cy});
}
}
}
}
};
// Intersection-over-union of two axis-aligned boxes stored as
// (left, top, width, height). Returns 0 when the union area is empty.
static float calculateIoU(const NvDsInferInstanceMaskInfo& a, const NvDsInferInstanceMaskInfo& b) {
  const float ax2 = a.left + a.width;
  const float ay2 = a.top + a.height;
  const float bx2 = b.left + b.width;
  const float by2 = b.top + b.height;
  const float iw = std::max(0.0f, std::min(ax2, bx2) - std::max(a.left, b.left));
  const float ih = std::max(0.0f, std::min(ay2, by2) - std::max(a.top, b.top));
  const float inter = iw * ih;
  const float uni = a.width * a.height + b.width * b.height - inter;
  return (uni > 0.0f) ? inter / uni : 0.0f;
}
static void nmsSort(std::vector<NvDsInferInstanceMaskInfo>& detections, float nmsThreshold) {
std::stable_sort(detections.begin(), detections.end(),
[](const NvDsInferInstanceMaskInfo& a, const NvDsInferInstanceMaskInfo& b) {
return a.detectionConfidence > b.detectionConfidence;
});
std::vector<bool> suppressed(detections.size(), false);
for (size_t i = 0; i < detections.size(); i++) {
if (suppressed[i]) continue;
for (size_t j = i + 1; j < detections.size(); j++) {
if (suppressed[j]) continue;
float iou = calculateIoU(detections[i], detections[j]);
// printf("nmsThreshold %f, IoU between box %zu and box %zu: %f\n", nmsThreshold, i, j, iou);
if (iou > nmsThreshold) {
suppressed[j] = true;
}
}
}
std::vector<NvDsInferInstanceMaskInfo> filtered;
for (size_t i = 0; i < detections.size(); i++) {
if (!suppressed[i]) {
filtered.push_back(detections[i]);
} else {
// Free allocated mask memory for suppressed detections
if (detections[i].mask != nullptr) {
delete[] detections[i].mask;
}
}
}
detections = std::move(filtered);
}
// Converts the raw SCRFD tensors of one stride level into detections.
//
// scores:    per-anchor face confidence (sigmoid already applied in the model).
// bboxes:    per-anchor [left, top, right, bottom] edge distances, in stride units.
// keypoints: per-anchor 5 * (dx, dy) landmark offsets, in stride units.
// anchors:   (cx, cy) anchor centers produced by AnchorGenerator.
//
// Each emitted detection owns a heap-allocated 15-float buffer in `mask`,
// laid out as [x, y, conf] * 5; nmsSort frees it for suppressed entries.
static void decodeBBoxes(const float* scores, const float* bboxes, const float* keypoints,
                         const std::vector<std::pair<float, float>>& anchors, int stride,
                         const FaceDetectionParams& params,
                         std::vector<NvDsInferInstanceMaskInfo>& results) {
  const float maxX = static_cast<float>(params.inputWidth);
  const float maxY = static_cast<float>(params.inputHeight);
  auto clampTo = [](float v, float hi) { return std::max(0.0f, std::min(v, hi)); };
  for (size_t idx = 0; idx < anchors.size(); ++idx) {
    const float confidence = scores[idx];
    if (confidence < params.scoreThreshold) continue;
    const float cx = anchors[idx].first;
    const float cy = anchors[idx].second;
    // distance2bbox: the network predicts distances from the anchor center to
    // each box edge; scale by stride, then clip to the network input bounds.
    const float* d = bboxes + idx * 4;
    const float x1 = clampTo(cx - d[0] * stride, maxX);
    const float y1 = clampTo(cy - d[1] * stride, maxY);
    const float x2 = clampTo(cx + d[2] * stride, maxX);
    const float y2 = clampTo(cy + d[3] * stride, maxY);

    NvDsInferInstanceMaskInfo det;
    det.classId = 0;  // single "face" class
    det.left = x1;
    det.top = y1;
    det.width = clampTo(x2 - x1, maxX);
    det.height = clampTo(y2 - y1, maxY);
    det.detectionConfidence = confidence;
    // Degenerate boxes are dropped before any mask allocation happens.
    if (det.width < 1.0f || det.height < 1.0f) continue;

    // Facial landmarks ride along in the instance-mask slot: 5 * (x, y, conf).
    const size_t maskFloats = kNumKeypoints * 3;
    det.mask = new float[maskFloats];
    det.mask_size = maskFloats * sizeof(float);
    det.mask_width = kNumKeypoints;  // 5 keypoints
    det.mask_height = 3;             // x, y, confidence per keypoint
    const float* kp = keypoints + idx * kNumKeypointsCoords;
    for (int k = 0; k < kNumKeypoints; ++k) {
      // distance2kps: landmark = anchor center + predicted offset * stride.
      det.mask[k * 3 + 0] = clampTo(cx + kp[k * 2] * stride, maxX);
      det.mask[k * 3 + 1] = clampTo(cy + kp[k * 2 + 1] * stride, maxY);
      det.mask[k * 3 + 2] = 1.0f;  // SCRFD does not predict landmark confidence
    }
    results.push_back(det);
  }
}
/* Custom parser for SCRFD face detection model.
* Uses NvDsInferInstanceMaskInfo to store facial keypoints in the mask field.
*/
extern "C" bool NvDsInferParseCustomSCRFD(std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferInstanceMaskInfo>& objectList) {
static FaceDetectionParams params;
params.scoreThreshold = detectionParams.perClassThreshold[0];
params.nmsThreshold = detectionParams.perClassThreshold[0]; // Can be customized
params.inputWidth = networkInfo.width;
params.inputHeight = networkInfo.height;
// Expected 9 outputs: score_8, score_16, score_32, bbox_8, bbox_16, bbox_32, kps_8, kps_16,
// kps_32
if (outputLayersInfo.size() != 9) {
std::cerr << "ERROR: Expected 9 output layers for SCRFD model, got " << outputLayersInfo.size()
<< std::endl;
return false;
}
// Map output layers by name
const float* score_8 = nullptr;
const float* score_16 = nullptr;
const float* score_32 = nullptr;
const float* bbox_8 = nullptr;
const float* bbox_16 = nullptr;
const float* bbox_32 = nullptr;
const float* kps_8 = nullptr;
const float* kps_16 = nullptr;
const float* kps_32 = nullptr;
for (const auto& layer : outputLayersInfo) {
const char* layerName = layer.layerName;
if (strcmp(layerName, "score_8") == 0) {
score_8 = static_cast<const float*>(layer.buffer);
} else if (strcmp(layerName, "score_16") == 0) {
score_16 = static_cast<const float*>(layer.buffer);
} else if (strcmp(layerName, "score_32") == 0) {
score_32 = static_cast<const float*>(layer.buffer);
} else if (strcmp(layerName, "bbox_8") == 0) {
bbox_8 = static_cast<const float*>(layer.buffer);
} else if (strcmp(layerName, "bbox_16") == 0) {
bbox_16 = static_cast<const float*>(layer.buffer);
} else if (strcmp(layerName, "bbox_32") == 0) {
bbox_32 = static_cast<const float*>(layer.buffer);
} else if (strcmp(layerName, "kps_8") == 0) {
kps_8 = static_cast<const float*>(layer.buffer);
} else if (strcmp(layerName, "kps_16") == 0) {
kps_16 = static_cast<const float*>(layer.buffer);
} else if (strcmp(layerName, "kps_32") == 0) {
kps_32 = static_cast<const float*>(layer.buffer);
}
}
// Verify all outputs are found
if (!score_8 || !score_16 || !score_32 || !bbox_8 || !bbox_16 || !bbox_32 || !kps_8 || !kps_16 ||
!kps_32) {
std::cerr << "ERROR: Failed to find all required output layers" << std::endl;
return false;
}
std::vector<NvDsInferInstanceMaskInfo> allDetections;
// Process stride 8
{
AnchorGenerator anchorGen;
anchorGen.stride = 8;
anchorGen.width = params.inputWidth;
anchorGen.height = params.inputHeight;
std::vector<std::pair<float, float>> anchors;
anchorGen.generateAnchors(anchors);
decodeBBoxes(score_8, bbox_8, kps_8, anchors, 8, params, allDetections);
}
// Process stride 16
{
AnchorGenerator anchorGen;
anchorGen.stride = 16;
anchorGen.width = params.inputWidth;
anchorGen.height = params.inputHeight;
std::vector<std::pair<float, float>> anchors;
anchorGen.generateAnchors(anchors);
decodeBBoxes(score_16, bbox_16, kps_16, anchors, 16, params, allDetections);
}
// Process stride 32
{
AnchorGenerator anchorGen;
anchorGen.stride = 32;
anchorGen.width = params.inputWidth;
anchorGen.height = params.inputHeight;
std::vector<std::pair<float, float>> anchors;
anchorGen.generateAnchors(anchors);
decodeBBoxes(score_32, bbox_32, kps_32, anchors, 32, params, allDetections);
}
// Apply NMS
nmsSort(allDetections, params.nmsThreshold);
// Move results to output list
objectList.clear();
objectList = std::move(allDetections);
return true;
}
/* Check that the custom function has been defined correctly */
CHECK_CUSTOM_INSTANCE_MASK_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomSCRFD);
Then execute this command line. I have tried it and it works fine.
gst-launch-1.0 nvurisrcbin uri=file:///opt/nvidia/deepstream/deepstream/samples/streams/sample_ride_bike.mov ! m.sink_0 nvstreammux name=m width=1920 height=1080 batch-size=1 ! nvinfer config-file-path=config_infer_primary_scrfd.txt ! nvdsosd ! nvvideoencfilesinkbin output-file=out.mp4
This post-processing library stores landmarks in NvDsInferInstanceMaskInfo. To display these landmarks, a probe function needs to be added to the nvinfer source pad, similar to this link:
g_free(obj_meta->mask_params.data);
obj_meta->mask_params.width = 0;
obj_meta->mask_params.height = 0;
obj_meta->mask_params.size = 0;
}
static GstPadProbeReturn
nvosd_sink_pad_buffer_probe(GstPad *pad, GstPadProbeInfo *info, gpointer user_data)
{
GstBuffer *buf = (GstBuffer *) info->data;
NvDsBatchMeta *batch_meta = gst_buffer_get_nvds_batch_meta(buf);
NvDsMetaList *l_frame = NULL;
for (l_frame = batch_meta->frame_meta_list; l_frame != NULL; l_frame = l_frame->next) {
NvDsFrameMeta *frame_meta = (NvDsFrameMeta *) (l_frame->data);
NvDsMetaList *l_obj = NULL;
for (l_obj = frame_meta->obj_meta_list; l_obj != NULL; l_obj = l_obj->next) {
NvDsObjectMeta *obj_meta = (NvDsObjectMeta *) (l_obj->data);