Facing issues with face detection using Deepstream SDK

Please provide complete information as applicable to your setup.

• Hardware Platform (Jetson / GPU) - GPU
• DeepStream Version - 7.1
• JetPack Version (valid for Jetson only) - NA
• TensorRT Version - 8.6.1
• NVIDIA GPU Driver Version (valid for GPU only) - 535.216.01
• Issue Type (questions, new requirements, bugs) - questions
• How to reproduce the issue ? (This is for bugs. Including which sample app is using, the configuration files content, the command line used and other details for reproducing)

Issue:

We are using the YOLOv8n face detection model in DeepStream to detect faces in a video stream. However, we have observed that the model's inference results are not available in the metadata, so the detections are not displayed with bounding boxes.

Note: When the same model is executed using Python OpenCV with the same input video stream, the bounding boxes appear as expected.

Assistance Required:

We are seeking assistance to verify the flow of our implementation and identify any potential deviations or issues. Specifically, we would like help in reviewing our custom parser to ensure that it correctly handles the metadata (frame metadata) and outputs the bounding box values in the metadata as expected.

Here are supporting details.

app.py

import sys
import math
import gi
gi.require_version('Gst', '1.0')
gi.require_version('GLib', '2.0')
from gi.repository import Gst, GLib, GObject
from data_loading import *
from custom_probe import *

def decodebin_child_added(child_proxy, Object, name, user_data):
    print(f"Decodebin child added: {name}")
    if "decodebin" in name:
        Object.connect("child-added", decodebin_child_added, user_data)

    if "source" in name:
        source_element = child_proxy.get_by_name("source")
        if source_element and source_element.find_property('drop-on-latency') is not None:
            source_element.set_property("drop-on-latency", True)

def cb_newpad(decodebin, decoder_src_pad, data):
    print("New pad callback")
    caps = decoder_src_pad.get_current_caps()
    if not caps:
        caps = decoder_src_pad.query_caps(None)

    gststruct = caps.get_structure(0)
    gstname = gststruct.get_name()
    source_bin = data

    print(f"Pad type: {gstname}")
    if "video" in gstname:
        bin_ghost_pad = source_bin.get_static_pad("src")
        if not bin_ghost_pad.set_target(decoder_src_pad):
            sys.stderr.write("Failed to link decoder src pad to source bin ghost pad\n")

def create_source_bin(index, uri):
    bin_name = f"source-bin-{index:02d}"
    nbin = Gst.Bin.new(bin_name)
    if not nbin:
        sys.stderr.write("Unable to create source bin\n")
        return None

    uri_decode_bin = Gst.ElementFactory.make("uridecodebin", "uri-decode-bin")
    if not uri_decode_bin:
        sys.stderr.write("Unable to create uridecodebin\n")
        return None

    uri_decode_bin.set_property("uri", uri)
    uri_decode_bin.connect("pad-added", cb_newpad, nbin)
    uri_decode_bin.connect("child-added", decodebin_child_added, nbin)

    Gst.Bin.add(nbin, uri_decode_bin)

    if not nbin.add_pad(Gst.GhostPad.new_no_target("src", Gst.PadDirection.SRC)):
        sys.stderr.write("Failed to add ghost pad in source bin\n")
        return None

    return nbin

def main(cfg):

    Gst.init(None)
    print("Creating DeepStream Face Detection Pipeline")

    # Create Pipeline
    pipeline = Gst.Pipeline()

    if not pipeline:
        print("Error: Unable to create pipeline")
        sys.exit(1)
    else:
        print("Pipeline created successfully")

    # Create Stream Muxer
    streammux = Gst.ElementFactory.make("nvstreammux", "Stream-muxer")
    pipeline.add(streammux)
    set_property(cfg, streammux, "streammux")

    # Create and Add Source Bin
    sources = cfg['source']
    source_bin = create_source_bin(0, list(sources.values())[0])
    pipeline.add(source_bin)

    # Link Source to Stream Muxer
    sinkpad = streammux.get_request_pad("sink_0")
    srcpad = source_bin.get_static_pad("src")

    if sinkpad is None or srcpad is None:
        print("Error: Source or Streammux pad not found!")
    else:
        print(">>> Linking Source Bin to StreamMuxer")
        srcpad.link(sinkpad)

    # Create Primary Inference (Face Detection)
    pgie = Gst.ElementFactory.make("nvinfer", "primary-inference")
    pipeline.add(pgie)
    set_property(cfg, pgie, "pgie")

    tracker = Gst.ElementFactory.make("nvtracker", "tracker")
    pipeline.add(tracker)
    set_tracker_properties(tracker, cfg['tracker']['config-file-path'])

    # Create Tiler
    tiler = Gst.ElementFactory.make("nvmultistreamtiler", "nvtiler")
    pipeline.add(tiler)
    tiler.set_property("rows", 1)
    tiler.set_property("columns", 1)
    tiler.set_property("width", 1920)
    tiler.set_property("height", 1080)

    # Create Video Converter
    nvvidconv = Gst.ElementFactory.make("nvvideoconvert", "convertor")
    pipeline.add(nvvidconv)

    # Create On-Screen Display
    nvosd = Gst.ElementFactory.make("nvdsosd", "onscreendisplay")
    nvosd.set_property("process-mode", 0)  # Default mode (draw bounding boxes)
    nvosd.set_property("display-text", 1)
    pipeline.add(nvosd)

    # Create Sink
    sink = Gst.ElementFactory.make("nveglglessink", "file-sink")
    pipeline.add(sink)
    sink.set_property("sync", 0)

    print(">>> After creating elements linking of elements is started")

    # srcpad.link(sinkpad)

    streammux.link(pgie)
    # pgie_src_pad = pgie.get_static_pad("sink")
    # if pgie_src_pad:
    #     pgie_src_pad.add_probe(Gst.PadProbeType.BUFFER, pgie_sink_pad_buffer_probe)

    pgie.link(tracker)

    tracker.link(tiler)

    tiler.link(nvvidconv)
    nvvidconv.link(nvosd)
    nvosd.link(sink)

    pgie_src_pad = pgie.get_static_pad("src")
    if pgie_src_pad:
        pgie_src_pad.add_probe(Gst.PadProbeType.BUFFER, pgie_sink_pad_buffer_probe)

    loop = GLib.MainLoop()

    # Bus Message Handling
    bus = pipeline.get_bus()
    bus.add_signal_watch()
    bus.connect("message", bus_call, loop)

    # Start Pipeline
    pipeline.set_state(Gst.State.PLAYING)

    try:
        loop.run()
    except Exception as e:
        print(f"Pipeline error: {e}")
    finally:
        pipeline.set_state(Gst.State.NULL)

def bus_call(bus, message, loop):
    t = message.type
    if t == Gst.MessageType.EOS:
        print("End-of-stream")
        loop.quit()
    elif t == Gst.MessageType.ERROR:
        err, debug = message.parse_error()
        print(f"Error: {err}")
        print(f"Debug info: {debug}")
        loop.quit()
    return True

if __name__ == '__main__':
    cfg = parse_args(cfg_path="/home/dstream/Documents/Deep_Stream_App/paths/paths.toml")
    main(cfg)

yolov8_infer_config

[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
model-color-format=0

model-engine-file=/home/dstream/Documents/Deep_Stream_App/models/yolov8n/yolov8n-1-face.engine
#int8-calib-file=calib.table
labelfile-path=/home/dstream/Documents/Deep_Stream_App/configs/labels.txt
infer-dims=3;640;640
network-input-order=0
num-detected-classes=1
interval=0
batch-size=1
network-mode=0

uff-input-blob-name=images
output-blob-names=output0
output-tensor-meta=1
gie-unique-id=1
process-mode=1
network-type=0
cluster-mode=2
maintain-aspect-ratio=1
symmetric-padding=1
#workspace-size=2000
parse-bbox-func-name=NvDsInferParseCustomYoloV8

custom-lib-path=/home/dstream/Documents/Deep_Stream_App/yolov8_parser/yolov8n_parser.so

[class-attrs-all]
pre-cluster-threshold=0.75
topk=300

YoloV8n Parser

#include <algorithm>
#include <iostream>
#include <vector>
#include "nvdsinfer_custom_impl.h"

#define CONF_THRESHOLD 0.5 // Confidence threshold
#define INPUT_WIDTH 640
#define INPUT_HEIGHT 640

extern "C" bool NvDsInferParseCustomYoloV8Face(
    std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
    NvDsInferNetworkInfo const &networkInfo,
    std::vector<NvDsInferParseObjectInfo> &objectList)
{
    if (outputLayersInfo.empty())
    {
        std::cerr << "Error: Output layers info is empty!" << std::endl;
        return false;
    }

    const NvDsInferLayerInfo &outputLayer = outputLayersInfo[0];

    if (!outputLayer.buffer)
    {
        std::cerr << "Error: Output layer buffer is null!" << std::endl;
        return false;
    }

    const float *output = reinterpret_cast<const float *>(outputLayer.buffer);

    std::cout << "output : " << output << std::endl;

    int num_attrs = outputLayer.inferDims.d[0];      // Number of attributes (5)
    int num_detections = outputLayer.inferDims.d[1]; // Number of detections (anchor points, 8400)

    if (num_attrs != 5)
    {
        std::cerr << "Error: Expected 5 attributes, but got: " << num_attrs << std::endl;
        return false;
    }

    std::cout << "Number of detections (anchor points): " << num_detections << std::endl;
    std::cout << "Number of attributes per detection: " << num_attrs << std::endl;

    // Print raw output values (first 10 detections for debugging)
    std::cout << "Raw Output Data (First 10 detections):\n";
    for (int i = 0; i < std::min(10, num_detections); i++)
    {
        float x_center = output[i + num_detections * 0];
        float y_center = output[i + num_detections * 1];
        float width = output[i + num_detections * 2];
        float height = output[i + num_detections * 3];
        float conf = output[i + num_detections * 4];

        std::cout << "Detection " << i << " - X: " << x_center
                  << ", Y: " << y_center << ", W: " << width
                  << ", H: " << height << ", Conf: " << conf << "\n";
    }

    for (int i = 0; i < num_attrs; i++)
    {
        float x_center = output[i + num_detections * 0];
        float y_center = output[i + num_detections * 1];
        float width = output[i + num_detections * 2];
        float height = output[i + num_detections * 3];
        float conf = output[i + num_detections * 4];

        if (conf < CONF_THRESHOLD)
            continue; // Skip low-confidence detections

        // Convert (x_center, y_center, width, height) → (x1, y1, x2, y2)
        float x1 = x_center - width / 2;
        float y1 = y_center - height / 2;
        float x2 = x_center + width / 2;
        float y2 = y_center + height / 2;

        // Normalize to the input image size
        x1 *= networkInfo.width;
        y1 *= networkInfo.height;
        x2 *= networkInfo.width;
        y2 *= networkInfo.height;

        // Ensure bounding box is within valid range
        x1 = std::max(0.0f, std::min(x1, static_cast<float>(networkInfo.width)));
        y1 = std::max(0.0f, std::min(y1, static_cast<float>(networkInfo.height)));
        x2 = std::max(0.0f, std::min(x2, static_cast<float>(networkInfo.width)));
        y2 = std::max(0.0f, std::min(y2, static_cast<float>(networkInfo.height)));

        // Store parsed object
        NvDsInferParseObjectInfo obj;
        obj.left = x1;
        obj.top = y1;
        obj.width = x2 - x1;
        obj.height = y2 - y1;
        obj.detectionConfidence = conf;
        obj.classId = 0; // Single-class model (faces)

        objectList.push_back(obj);
    }

std::cout << "Parsed Objects: " << objectList.size() << " detected faces\n";

return true;

}

extern "C" bool NvDsInferParseCustomYoloV8Face_cuda(
    std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
    NvDsInferNetworkInfo const &networkInfo,
    std::vector<NvDsInferParseObjectInfo> &objectList)
{
    return NvDsInferParseCustomYoloV8Face(outputLayersInfo,
                                          networkInfo, objectList);
}

The function prototype of the custom parser does not match. You can refer to this example file

/opt/nvidia/deepstream/deepstream/sources/libs/nvdsinfer_customparser/nvdsinfer_custombboxparser.cpp
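For reference, the expected bbox parse function prototype takes a NvDsInferParseDetectionParams argument, which the parser above is missing:

extern "C" bool NvDsInferParseCustomYoloV8Face(
    std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
    NvDsInferNetworkInfo const &networkInfo,
    NvDsInferParseDetectionParams const &detectionParams,
    std::vector<NvDsInferParseObjectInfo> &objectList);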

Similarly, you can refer to this community user’s sample

In addition, please ensure that the parameters in the configuration file match the *.engine file; otherwise it will not work properly.

Alternatively, you can specify the path to the ONNX file and let the DeepStream pipeline generate the *.engine file.

Hi @junshengy, thanks for your response. Here is my feedback on your recommendations.

  1. We do not need to identify landmarks; this is face detection only. We therefore removed the landmark-related code from the parser and tried again, and got the error shared below.
  2. Pointing the config at the ONNX format of the model resulted in the same error.

Please share your thoughts on debugging the issue. Thanks.
dstream@tokkio:~/Documents/Deep_Stream_App$ python3 app.py
Creating DeepStream Face Detection Pipeline
Pipeline created successfully
streammux set_property gpu_id 0 

streammux set_property batch-size 1 

streammux set_property width 1920 

streammux set_property height 1080 

/home/dstream/Documents/Deep_Stream_App/app.py:88: DeprecationWarning: Gst.Element.get_request_pad is deprecated
  sinkpad = streammux.get_request_pad("sink_0")
>>> Linking Source Bin to StreamMuxer
pgie set_property config-file-path /home/dstream/Documents/Deep_Stream_App/configs/yolov8_infer_config.txt 

>>> After creating elements linking of elements is started
libEGL warning: DRI2: failed to authenticate
gstnvtracker: Loading low-level lib at /home/dstream/Desktop/DEEP_STREAM/libraries/libnvds_nvmultiobjecttracker.so
[NvMultiObjectTracker] Initialized
0:00:00.495912054 110757 0x560d1e2f9070 INFO                 nvinfer gstnvinfer.cpp:682:gst_nvinfer_logger:<primary-inference> NvDsInferContext[UID 1]: Info from NvDsInferContextImpl::buildModel() <nvdsinfer_context_impl.cpp:2109> [UID = 1]: Trying to create engine from model files
WARNING: [TRT]: onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
0:02:08.072013812 110757 0x560d1e2f9070 INFO                 nvinfer gstnvinfer.cpp:682:gst_nvinfer_logger:<primary-inference> NvDsInferContext[UID 1]: Info from NvDsInferContextImpl::buildModel() <nvdsinfer_context_impl.cpp:2141> [UID = 1]: serialize cuda engine to file: /home/dstream/Documents/Deep_Stream_App/models/yolov8n/yolov8n-face.onnx_b1_gpu0_fp32.engine successfully
WARNING: [TRT]: The getMaxBatchSize() function should not be used with an engine built from a network created with NetworkDefinitionCreationFlag::kEXPLICIT_BATCH flag. This function will always return 1.
INFO: ../nvdsinfer/nvdsinfer_model_builder.cpp:612 [Implicit Engine Info]: layers num: 2
0   INPUT  kFLOAT images          3x640x640       
1   OUTPUT kFLOAT output0         5x8400          

0:02:08.434920793 110757 0x560d1e2f9070 INFO                 nvinfer gstnvinfer_impl.cpp:343:notifyLoadModelStatus:<primary-inference> [UID 1]: Load new model:/home/dstream/Documents/Deep_Stream_App/configs/yolov8_infer_config.txt sucessfully
Decodebin child added: source
Decodebin child added: decodebin0
Decodebin child added: qtdemux0
Decodebin child added: multiqueue0
Decodebin child added: h264parse0
Decodebin child added: capsfilter0
Decodebin child added: aacparse0
Decodebin child added: avdec_aac0
Decodebin child added: nvv4l2decoder0
New pad callback
Pad type: video/x-raw
New pad callback
Pad type: audio/x-raw
Expected 2 output layers (boxes and scores), but got 1
0:02:09.638342773 110757 0x560d857b3210 ERROR                nvinfer gstnvinfer.cpp:676:gst_nvinfer_logger:<primary-inference> NvDsInferContext[UID 1]: Error in NvDsInferContextImpl::fillDetectionOutput() <nvdsinfer_context_impl_output_parsing.cpp:727> [UID = 1]: Failed to parse bboxes using custom parse function
Segmentation fault (core dumped)

What does your model output look like? This error should mean that your parser code does not match the model output.

Can you share your ONNX and explain the meaning of the output tensor? I can’t determine the issue from the code above.

yolov8n-face-onnx.zip (10.1 MB)

Here I have attached the ONNX model used and the corresponding output data for each format.

ONNX format output:

  • 1st dimension (Batch size = 1): The model processes one image at a time.
  • 2nd dimension (5): Represents the bounding box format (e.g., x_center, y_center, width, height, confidence).
  • 3rd dimension (8400): The number of detection candidates (i.e., number of anchor points/grid cells used for prediction).
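A minimal sketch of an inspection script like onnx_model_graph.py (assuming onnxruntime is installed and a dummy input is enough to show the shapes; the actual script may differ):

import numpy as np
import onnxruntime as ort

# Load the exported face model and run a dummy input to inspect the output shapes
sess = ort.InferenceSession("yolov8n-face.onnx", providers=["CPUExecutionProvider"])
inp = sess.get_inputs()[0]
dummy = np.zeros((1, 3, 640, 640), dtype=np.float32)
outputs = sess.run(None, {inp.name: dummy})
for i, out in enumerate(outputs):
    print(f"Output {i} shape: {out.shape}")
print("Output of the onnx model is :", outputs)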

dstream@tokkio:~/Documents/Deep_Stream_App/Code$ python3 onnx_model_graph.py
Output 0 shape: (1, 5, 8400)
Output of the onnx model is : [array([[[5.8619080e+00, 1.2464156e+01, 2.0098057e+01, …,
5.4958411e+02, 5.6107458e+02, 5.7485925e+02],
[9.2494926e+00, 8.7434731e+00, 6.7619829e+00, …,
5.9759375e+02, 5.9369690e+02, 5.8265234e+02],
[1.1546909e+01, 2.2310633e+01, 2.4263664e+01, …,
1.7415131e+02, 1.5643512e+02, 1.2996936e+02],
[1.7989792e+01, 1.7483654e+01, 1.3686922e+01, …,
8.5384094e+01, 9.3202515e+01, 1.1504486e+02],
[1.2776256e-04, 1.3276935e-04, 7.4058771e-05, …,
8.5216761e-04, 9.1385841e-04, 8.6909533e-04]]], dtype=float32)]

Engine format output

shape of the engine model output (5,8400)
Raw output sample of engine model : [[7.20616341e+00 1.27603703e+01 1.82915344e+01 … 5.52990173e+02
5.68051453e+02 5.81696838e+02]
[9.43836975e+00 8.45473576e+00 6.90096951e+00 … 5.77839417e+02
5.81104614e+02 5.82301636e+02]
[1.41335354e+01 2.26355820e+01 2.37958717e+01 … 1.67598022e+02
1.41069092e+02 1.15839966e+02]
[1.84598618e+01 1.70480633e+01 1.40378571e+01 … 1.23966431e+02
1.17560608e+02 1.16858337e+02]
[1.71264168e-04 1.66447891e-04 1.11181660e-04 … 3.26974659e-05
3.65621963e-05 2.94052170e-05]]
x_center : [ 7.2061634 12.76037 18.291534 … 552.9902 568.05145
581.69684 ]
y_center : [ 9.43837 8.454736 6.9009695 … 577.8394 581.1046
582.30164 ]
width : [ 14.133535 22.635582 23.795872 … 167.59802 141.06909 115.839966]
height : [ 18.459862 17.048063 14.037857 … 123.96643 117.56061 116.85834 ]
confidence : [1.7126417e-04 1.6644789e-04 1.1118166e-04 … 3.2697466e-05 3.6562196e-05
2.9405217e-05]

  1. For YOLOv8 models, you must add a transpose node at the end of the network using onnx-graphsurgeon:
pip install onnx_graphsurgeon

Run the following Python script to convert yolov8n-face.onnx to yolov8n-face-dynamic_batch_640.onnx:

import onnx_graphsurgeon as gs
import numpy as np
import onnx

graph = gs.import_onnx(onnx.load("yolov8n-face.onnx"))
# graph = gs.import_onnx(onnx.load("yolov8-s.onnx"))
ori_output = graph.outputs[0]
trans_out  = gs.Variable(name="trans_out", dtype=np.float32, shape=(-1, 8400, 5))
trans_node = gs.Node(op="Transpose",name="transpose_output_node", attrs={"perm":np.array([0,2,1])}, inputs=[ori_output], outputs=[trans_out])
graph.nodes.append(trans_node)
graph.outputs = [trans_out]
graph.cleanup(remove_unused_graph_inputs=True).toposort()
model = onnx.shape_inference.infer_shapes(gs.export_onnx(graph))
onnx.save(model, "yolov8n-face-dynamic_batch_640.onnx")
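As an optional sanity check (a minimal sketch using the onnx package), you can reload the converted model and confirm that the output dims are now (-1, 8400, 5):

import onnx

model = onnx.load("yolov8n-face-dynamic_batch_640.onnx")
# The single graph output should report dims [-1, 8400, 5]
print(model.graph.output[0])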

2. Save the content below as yolov8-config.txt and nvdsparsebbox_Yolo.cpp.

[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
model-color-format=0
infer-dims=3;640;640
onnx-file=yolov8n-face-dynamic_batch_640.onnx
model-engine-file=yolov8n-face-dynamic_batch_640.onnx_b1_gpu0_fp32.engine
#int8-calib-file=calib.table
#labelfile-path=labels.txt
batch-size=1
network-mode=0
num-detected-classes=1
interval=0
gie-unique-id=1
process-mode=1
network-type=0
## 1=DBSCAN, 2=NMS, 3= DBSCAN+NMS Hybrid, 4 = None(No clustering)
cluster-mode=2
maintain-aspect-ratio=1
symmetric-padding=1
#workspace-size=2000
custom-lib-path=libnvdsinfer_custom_impl_Yolo.so
parse-bbox-func-name=NvDsInferParseCustomYoloV8

[class-attrs-all]
nms-iou-threshold=0.3
pre-cluster-threshold=0.25
topk=300
/*
 * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES.
 * All rights reserved. SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "nvdsinfer_custom_impl.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <unordered_map>

static const int NUM_CLASSES_YOLO = 1;

float clamp(const float val, const float minVal, const float maxVal) {
  assert(minVal <= maxVal);
  return std::min(maxVal, std::max(minVal, val));
}

static NvDsInferParseObjectInfo
convertBBoxYolo(const float &bx, const float &by, const float &bw,
                const float &bh, const int &stride, const uint &netW,
                const uint &netH) {
  NvDsInferParseObjectInfo b;
  // Restore coordinates to network input resolution
  float xCenter = bx * stride;
  float yCenter = by * stride;
  float x0 = xCenter - bw / 2;
  float y0 = yCenter - bh / 2;
  float x1 = x0 + bw;
  float y1 = y0 + bh;

  x0 = clamp(x0, 0, netW);
  y0 = clamp(y0, 0, netH);
  x1 = clamp(x1, 0, netW);
  y1 = clamp(y1, 0, netH);

  b.left = x0;
  b.width = clamp(x1 - x0, 0, netW);
  b.top = y0;
  b.height = clamp(y1 - y0, 0, netH);

  return b;
}

static void addBBoxProposalYolo(const float bx, const float by, const float bw,
                                const float bh, const uint stride,
                                const uint &netW, const uint &netH,
                                const int maxIndex, const float maxProb,
                                std::vector<NvDsInferParseObjectInfo> &binfo) {
  NvDsInferParseObjectInfo bbi =
      convertBBoxYolo(bx, by, bw, bh, stride, netW, netH);
  if (bbi.width < 1 || bbi.height < 1)
    return;

  bbi.detectionConfidence = maxProb;
  bbi.classId = maxIndex;
  binfo.push_back(bbi);
}

static bool
NvDsInferParseYoloV8(std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
                     NvDsInferNetworkInfo const &networkInfo,
                     NvDsInferParseDetectionParams const &detectionParams,
                     std::vector<NvDsInferParseObjectInfo> &objectList) {
  if (outputLayersInfo.empty()) {
    std::cerr << "Could not find output layer in bbox parsing" << std::endl;
    ;
    return false;
  }
  const NvDsInferLayerInfo &layer = outputLayersInfo[0];

  if (NUM_CLASSES_YOLO != detectionParams.numClassesConfigured) {
    std::cerr << "WARNING: Num classes mismatch. Configured:"
              << detectionParams.numClassesConfigured
              << ", detected by network: " << NUM_CLASSES_YOLO << std::endl;
  }

  std::vector<NvDsInferParseObjectInfo> objects;

  float *data = (float *)layer.buffer;
  const int dimensions = layer.inferDims.d[1];
  int rows = layer.inferDims.numElements / layer.inferDims.d[1];
  // dimensions = 5 (x, y, w, h, score), rows = 8400
  // printf("dimensions %d rows %d\n", dimensions, rows);

  for (int i = 0; i < rows; ++i) {
    // per row: x, y, w, h, score0...score(NUM_CLASSES_YOLO-1) (85 values in the generic 80-class YOLOv8)
    float bx = data[0];
    float by = data[1];
    float bw = data[2];
    float bh = data[3];
    float *classes_scores = data + 4;

    float maxScore = 0;
    int index = 0;
    for (int j = 0; j < NUM_CLASSES_YOLO; j++) {
      if (*classes_scores > maxScore) {
        index = j;
        maxScore = *classes_scores;
      }
      classes_scores++;
    }

    int maxIndex = index;
    data += dimensions;
    // printf("maxIndex %d \n", maxIndex);
    // share the same addBBoxProposal function
    addBBoxProposalYolo(bx, by, bw, bh, 1, networkInfo.width,
                        networkInfo.height, maxIndex, maxScore, objects);
  }
  objectList = objects;
  return true;
}

extern "C" bool NvDsInferParseCustomYoloV8(
    std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
    NvDsInferNetworkInfo const &networkInfo,
    NvDsInferParseDetectionParams const &detectionParams,
    std::vector<NvDsInferParseObjectInfo> &objectList) {
  return NvDsInferParseYoloV8(outputLayersInfo, networkInfo, detectionParams,
                              objectList);
}

CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV8);

  3. Build the code into libnvdsinfer_custom_impl_Yolo.so (a typical build command is sketched below).
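A typical build command for a single-file custom parser might look like this (a sketch; the include paths assume the default DeepStream and CUDA install locations):

g++ -Wall -std=c++14 -shared -fPIC nvdsparsebbox_Yolo.cpp \
    -o libnvdsinfer_custom_impl_Yolo.so \
    -I/opt/nvidia/deepstream/deepstream/sources/includes \
    -I/usr/local/cuda/include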

4. Run the gst-launch-1.0 command below. You can see that the face is detected correctly.

gst-launch-1.0 uridecodebin uri="your video url" ! m.sink_0  nvstreammux name=m batch-size=1 width=1920 height=1080 ! nvinfer config-file-path=yolov8-config.txt ! nvvideoconvert ! 'video/x-raw(memory:NVMM),format=RGBA' ! nvdsosd ! nvvideoconvert ! nveglglessink

Hi @junshengy, thanks for your support. The solution you provided worked; we are able to detect objects.

@junshengy I am trying to detect faces and recognize the person in the frame. With your guided approach I am able to detect faces, and I am now facing issues with recognizing them.
I tried using ArcFace by taking the weights and architecture and converting them to ONNX format, but this did not go well.
Can you please suggest where I can get a recognition (feature extraction) model and use it in the SGIE?
Here are the two steps I need to perform:

  1. Generate embeddings for all the pictures I have in my DB, then store them in either a pickle file or the DB.
  2. Compare these against the video input and recognize the face.

Thanks for your support.

I think your pipeline should look like the one below. Add output-tensor-meta=1 to the SGIE configuration file; then you can get the output face features and do recognition in post-processing.

pgie(output face bbox)  -> nvdspreprocess (face to tensor) --> sgie(arcface) 
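A minimal sketch of what the SGIE config could contain (the model file name and most values here are assumptions, not a tested ArcFace configuration):

[property]
gpu-id=0
# assumed ArcFace model exported to ONNX
onnx-file=arcface.onnx
batch-size=16
network-mode=0
# secondary mode: operate on objects detected by the pgie
process-mode=2
gie-unique-id=2
operate-on-gie-id=1
# attach the raw output tensors (face embeddings) to the metadata
output-tensor-meta=1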

Refer to this sample
/opt/nvidia/deepstream/deepstream/sources/apps/sample_apps/deepstream-infer-tensor-meta-test/deepstream_infer_tensor_meta_test.cpp

/* convert to tensor metadata */
        NvDsInferTensorMeta *meta =
            (NvDsInferTensorMeta *) user_meta->user_meta_data;

        for (unsigned int i = 0; i < meta->num_output_layers; i++) {
          NvDsInferLayerInfo *info = &meta->output_layers_info[i];
          info->buffer = meta->out_buf_ptrs_host[i];
          if (use_device_mem && meta->out_buf_ptrs_dev[i]) {
            cudaMemcpy (meta->out_buf_ptrs_host[i], meta->out_buf_ptrs_dev[i],
                info->inferDims.numElements * 4, cudaMemcpyDeviceToHost);
          }
        }
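For the recognition step itself in post-processing, the embedding read from the tensor meta can be compared against the stored gallery embeddings, for example with cosine similarity (a minimal NumPy sketch; the threshold and gallery layout are assumptions):

import numpy as np

def recognize(embedding, gallery, threshold=0.35):
    """gallery: dict mapping person name -> L2-normalized reference embedding."""
    emb = embedding / np.linalg.norm(embedding)
    best_name, best_score = None, -1.0
    for name, ref in gallery.items():
        score = float(np.dot(emb, ref))  # cosine similarity of normalized vectors
        if score > best_score:
            best_name, best_score = name, score
    return (best_name, best_score) if best_score >= threshold else (None, best_score)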

By the way, please open a new topic to discuss follow-up questions

Hi @junshengy, just to clarify: we are able to get the tensor, and we are facing issues with the SGIE while generating the embedding.

I have created a new topic for this discussion.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.