Please provide complete information as applicable to your setup.
• Hardware Platform (Jetson / GPU) - GPU
• DeepStream Version - 7.1
• JetPack Version (valid for Jetson only) - NA
• TensorRT Version - 8.6.1
• NVIDIA GPU Driver Version (valid for GPU only) - 535.216.01
• Issue Type (questions, new requirements, bugs) - questions
• How to reproduce the issue? (This is for bugs. Include which sample app is used, the configuration file contents, the command line used, and other details for reproducing.) - see below
Issue:
We are using the YOLOv8n face detection model in DeepStream to detect faces in a video stream. However, we have observed that the model's detections are not present in the frame metadata, so no bounding boxes are drawn on the output.
Note: when the same model is run with Python OpenCV on the same input video stream, the bounding boxes appear as expected.
Assistance Required:
We are seeking help to verify the flow of our implementation and to identify any deviations or issues. Specifically, we would like a review of our custom parser to confirm that it correctly handles the metadata (frame metadata) and writes the bounding-box values into it as expected.
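For reference, this is the pattern our probe follows to check whether nvinfer attached any object metadata (a minimal sketch using the standard pyds bindings; pgie_src_pad_buffer_probe here is illustrative — our real probe, pgie_sink_pad_buffer_probe, lives in custom_probe.py):

import pyds
from gi.repository import Gst

def pgie_src_pad_buffer_probe(pad, info, u_data):
    # Walk batch -> frame -> object metadata and print any bounding
    # boxes that the custom parser attached downstream of nvinfer.
    batch_meta = pyds.gst_buffer_get_nvds_batch_meta(hash(info.get_buffer()))
    l_frame = batch_meta.frame_meta_list
    while l_frame is not None:
        frame_meta = pyds.NvDsFrameMeta.cast(l_frame.data)
        print(f"frame {frame_meta.frame_num}: {frame_meta.num_obj_meta} objects")
        l_obj = frame_meta.obj_meta_list
        while l_obj is not None:
            obj = pyds.NvDsObjectMeta.cast(l_obj.data)
            r = obj.rect_params
            print(f"  bbox l={r.left} t={r.top} w={r.width} h={r.height} "
                  f"conf={obj.confidence}")
            try:
                l_obj = l_obj.next
            except StopIteration:
                break
        try:
            l_frame = l_frame.next
        except StopIteration:
            break
    return Gst.PadProbeReturn.OK

In the pipeline below this style of probe is attached to the pgie src pad with add_probe(Gst.PadProbeType.BUFFER, ...).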
Here are the supporting details.
app.py
import sys
import math
import gi
gi.require_version('Gst', '1.0')
gi.require_version('GLib', '2.0')
from gi.repository import Gst, GLib, GObject
from data_loading import *
from custom_probe import *
def decodebin_child_added(child_proxy, Object, name, user_data):
    print(f"Decodebin child added: {name}")
    if "decodebin" in name:
        Object.connect("child-added", decodebin_child_added, user_data)
    if "source" in name:
        source_element = child_proxy.get_by_name("source")
        if source_element and source_element.find_property('drop-on-latency') is not None:
            source_element.set_property("drop-on-latency", True)
def cb_newpad(decodebin, decoder_src_pad, data):
    print("New pad callback")
    caps = decoder_src_pad.get_current_caps()
    if not caps:
        caps = decoder_src_pad.query_caps(None)
    gststruct = caps.get_structure(0)
    gstname = gststruct.get_name()
    source_bin = data
    print(f"Pad type: {gstname}")
    if "video" in gstname:
        bin_ghost_pad = source_bin.get_static_pad("src")
        if not bin_ghost_pad.set_target(decoder_src_pad):
            sys.stderr.write("Failed to link decoder src pad to source bin ghost pad\n")
def create_source_bin(index, uri):
    bin_name = f"source-bin-{index:02d}"
    nbin = Gst.Bin.new(bin_name)
    if not nbin:
        sys.stderr.write("Unable to create source bin\n")
        return None
    uri_decode_bin = Gst.ElementFactory.make("uridecodebin", "uri-decode-bin")
    if not uri_decode_bin:
        sys.stderr.write("Unable to create uridecodebin\n")
        return None
    uri_decode_bin.set_property("uri", uri)
    uri_decode_bin.connect("pad-added", cb_newpad, nbin)
    uri_decode_bin.connect("child-added", decodebin_child_added, nbin)
    Gst.Bin.add(nbin, uri_decode_bin)
    if not nbin.add_pad(Gst.GhostPad.new_no_target("src", Gst.PadDirection.SRC)):
        sys.stderr.write("Failed to add ghost pad in source bin\n")
        return None
    return nbin
def main(cfg):
    Gst.init(None)
    print("Creating DeepStream Face Detection Pipeline")
    # Create Pipeline
    pipeline = Gst.Pipeline()
    if not pipeline:
        print("Error: Unable to create pipeline")
        sys.exit(1)
    else:
        print("Pipeline created successfully")
    # Create Stream Muxer
    streammux = Gst.ElementFactory.make("nvstreammux", "Stream-muxer")
    pipeline.add(streammux)
    set_property(cfg, streammux, "streammux")
    # Create and Add Source Bin
    sources = cfg['source']
    source_bin = create_source_bin(0, list(sources.values())[0])
    pipeline.add(source_bin)
    # Link Source to Stream Muxer
    sinkpad = streammux.get_request_pad("sink_0")
    srcpad = source_bin.get_static_pad("src")
    if sinkpad is None or srcpad is None:
        print("Error: Source or Streammux pad not found!")
    else:
        print(">>> Linking Source Bin to StreamMuxer")
        srcpad.link(sinkpad)
    # Create Primary Inference (Face Detection)
    pgie = Gst.ElementFactory.make("nvinfer", "primary-inference")
    pipeline.add(pgie)
    set_property(cfg, pgie, "pgie")
    tracker = Gst.ElementFactory.make("nvtracker", "tracker")
    pipeline.add(tracker)
    set_tracker_properties(tracker, cfg['tracker']['config-file-path'])
    # Create Tiler
    tiler = Gst.ElementFactory.make("nvmultistreamtiler", "nvtiler")
    pipeline.add(tiler)
    tiler.set_property("rows", 1)
    tiler.set_property("columns", 1)
    tiler.set_property("width", 1920)
    tiler.set_property("height", 1080)
    # Create Video Converter
    nvvidconv = Gst.ElementFactory.make("nvvideoconvert", "convertor")
    pipeline.add(nvvidconv)
    # Create On-Screen Display
    nvosd = Gst.ElementFactory.make("nvdsosd", "onscreendisplay")
    nvosd.set_property("process-mode", 0)  # Default mode (draw bounding boxes)
    nvosd.set_property("display-text", 1)
    pipeline.add(nvosd)
    # Create Sink
    sink = Gst.ElementFactory.make("nveglglessink", "file-sink")
    pipeline.add(sink)
    sink.set_property("sync", 0)
    print(">>> After creating elements, linking of elements is started")
    # srcpad.link(sinkpad)
    streammux.link(pgie)
    # pgie_src_pad = pgie.get_static_pad("sink")
    # if pgie_src_pad:
    #     pgie_src_pad.add_probe(Gst.PadProbeType.BUFFER, pgie_sink_pad_buffer_probe)
    pgie.link(tracker)
    tracker.link(tiler)
    tiler.link(nvvidconv)
    nvvidconv.link(nvosd)
    nvosd.link(sink)
    pgie_src_pad = pgie.get_static_pad("src")
    if pgie_src_pad:
        pgie_src_pad.add_probe(Gst.PadProbeType.BUFFER, pgie_sink_pad_buffer_probe)
    loop = GLib.MainLoop()
    # Bus Message Handling
    bus = pipeline.get_bus()
    bus.add_signal_watch()
    bus.connect("message", bus_call, loop)
    # Start Pipeline
    pipeline.set_state(Gst.State.PLAYING)
    try:
        loop.run()
    except Exception as e:
        print(f"Pipeline error: {e}")
    finally:
        pipeline.set_state(Gst.State.NULL)
def bus_call(bus, message, loop):
    t = message.type
    if t == Gst.MessageType.EOS:
        print("End-of-stream")
        loop.quit()
    elif t == Gst.MessageType.ERROR:
        err, debug = message.parse_error()
        print(f"Error: {err}")
        print(f"Debug info: {debug}")
        loop.quit()
    return True
if __name__ == '__main__':
    cfg = parse_args(cfg_path="/home/dstream/Documents/Deep_Stream_App/paths/paths.toml")
    main(cfg)
yolov8_infer_config
[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
model-color-format=0
model-engine-file=/home/dstream/Documents/Deep_Stream_App/models/yolov8n/yolov8n-1-face.engine
#int8-calib-file=calib.table
labelfile-path=/home/dstream/Documents/Deep_Stream_App/configs/labels.txt
infer-dims=3;640;640
network-input-order=0
num-detected-classes=1
interval=0
batch-size=1
network-mode=0
uff-input-blob-name=images
output-blob-names=output0
output-tensor-meta=1
gie-unique-id=1
process-mode=1
network-type=0
cluster-mode=2
maintain-aspect-ratio=1
symmetric-padding=1
#workspace-size=2000
parse-bbox-func-name=NvDsInferParseCustomYoloV8
custom-lib-path=/home/dstream/Documents/Deep_Stream_App/yolov8_parser/yolov8n_parser.so
[class-attrs-all]
pre-cluster-threshold=0.75
topk=300
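Since the same model works through OpenCV, we also compared preprocessing. Our understanding of the [property] keys above is that nvinfer scales pixels by net-scale-factor (≈ 1/255), keeps the aspect ratio, and pads symmetrically to 640x640. The following Python sketch shows what we assume nvinfer does (our interpretation for comparison with our OpenCV code, not nvinfer's exact implementation):

import cv2
import numpy as np

def deepstream_like_preprocess(frame_bgr, net_w=640, net_h=640):
    # maintain-aspect-ratio=1: scale so the frame fits inside the network input
    h, w = frame_bgr.shape[:2]
    scale = min(net_w / w, net_h / h)
    new_w, new_h = int(w * scale), int(h * scale)
    resized = cv2.resize(frame_bgr, (new_w, new_h))
    # symmetric-padding=1: pad equally on both sides to reach 640x640
    pad_x, pad_y = (net_w - new_w) // 2, (net_h - new_h) // 2
    padded = cv2.copyMakeBorder(resized, pad_y, net_h - new_h - pad_y,
                                pad_x, net_w - new_w - pad_x,
                                cv2.BORDER_CONSTANT, value=(0, 0, 0))
    # model-color-format=0 is RGB; net-scale-factor ~ 1/255 maps pixels to [0, 1]
    rgb = cv2.cvtColor(padded, cv2.COLOR_BGR2RGB).astype(np.float32)
    return rgb * 0.0039215697906911373  # HWC here; nvinfer feeds NCHW internally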
YoloV8n Parser
#include <iostream>
#include <vector>
#include <algorithm>
#include "nvdsinfer_custom_impl.h"
#define CONF_THRESHOLD 0.5 // Confidence threshold
#define INPUT_WIDTH 640
#define INPUT_HEIGHT 640
extern "C" bool NvDsInferParseCustomYoloV8Face(
    std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
    NvDsInferNetworkInfo const &networkInfo,
    std::vector<NvDsInferParseObjectInfo> &objectList)
{
    if (outputLayersInfo.empty())
    {
        std::cerr << "Error: Output layers info is empty!" << std::endl;
        return false;
    }
    const NvDsInferLayerInfo &outputLayer = outputLayersInfo[0];
    if (!outputLayer.buffer)
    {
        std::cerr << "Error: Output layer buffer is null!" << std::endl;
        return false;
    }
    const float *output = reinterpret_cast<const float *>(outputLayer.buffer);
    std::cout << "output : " << output << std::endl;
    int num_attrs = outputLayer.inferDims.d[0];      // Number of attributes (5)
    int num_detections = outputLayer.inferDims.d[1]; // Number of detections (anchor points, 8400)
    if (num_attrs != 5)
    {
        std::cerr << "Error: Expected 5 attributes, but got: " << num_attrs << std::endl;
        return false;
    }
    std::cout << "Number of detections (anchor points): " << num_detections << std::endl;
    std::cout << "Number of attributes per detection: " << num_attrs << std::endl;
    // Print raw output values (first 10 detections for debugging)
    std::cout << "Raw Output Data (First 10 detections):\n";
    for (int i = 0; i < std::min(10, num_detections); i++)
    {
        float x_center = output[i + num_detections * 0];
        float y_center = output[i + num_detections * 1];
        float width = output[i + num_detections * 2];
        float height = output[i + num_detections * 3];
        float conf = output[i + num_detections * 4];
        std::cout << "Detection " << i << " - X: " << x_center
                  << ", Y: " << y_center << ", W: " << width
                  << ", H: " << height << ", Conf: " << conf << "\n";
    }
    for (int i = 0; i < num_attrs; i++)
    {
        float x_center = output[i + num_detections * 0];
        float y_center = output[i + num_detections * 1];
        float width = output[i + num_detections * 2];
        float height = output[i + num_detections * 3];
        float conf = output[i + num_detections * 4];
        if (conf < CONF_THRESHOLD)
            continue; // Skip low-confidence detections
        // Convert (x_center, y_center, width, height) → (x1, y1, x2, y2)
        float x1 = x_center - width / 2;
        float y1 = y_center - height / 2;
        float x2 = x_center + width / 2;
        float y2 = y_center + height / 2;
        // Normalize to the input image size
        x1 *= networkInfo.width;
        y1 *= networkInfo.height;
        x2 *= networkInfo.width;
        y2 *= networkInfo.height;
        // Ensure bounding box is within valid range
        x1 = std::max(0.0f, std::min(x1, static_cast<float>(networkInfo.width)));
        y1 = std::max(0.0f, std::min(y1, static_cast<float>(networkInfo.height)));
        x2 = std::max(0.0f, std::min(x2, static_cast<float>(networkInfo.width)));
        y2 = std::max(0.0f, std::min(y2, static_cast<float>(networkInfo.height)));
        // Store parsed object
        NvDsInferParseObjectInfo obj;
        obj.left = x1;
        obj.top = y1;
        obj.width = x2 - x1;
        obj.height = y2 - y1;
        obj.detectionConfidence = conf;
        obj.classId = 0; // Single-class model (faces)
        objectList.push_back(obj);
    }
    std::cout << "Parsed Objects: " << objectList.size() << " detected faces\n";
    return true;
}
extern "C" bool NvDsInferParseCustomYoloV8Face_cuda(
    std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
    NvDsInferNetworkInfo const &networkInfo,
    std::vector<NvDsInferParseObjectInfo> &objectList)
{
    return NvDsInferParseCustomYoloV8Face(outputLayersInfo, networkInfo, objectList);
}
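As a sanity check of the indexing above: YOLOv8 exports its raw head as a [num_attrs, num_detections] tensor, so output[i + num_detections * a] should read attribute a of detection i from the row-major buffer. A small numpy sketch (synthetic data, only to confirm the indexing scheme):

import numpy as np

num_attrs, num_detections = 5, 8400
# Synthetic stand-in for the raw [5, 8400] output tensor
out = np.arange(num_attrs * num_detections, dtype=np.float32).reshape(num_attrs, num_detections)
flat = out.ravel()  # row-major, like the flat TensorRT output buffer

i = 123  # arbitrary detection index
for a in range(num_attrs):
    # output[i + num_detections * a] in the parser == out[a, i] here
    assert flat[i + num_detections * a] == out[a, i]
print("indexing matches a row-major [num_attrs, num_detections] layout")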