DeepStream YOLOv8 parser

Description: I am using the YOLOv8 object detection model, which we trained to detect objects such as shoes and goggles, and integrated it into DeepStream for object detection.

Issue: I created a parser for the YOLOv8 object detection model and added it to DeepStream, but I am getting a black screen in the display window, as shown in the screenshot below:

Environment

TensorRT Version: 8.5.2
GPU Type: Jetson Xavier NX
Nvidia Driver Version: JetPack 5.1.4
CUDA Version: 11.5
Operating System + Version: Ubuntu 20.04
Python Version (if applicable): Python 3.8

Relevant Files

My parser code is:

#include "nvdsinfer_custom_impl.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <unordered_map>

// Integer ceiling division, e.g. DIVUP(640, 91) == 8
#define DIVUP(x, y) (((x) + (y) - 1) / (y))

static const int NUM_CLASSES_YOLO = 5; // Number of classes for YOLOv8 object detection

float clamp(const float val, const float minVal, const float maxVal) {
  assert(minVal <= maxVal);
  return std::min(maxVal, std::max(minVal, val));
}

static NvDsInferParseObjectInfo
convertBBoxYolo(const float &bx, const float &by, const float &bw,
                const float &bh, const int &stride, const uint &netW,
                const uint &netH) {
  NvDsInferParseObjectInfo b;

  float xCenter = bx * stride;
  float yCenter = by * stride;
  float x0 = xCenter - bw / 2;
  float y0 = yCenter - bh / 2;
  float x1 = x0 + bw;
  float y1 = y0 + bh;

  x0 = clamp(x0, 0, netW);
  y0 = clamp(y0, 0, netH);
  x1 = clamp(x1, 0, netW);
  y1 = clamp(y1, 0, netH);

  b.left = x0;
  b.width = clamp(x1 - x0, 0, netW);
  b.top = y0;
  b.height = clamp(y1 - y0, 0, netH);

  return b;
}
static void addBBoxProposalYolo(const float bx, const float by, const float bw,
                                const float bh, const uint stride,
                                const uint &netW, const uint &netH,
                                const int maxIndex, const float maxProb,
                                std::vector<NvDsInferParseObjectInfo> &binfo) {
  NvDsInferParseObjectInfo bbi = convertBBoxYolo(bx, by, bw, bh, stride, netW, netH);
  if (bbi.width < 1 || bbi.height < 1)
    return;

  bbi.detectionConfidence = maxProb;
  bbi.classId = maxIndex;
  binfo.push_back(bbi);
}

static bool
NvDsInferParseYoloV8(std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
                     NvDsInferNetworkInfo const &networkInfo,
                     NvDsInferParseDetectionParams const &detectionParams,
                     std::vector<NvDsInferParseObjectInfo> &objectList) {
  if (outputLayersInfo.empty()) {
    std::cerr << "ERROR: No output layers found in bbox parsing." << std::endl;
    return false;
  }

  const NvDsInferLayerInfo &layer = outputLayersInfo[0];

  // Print tensor shape
  std::cerr << "Input Tensor Shape: [";
  for (int i = 0; i < layer.inferDims.numDims; i++) {
    std::cerr << layer.inferDims.d[i] << (i < layer.inferDims.numDims - 1 ? ", " : "");
  }
  std::cerr << "]" << std::endl;

  // Validate the output layer dimensions
  if (layer.inferDims.numDims != 2 || layer.inferDims.d[0] != 9) {
    std::cerr << "ERROR: Invalid output layer dimensions. Expected shape: [9, 8400]." << std::endl;
    return false;
  }

  std::vector<NvDsInferParseObjectInfo> objects;
  float *data = (float *)layer.buffer;

  int dimensions = layer.inferDims.d[0]; // Should be 9 (4 box coords + 1 objectness + 4 class scores)
  int rows = layer.inferDims.d[1];       // Should be 8400 (number of detections)

  const uint gridSize = sqrt(rows); // Approximate grid size
  const uint stride = DIVUP(networkInfo.width, gridSize);

  std::cerr << "Parsing YOLOv8 output with dimensions: " << dimensions
            << " rows: " << rows << ", calculated stride: " << stride << std::endl;

  for (int i = 0; i < rows; ++i) {
    // Process each detection
    float bx = data[i * dimensions + 0];               // x_center
    float by = data[i * dimensions + 1];               // y_center
    float bw = data[i * dimensions + 2];               // width
    float bh = data[i * dimensions + 3];               // height
    float objectness = data[i * dimensions + 4];       // objectness score
    float *classes_scores = &data[i * dimensions + 5]; // class scores start at index 5

    // Find the class with the highest probability
    float maxScore = 0;
    int maxIndex = 0;

    for (int j = 0; j < NUM_CLASSES_YOLO; j++) {
      if (classes_scores[j] > maxScore) {
        maxIndex = j;
        maxScore = classes_scores[j];
      }
    }

    // Combine objectness with class score for final confidence
    float confidence = objectness * maxScore;

    // Log the parsed bounding box and confidence
    std::cerr << "Detection " << i << ": bx=" << bx << ", by=" << by
              << ", bw=" << bw << ", bh=" << bh
              << ", confidence=" << confidence << ", class=" << maxIndex << std::endl;

    // Add to list if above threshold
    if (confidence > detectionParams.perClassThreshold[maxIndex]) {
      addBBoxProposalYolo(bx, by, bw, bh, stride, networkInfo.width,
                          networkInfo.height, maxIndex, confidence, objects);
    }
  }

  std::cerr << "Number of objects detected: " << objects.size() << std::endl;
  for (const auto &obj : objects) {
    std::cerr << "Object: left=" << obj.left << ", top=" << obj.top
              << ", width=" << obj.width << ", height=" << obj.height
              << ", confidence=" << obj.detectionConfidence
              << ", classId=" << obj.classId << std::endl;
  }

  objectList = objects;
  return true;
}

extern "C" bool NvDsInferParseCustomYoloV8(
    std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
    NvDsInferNetworkInfo const &networkInfo,
    NvDsInferParseDetectionParams const &detectionParams,
    std::vector<NvDsInferParseObjectInfo> &objectList) {
  try {
    return NvDsInferParseYoloV8(outputLayersInfo, networkInfo, detectionParams,
                                objectList);
  } catch (const std::exception &e) {
    std::cerr << "ERROR: Exception in NvDsInferParseCustomYoloV8: " << e.what() << std::endl;
    return false;
  } catch (...) {
    std::cerr << "ERROR: Unknown exception in NvDsInferParseCustomYoloV8." << std::endl;
    return false;
  }
}

CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV8);

And the output log of the DeepStream app after adding the parser is:

yolov8_parser_error.txt (1.5 MB)

My config file is:

[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
model-color-format=0
infer-dims=3;640;640
onnx-file=/home/atmecs/Documents/ppe detection/models/Goggles_Shoes/Goggels_Shoes.onnx
model-engine-file=/home/atmecs/Documents/ppe detection/models/Goggles_Shoes/Goggels_Shoes.engine
#int8-calib-file=calib.table
labelfile-path=/home/atmecs/Documents/labels.txt
batch-size=1
network-mode=0
num-detected-classes=5
interval=0
gie-unique-id=1
process-mode=1
network-type=0

# 1=DBSCAN, 2=NMS, 3=DBSCAN+NMS Hybrid, 4=None (no clustering)

cluster-mode=2
maintain-aspect-ratio=1

symmetric-padding=1
#workspace-size=2000
#custom-lib-path=/home/atmecs/Desktop/DeepStream-Yolov8-Jetson-Nano/nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
custom-lib-path=/home/atmecs/Documents/ppe detection/yolov8 parser/yolov8n_goggle_shoe_parser.so
#parse-bbox-func-name=NvDsInferParseYolo
parse-bbox-func-name=NvDsInferParseCustomYoloV8

[class-attrs-all]
nms-iou-threshold=0.3
pre-cluster-threshold=0.25
topk=300

The classes my model detects are:

0- Safety Goggle
1- ToeGuard
2- Non Safety Shoes
3- Safety Shoes
4- Non Safety Goggles

@junshengy Could you please help with this issue? Your guidance would be greatly appreciated!

You need to check the pipeline first. Even if the post-processing is wrong, it usually does not cause the display to fail.

In addition, this is a complete sample of YOLOv8; you can refer to it.

Hi @junshengy,

Thank you for your previous response. I modified the pipeline, and after that, I was able to see the frames on the screen.

However, when I tried to install the pyds package to access the metadata, following the instructions from this link, I found that the approach in the link is for DeepStream 7.1 and Python 3.10, while I am currently using DeepStream 6.3 and Python 3.8. I searched for alternative approaches that might fit my versions but couldn't find any, so I followed the same approach and started installing. I then encountered the following issue:

atmecs@atmecs-desktop:/opt/nvidia/deepstream/deepstream-6.3/sources/deepstream_python_apps/bindings/3rdparty/gstreamer/subprojects/gst-python$ sudo meson setup build
The Meson build system
Version: 1.7.0
Source dir: /opt/nvidia/deepstream/deepstream-6.3/sources/deepstream_python_apps/bindings/3rdparty/gstreamer/subprojects/gst-python
Build dir: /opt/nvidia/deepstream/deepstream-6.3/sources/deepstream_python_apps/bindings/3rdparty/gstreamer/subprojects/gst-python/build
Build type: native build
Project name: gst-python
Project version: 1.20.3
C compiler for the host machine: cc (gcc 9.4.0 "cc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0")
C linker for the host machine: cc ld.bfd 2.34
Host machine cpu family: aarch64
Host machine cpu: aarch64
Found pkg-config: YES (/usr/bin/pkg-config) 0.29.1
Dependency gstreamer-1.0 found: NO. Found 1.16.3 but need: '>= 1.20.0'
Found CMake: /usr/bin/cmake (3.16.3)
DEPRECATION: CMake support for versions <3.17 is deprecated since Meson 0.62.0.
|
| However, Meson was only able to find CMake 3.16.3.
|
| Support for all CMake versions below 3.17.0 will be removed once
| newer CMake versions are more widely adopted. If you encounter
| any errors please try upgrading CMake to a newer version first.

Run-time dependency gstreamer-1.0 found: NO (tried pkgconfig and cmake)
Looking for a fallback subproject for the dependency gstreamer-1.0

meson.build:18:10: ERROR: Neither a subproject directory nor a gstreamer.wrap file was found.

From the log, I understand that Meson is expecting gstreamer-1.0 version 1.20.0 or higher, but I currently have version 1.16.3 installed. I tried changing the dependency version in the meson.build file to 1.16.3, but then I encountered a pygobject dependency issue. It seems like this approach leads to a chain of dependency issues.

I need your guidance on how to resolve these dependency issues.

Additionally, I would like to ask the following:

If I upgrade the GStreamer version and install it, will it affect my existing DeepStream setup? My DeepStream version is 6.3, my JetPack version is 5.1.4, my Python version is 3.8, and I am using a Jetson Xavier NX device.

Could you suggest any alternative solutions to bypass or fix these dependency problems?

Is there any other approach I can follow for my DeepStream version?

To use pyds on DS-6.3, you need to check out the v1.1.8 branch.

You can also install the *.whl; of course, you need to install the dependencies in the above document first.

Hi @junshengy,

Thank you for the solution you provided; it worked well for me! I was able to install pyds successfully.

Previously, you shared a document for creating a parser for the model. Based on your advice, I used it as a reference and developed a parser for a YOLOv8 model trained to detect personal protective equipment (PPE). My model is trained to detect the following classes:

  1. Safety Goggles
  2. ToeGuard
  3. Non-Safety Shoes
  4. Safety Shoes
  5. Non-Safety Goggles

However, when I run the script, I encounter two main issues:

  1. Even though there is only one object in the frame (e.g., Safety Goggles), I am getting multiple bounding boxes displayed in the output.
  2. The bounding boxes are not correctly positioned over the detected object.

I’m unsure if these issues are related to the parser or the display. Could you please advise on what might be causing these problems?

Here’s the parser script I’m using:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <unordered_map>
#include "nvdsinfer_custom_impl.h"

static const int NUM_CLASSES_YOLO = 5;

float clamp(const float val, const float minVal, const float maxVal)
{
    assert(minVal <= maxVal);
    return std::min(maxVal, std::max(minVal, val));
}

static NvDsInferParseObjectInfo convertBBoxYoloV8(const float& bx, const float& by, const float& bw,
                                                  const float& bh, const int& stride, const uint& netW,
                                                  const uint& netH)
{
    NvDsInferParseObjectInfo b;
    // Restore coordinates to network input resolution
    float xCenter = bx * stride;
    float yCenter = by * stride;
    float x0 = xCenter - bw / 2;
    float y0 = yCenter - bh / 2;
    float x1 = x0 + bw;
    float y1 = y0 + bh;

    x0 = clamp(x0, 0, netW);
    y0 = clamp(y0, 0, netH);
    x1 = clamp(x1, 0, netW);
    y1 = clamp(y1, 0, netH);

    b.left = x0;
    b.width = clamp(x1 - x0, 0, netW);
    b.top = y0;
    b.height = clamp(y1 - y0, 0, netH);

    return b;
}

static void addBBoxProposalYoloV8(const float bx, const float by, const float bw, const float bh,
                                  const uint stride, const uint& netW, const uint& netH, const int maxIndex,
                                  const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo)
{
    NvDsInferParseObjectInfo bbi = convertBBoxYoloV8(bx, by, bw, bh, stride, netW, netH);
    if (bbi.width < 1 || bbi.height < 1) return;

    bbi.detectionConfidence = maxProb;
    bbi.classId = maxIndex;
    binfo.push_back(bbi);
}

static bool NvDsInferParseYoloV8(
    std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
    NvDsInferNetworkInfo const& networkInfo,
    NvDsInferParseDetectionParams const& detectionParams,
    std::vector<NvDsInferParseObjectInfo>& objectList)
{
    if (outputLayersInfo.empty()) {
        std::cerr << "Could not find output layer in bbox parsing" << std::endl;
        return false;
    }
    const NvDsInferLayerInfo &layer = outputLayersInfo[0];

    if (NUM_CLASSES_YOLO != detectionParams.numClassesConfigured)
    {
        std::cerr << "WARNING: Num classes mismatch. Configured:"
                  << detectionParams.numClassesConfigured
                  << ", detected by network: " << NUM_CLASSES_YOLO << std::endl;
    }

    std::vector<NvDsInferParseObjectInfo> objects;

    float* data = (float*)layer.buffer;
    const int dimensions = layer.inferDims.d[1];
    int rows = layer.inferDims.numElements / layer.inferDims.d[1];

    for (int i = 0; i < rows; ++i) {
        // 9 = x, y, w, h, score0 ... score4 for this 5-class model
        float bx = data[0];
        float by = data[1];
        float bw = data[2];
        float bh = data[3];
        float* classes_scores = data + 4;

        float maxScore = 0;
        int index = 0;
        for (int j = 0; j < NUM_CLASSES_YOLO; j++) {
            if (*classes_scores > maxScore) {
                index = j;
                maxScore = *classes_scores;
            }
            classes_scores++;
        }

        // Important: check the confidence threshold here
        if (maxScore > detectionParams.perClassPreclusterThreshold[index]) {
            int maxIndex = index;
            data += dimensions;
            // Use maxScore as the confidence instead of always using 1.0
            addBBoxProposalYoloV8(bx, by, bw, bh, 1, networkInfo.width, networkInfo.height, maxIndex, maxScore, objects);
        } else {
            data += dimensions;
        }
    }
    objectList = objects;
    return true;
}

extern "C" bool NvDsInferParseCustomYoloV8(
    std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
    NvDsInferNetworkInfo const& networkInfo,
    NvDsInferParseDetectionParams const& detectionParams,
    std::vector<NvDsInferParseObjectInfo>& objectList)
{
    return NvDsInferParseYoloV8(
        outputLayersInfo, networkInfo, detectionParams, objectList);
}

/* Check that the custom function has been defined correctly */
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV8);

And here’s my OSD display script:

def osd_sink_pad_buffer_probe(pad, info, u_data):
    frame_number = 0
    num_rects = 0

    gst_buffer = info.get_buffer()
    if not gst_buffer:
        print("Unable to get GstBuffer")
        return

    # Retrieve batch metadata from the gst_buffer
    # Note that pyds.gst_buffer_get_nvds_batch_meta() expects the
    # C address of gst_buffer as input, which is obtained with hash(gst_buffer)
    batch_meta = pyds.gst_buffer_get_nvds_batch_meta(hash(gst_buffer))
    l_frame = batch_meta.frame_meta_list
    while l_frame is not None:
        try:
            # Note that l_frame.data needs a cast to pyds.NvDsFrameMeta
            # The casting is done by pyds.NvDsFrameMeta.cast()
            # The casting also keeps ownership of the underlying memory
            # in the C code, so the Python garbage collector will leave
            # it alone.
            frame_meta = pyds.NvDsFrameMeta.cast(l_frame.data)
        except StopIteration:
            break

        # Initializing the object counter with 0. The class-id constants
        # (Safety_Goggle = 0, ToeGuard = 1, Non_Safety_Shoes = 2,
        # Safety_Shoes = 3, Non_Safety_Goggles = 4) are assumed to be
        # defined at module level, matching the label mapping above.
        obj_counter = {
            Safety_Goggle: 0,
            ToeGuard: 0,
            Non_Safety_Shoes: 0,
            Safety_Shoes: 0,
            Non_Safety_Goggles: 0,
        }
        frame_number = frame_meta.frame_num
        num_rects = frame_meta.num_obj_meta
        l_obj = frame_meta.obj_meta_list
        while l_obj is not None:
            try:
                # Casting l_obj.data to pyds.NvDsObjectMeta
                obj_meta = pyds.NvDsObjectMeta.cast(l_obj.data)
            except StopIteration:
                break
            obj_counter[obj_meta.class_id] += 1
            obj_meta.rect_params.border_color.set(0.0, 0.0, 1.0, 0.8)  # 0.8 is alpha (opacity)
            try:
                l_obj = l_obj.next
            except StopIteration:
                break

        # Acquiring a display meta object. The memory ownership remains in
        # the C code so downstream plugins can still access it. Otherwise
        # the garbage collector will claim it when this probe function exits.
        display_meta = pyds.nvds_acquire_display_meta_from_pool(batch_meta)
        display_meta.num_labels = 1
        py_nvosd_text_params = display_meta.text_params[0]
        # Setting display text to be shown on screen
        # Note that the pyds module allocates a buffer for the string, and the
        # memory will not be claimed by the garbage collector.
        # Reading the display_text field here will return the C address of the
        # allocated string. Use pyds.get_string() to get the string content.
        py_nvosd_text_params.display_text = "Frame Number={} Number of Objects={} Safety_Goggle={} ToeGuard={} Non_Safety_Shoes={} Safety_Shoes={} Non_Safety_Goggles={}".format(
            frame_number, num_rects,
            obj_counter[Safety_Goggle],
            obj_counter[ToeGuard],
            obj_counter[Non_Safety_Shoes],
            obj_counter[Safety_Shoes],
            obj_counter[Non_Safety_Goggles]
        )

        # Now set the offsets where the string should appear
        py_nvosd_text_params.x_offset = 10
        py_nvosd_text_params.y_offset = 12

        # Font, font-color and font-size
        py_nvosd_text_params.font_params.font_name = "Serif"
        py_nvosd_text_params.font_params.font_size = 10
        # set(red, green, blue, alpha); set to White
        py_nvosd_text_params.font_params.font_color.set(1.0, 1.0, 1.0, 1.0)

        # Text background color
        py_nvosd_text_params.set_bg_clr = 1
        # set(red, green, blue, alpha); set to Black
        py_nvosd_text_params.text_bg_clr.set(0.0, 0.0, 0.0, 1.0)
        # Using pyds.get_string() to get display_text as string
        print(pyds.get_string(py_nvosd_text_params.display_text))
        pyds.nvds_add_display_meta_to_frame(frame_meta, display_meta)
        try:
            l_frame = l_frame.next
        except StopIteration:
            break

    return Gst.PadProbeReturn.OK

Additionally, I’m sharing an image taken from the output video stream that illustrates the issue I’m facing.

I think this is a problem with the model itself. Generally speaking, OSD will not have any problems.

If the output of the model is correct, even if there is a bug in the post-processing code, only one bbox should be obtained. In your nvinfer configuration file, have you modified the following content?

[class-attrs-all]
#nms-iou-threshold=0.3
#threshold=0.7
nms-iou-threshold=0.65
pre-cluster-threshold=0.45
topk=100

Hi @junshengy,

Following your suggestion, I tested the .pt model, and it works fine. However, when I tested the .engine model, I encountered issues. To investigate further, I loaded the .engine model outside of DeepStream and ran it with the provided input. Below is the terminal output I received:

Class ID: 4, Label: Non Safety Goggles: 4.66
Warning: class_id 62 is out of bounds, skipping detection.
Warning: class_id 9 is out of bounds, skipping detection.
Warning: class_id 35 is out of bounds, skipping detection.
Class ID: 4, Label: Non Safety Goggles: 4.18
Warning: class_id 60 is out of bounds, skipping detection.
Warning: class_id 8 is out of bounds, skipping detection.
Warning: class_id 35 is out of bounds, skipping detection.
Class ID: 4, Label: Non Safety Goggles: 4.39
Warning: class_id 60 is out of bounds, skipping detection.
Warning: class_id 9 is out of bounds, skipping detection.
Warning: class_id 35 is out of bounds, skipping detection.
Class ID: 4, Label: Non Safety Goggles: 4.04
Warning: class_id 61 is out of bounds, skipping detection.
Warning: class_id 8 is out of bounds, skipping detection.

This suggests that there may be an issue with the model itself, as it detects “Non-Safety Goggles” even when I am wearing safety goggles.
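For reference, a parser-side guard against such out-of-range class ids can be sketched as below. This is a minimal illustration: the helper name isValidClassId is introduced here, while NUM_CLASSES_YOLO and maxIndex refer to the parser code above.

// A minimal sketch, assuming the NUM_CLASSES_YOLO constant and the
// maxIndex variable from the parser above: drop any detection whose
// class id falls outside the configured range, mirroring the
// "class_id out of bounds" warnings in the log.
static inline bool isValidClassId(int classId, int numClasses)
{
    return classId >= 0 && classId < numClasses;
}

// Illustrative usage inside the parser's detection loop:
//   if (!isValidClassId(maxIndex, NUM_CLASSES_YOLO)) continue;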

To further diagnose the issue, I tested the parser to confirm whether it might be causing the problem. I used the pretrained YOLOv8 object detection model with “person” as the label for detecting persons. I set the number of classes to 1 in both the parser and the configuration file, and used the following settings:

[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
model-color-format=0

model-engine-file=/home/anand-orin-nx/Desktop/dsTesting/PPE_DeepStream-20250318T094105Z-001/ppe_rtsp-20250320T085729Z-001/ppe_rtsp/yolov8N.engine
labelfile-path=/home/anand-orin-nx/Desktop/dsTesting/PPE_DeepStream-20250318T094105Z-001/ppe_rtsp-20250320T085729Z-001/ppe_rtsp/labels.txt
infer-dims=3;640;640
batch-size=1
process-mode=1
network-type=0
network-mode=0
# Adjusted to match the model's number of classes
num-detected-classes=1
interval=0
gie-unique-id=1
cluster-mode=2
maintain-aspect-ratio=1
symmetric-padding=1

custom-lib-path=/home/anand-orin-nx/Desktop/dsTesting/PPE_DeepStream-20250318T094105Z-001/ppe_rtsp-20250320T085729Z-001/ppe_rtsp/yolov8N_parser.so
parse-bbox-func-name=NvDsInferParseCustomYoloV8

[class-attrs-all]
nms-iou-threshold=0.3
pre-cluster-threshold=0.25
topk=300

Even after making these changes, I am still facing the same issue with bounding boxes not being properly aligned.

Guidance Request:

  1. Model or parser issue: I am unsure whether the problem lies with the model or the parser. Could you help me confirm which one is at fault and, if the parser is wrong, how to adjust it? I have already built the parser using the reference link you shared and updated the number-of-classes parameter (NUM_CLASSES_YOLO in the parser, num-detected-classes in the config) to match my model, which detects five classes.

  2. PPE Detection with YOLOv8 in DeepStream: Has anyone successfully implemented PPE detection with DeepStream using the YOLOv8 model? If so, could you share any guidance or resources on how to properly configure it?

I have attached images that demonstrate the issue with the bounding boxes and detection:

I appreciate your guidance and help in troubleshooting this further.

Is there any model provided by NVIDIA for PPE detection?

How did you generate the engine file? Did you export ONNX and then generate it through nvinfer? That is the recommended way.
In addition, if your model's input and output differ from the sample's, you need to modify the post-processing code to match your model's input and output.

This is the input and output of the YOLOv8 model in deepstream_tools. If you do not modify the post-processing code, your model needs to match it.

DeepStream does not provide such a model, only the general YOLOv8 model; you need to adjust based on this model.

Thank you for your response. I followed two approaches to convert the ONNX model into a .engine file:

  1. I used the nvinfer method, where I provided the ONNX model and generated the engine file.
  2. I also used the trtexec command to convert the ONNX model into a TensorRT engine. (I had tried this approach before as well, and it also works.)

In both cases, I encountered the same issue.

Here are the model dimensions for our setup:

  • Input: 1x3x640x640
  • Output: 1x84x8400

When compared to the sample model dimensions you provided, I see a difference: the sample model's dimensions are 1x8400x84, while our model's are 1x84x8400.

Based on our model’s dimensions, could you guide me on how to modify the parser code? Since I’m not familiar with C++, I would greatly appreciate your assistance in adjusting the parser to handle our specific model’s output format.

My parser script is:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <unordered_map>
#include "nvdsinfer_custom_impl.h"

static const int NUM_CLASSES_YOLO = 5;

float clamp(const float val, const float minVal, const float maxVal)
{
    assert(minVal <= maxVal);
    return std::min(maxVal, std::max(minVal, val));
}

static NvDsInferParseObjectInfo convertBBoxYoloV8(const float& bx, const float& by, const float& bw,
                                                  const float& bh, const int& stride, const uint& netW,
                                                  const uint& netH)
{
    NvDsInferParseObjectInfo b;
    // Restore coordinates to network input resolution
    float xCenter = bx * stride;
    float yCenter = by * stride;
    float x0 = xCenter - bw / 2;
    float y0 = yCenter - bh / 2;
    float x1 = x0 + bw;
    float y1 = y0 + bh;

    x0 = clamp(x0, 0, netW);
    y0 = clamp(y0, 0, netH);
    x1 = clamp(x1, 0, netW);
    y1 = clamp(y1, 0, netH);

    b.left = x0;
    b.width = clamp(x1 - x0, 0, netW);
    b.top = y0;
    b.height = clamp(y1 - y0, 0, netH);

    return b;
}

static void addBBoxProposalYoloV8(const float bx, const float by, const float bw, const float bh,
                                  const uint stride, const uint& netW, const uint& netH, const int maxIndex,
                                  const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo)
{
    NvDsInferParseObjectInfo bbi = convertBBoxYoloV8(bx, by, bw, bh, stride, netW, netH);
    if (bbi.width < 1 || bbi.height < 1) return;

    bbi.detectionConfidence = maxProb;
    bbi.classId = maxIndex;
    binfo.push_back(bbi);
}

static bool NvDsInferParseYoloV8(
    std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
    NvDsInferNetworkInfo const& networkInfo,
    NvDsInferParseDetectionParams const& detectionParams,
    std::vector<NvDsInferParseObjectInfo>& objectList)
{
    if (outputLayersInfo.empty()) {
        std::cerr << "Could not find output layer in bbox parsing" << std::endl;
        return false;
    }
    const NvDsInferLayerInfo &layer = outputLayersInfo[0];

    if (NUM_CLASSES_YOLO != detectionParams.numClassesConfigured)
    {
        std::cerr << "WARNING: Num classes mismatch. Configured:"
                  << detectionParams.numClassesConfigured
                  << ", detected by network: " << NUM_CLASSES_YOLO << std::endl;
    }

    std::vector<NvDsInferParseObjectInfo> objects;

    float* data = (float*)layer.buffer;
    const int dimensions = layer.inferDims.d[1];
    int rows = layer.inferDims.numElements / layer.inferDims.d[1];

    for (int i = 0; i < rows; ++i) {
        // 9 = x, y, w, h, score0 ... score4 for this 5-class model
        float bx = data[0];
        float by = data[1];
        float bw = data[2];
        float bh = data[3];
        float* classes_scores = data + 4;

        float maxScore = 0;
        int index = 0;
        for (int j = 0; j < NUM_CLASSES_YOLO; j++) {
            if (*classes_scores > maxScore) {
                index = j;
                maxScore = *classes_scores;
            }
            classes_scores++;
        }

        // Important: check the confidence threshold here
        if (maxScore > detectionParams.perClassPreclusterThreshold[index]) {
            int maxIndex = index;
            data += dimensions;
            // Use maxScore as the confidence instead of always using 1.0
            addBBoxProposalYoloV8(bx, by, bw, bh, 1, networkInfo.width, networkInfo.height, maxIndex, maxScore, objects);
        } else {
            data += dimensions;
        }
    }
    objectList = objects;
    return true;
}

extern "C" bool NvDsInferParseCustomYoloV8(
    std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
    NvDsInferNetworkInfo const& networkInfo,
    NvDsInferParseDetectionParams const& detectionParams,
    std::vector<NvDsInferParseObjectInfo>& objectList)
{
    return NvDsInferParseYoloV8(
        outputLayersInfo, networkInfo, detectionParams, objectList);
}

/* Check that the custom function has been defined correctly */
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV8);

Refer to this transpose script
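For readers hitting the same layout mismatch, below is a minimal sketch (not the linked transpose script itself) of a parse function for a channels-first output such as 1x84x8400. It reuses addBBoxProposalYoloV8 and the NvDsInfer types from the parser above; the function name NvDsInferParseYoloV8ChannelsFirst and the numChannels/numAnchors variables are illustrative names introduced here, and it assumes nvinfer reports inferDims as [channels, anchors] with the batch dimension already stripped.

// Hedged sketch: parse a channels-first YOLOv8 output laid out as
// [numChannels, numAnchors] (e.g. 84 x 8400) by indexing it directly,
// instead of assuming the row-major [numAnchors, numChannels] layout
// used by the loop above. Reuses addBBoxProposalYoloV8 from this file.
static bool NvDsInferParseYoloV8ChannelsFirst(
    std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
    NvDsInferNetworkInfo const& networkInfo,
    NvDsInferParseDetectionParams const& detectionParams,
    std::vector<NvDsInferParseObjectInfo>& objectList)
{
    if (outputLayersInfo.empty()) return false;
    const NvDsInferLayerInfo& layer = outputLayersInfo[0];

    const int numChannels = layer.inferDims.d[0]; // 4 box values + class scores
    const int numAnchors = layer.inferDims.d[1];  // e.g. 8400 candidate boxes
    const float* data = (const float*)layer.buffer;

    std::vector<NvDsInferParseObjectInfo> objects;
    for (int i = 0; i < numAnchors; ++i) {
        // In a [channels, anchors] layout, element (c, i) is data[c * numAnchors + i].
        float bx = data[0 * numAnchors + i];
        float by = data[1 * numAnchors + i];
        float bw = data[2 * numAnchors + i];
        float bh = data[3 * numAnchors + i];

        float maxScore = 0;
        int maxIndex = 0;
        for (int j = 0; j < numChannels - 4; ++j) {
            float score = data[(4 + j) * numAnchors + i];
            if (score > maxScore) {
                maxScore = score;
                maxIndex = j;
            }
        }

        if (maxScore > detectionParams.perClassPreclusterThreshold[maxIndex]) {
            addBBoxProposalYoloV8(bx, by, bw, bh, 1, networkInfo.width,
                                  networkInfo.height, maxIndex, maxScore, objects);
        }
    }
    objectList = objects;
    return true;
}

The only substantive change relative to the loop above is the indexing: element (c, i) of a [channels, anchors] tensor is read as data[c * numAnchors + i] instead of data[i * dimensions + c].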

It's working. Thank you!

Do you have any suggestions to increase the confidence score?