Showing detection results

• Hardware Platform: GPU
• DeepStream Version: 5.0
• TensorRT Version: 7.1.3
• NVIDIA GPU Driver Version: 450.119.04
• Issue Type: questions

I have a problem running an app for face detection. I was able to convert the net (*.wts → *.onnx → *.engine) with this repo: GitHub - nghiapq77/face-recognition-cpp-tensorrt: Face Recognition with RetinaFace and ArcFace. When I run the app, I can't see the display and I don't get any callback info in the terminal. Here is my code:

// main.cpp
#include "gstnvdsmeta.h"
#include <glib.h>
#include <gst/gst.h>
#include <stdio.h>
#include <string>
#include <iostream>

#define COUT_RESET   "\033[0m"
#define COUT_BLACK   "\033[30m"      /* Black */
#define COUT_RED     "\033[31m"      /* Red */
#define COUT_GREEN   "\033[32m"      /* Green */
#define COUT_YELLOW  "\033[33m"      /* Yellow */
#define COUT_BLUE    "\033[34m"      /* Blue */
#define COUT_MAGENTA "\033[35m"      /* Magenta */
#define COUT_CYAN    "\033[36m"      /* Cyan */
#define COUT_WHITE   "\033[37m"      /* White */

#define HVG_DEBUG(msg) std::cout << COUT_BLUE << "[DEBUG] " << COUT_RESET << msg << std::endl;
#define HVG_ERROR(msg) std::cerr << COUT_RED << "[ERROR] " << COUT_RESET << msg << std::endl;
#define HVG_INFO(msg) std::cout << COUT_GREEN << "[INFO]  " << COUT_RESET << msg << std::endl;
#define HVG_WARNING(msg) std::cout << COUT_YELLOW << "[WARNING]  " << COUT_RESET << msg << std::endl;

#define MAKE_ELEMENT(element, gs_name, name) \
element = gst_element_factory_make(gs_name, name); \
if (!element)\
{\
    HVG_ERROR("Element '" << name << "' was not created!")\
    return EXIT_FAILURE;\
}\
else\
{\
    HVG_INFO("Element '" << name << "' was created successfully!") \
}

#define MAKE_PIPE(pipe, name) \
pipe = gst_pipeline_new(name); \
if (!pipe)\
{\
    HVG_ERROR("Pipeline '" << name << "' was not created!")\
    return EXIT_FAILURE;\
}\
else\
{\
    HVG_INFO("Pipeline '" << name << "' was created successfully!") \
}


GST_DEBUG_CATEGORY(NVDS_APP);

#define MAX_DISPLAY_LEN 64

#define PGIE_CLASS_ID_VEHICLE 0
#define PGIE_CLASS_ID_PERSON 2

/* The muxer output resolution must be set if the input streams will be of
 * different resolution. The muxer will scale all the input frames to this
 * resolution. */
#define MUXER_OUTPUT_WIDTH 1920
#define MUXER_OUTPUT_HEIGHT 1080

/* Muxer batch formation timeout, for e.g. 40 millisec. Should ideally be set
 * based on the fastest source's framerate. */
#define MUXER_BATCH_TIMEOUT_USEC 40000

gint frame_number = 0;
gchar pgie_classes_str[4][32] = {"Vehicle", "TwoWheeler", "Person", "Roadsign"};

/* osd_sink_pad_buffer_probe  will extract metadata received on OSD sink pad
 * and update params for drawing rectangle, object information etc. */

static GstPadProbeReturn osd_sink_pad_buffer_probe(GstPad* pad, GstPadProbeInfo* info, gpointer u_data)
{
    HVG_DEBUG("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

    GstBuffer* buf = (GstBuffer*)info->data;
    guint num_rects = 0;
    NvDsObjectMeta* obj_meta = nullptr;
    guint vehicle_count = 0;
    guint person_count = 0;
    NvDsMetaList* l_frame = nullptr;
    NvDsMetaList* l_obj = nullptr;
    NvDsDisplayMeta* display_meta = nullptr;

    NvDsBatchMeta* batch_meta = gst_buffer_get_nvds_batch_meta(buf);

    for (l_frame = batch_meta->frame_meta_list; l_frame != nullptr; l_frame = l_frame->next)
    {
        NvDsFrameMeta* frame_meta = (NvDsFrameMeta*)(l_frame->data);
        int offset = 0;
        for (l_obj = frame_meta->obj_meta_list; l_obj != nullptr; l_obj = l_obj->next)
        {
            obj_meta = (NvDsObjectMeta*)(l_obj->data);
            if (obj_meta->class_id == PGIE_CLASS_ID_VEHICLE)
            {
                vehicle_count++;
                num_rects++;
            }
            if (obj_meta->class_id == PGIE_CLASS_ID_PERSON)
            {
                person_count++;
                num_rects++;
            }
        }
        display_meta = nvds_acquire_display_meta_from_pool(batch_meta);
        NvOSD_TextParams* txt_params = &display_meta->text_params[0];
        display_meta->num_labels = 1;
        txt_params->display_text = (char*)g_malloc0(MAX_DISPLAY_LEN);
        offset = snprintf(txt_params->display_text, MAX_DISPLAY_LEN, "Person = %d ", person_count);
        /* Append after the first string, bounded by the remaining space */
        offset += snprintf(txt_params->display_text + offset, MAX_DISPLAY_LEN - offset, "Vehicle = %d ", vehicle_count);

        /* Now set the offsets where the string should appear */
        txt_params->x_offset = 10;
        txt_params->y_offset = 12;

        /* Font , font-color and font-size */
        txt_params->font_params.font_name = "Serif";
        txt_params->font_params.font_size = 10;
        txt_params->font_params.font_color.red = 1.0;
        txt_params->font_params.font_color.green = 1.0;
        txt_params->font_params.font_color.blue = 1.0;
        txt_params->font_params.font_color.alpha = 1.0;

        /* Text background color */
        txt_params->set_bg_clr = 1;
        txt_params->text_bg_clr.red = 0.0;
        txt_params->text_bg_clr.green = 0.0;
        txt_params->text_bg_clr.blue = 0.0;
        txt_params->text_bg_clr.alpha = 1.0;

        nvds_add_display_meta_to_frame(frame_meta, display_meta);
    }

    g_print("Frame Number = %d Number of objects = %d "
            "Vehicle Count = %d Person Count = %d\n",
            frame_number,
            num_rects,
            vehicle_count,
            person_count);
    frame_number++;
    return GST_PAD_PROBE_OK;
}

static gboolean bus_call(GstBus* bus, GstMessage* msg, gpointer data)
{
    GMainLoop* loop = (GMainLoop*)data;
    switch (GST_MESSAGE_TYPE(msg))
    {
        case GST_MESSAGE_EOS:
            g_print("End of stream\n");
            g_main_loop_quit(loop);
            break;
        case GST_MESSAGE_ERROR:
        {
            gchar* debug;
            GError* error;
            gst_message_parse_error(msg, &error, &debug);
            g_printerr("ERROR from element %s: %s\n", GST_OBJECT_NAME(msg->src), error->message);
            if (debug)
                g_printerr("Error details: %s\n", debug);
            g_free(debug);
            g_error_free(error);
            g_main_loop_quit(loop);
            break;
        }
        default:
            break;
    }
    return TRUE;
}

int main(int argc, char* argv[])
{
    const std::string FILE_H264 = "/data/deepstream/data/face_video.mp4";
    const std::string CONFIG_INFER = "/data/deepstream/face_recognition_skytrack/build_docker/config_temp.txt";
    const int WIDTH  = 1280;
    const int HEIGHT = 720;

    /* Standard GStreamer initialization */
    gst_init(&argc, &argv);
    GMainLoop* loop = g_main_loop_new(nullptr, FALSE);

    /* Create gstreamer elements */
    /* Create Pipeline element that will form a connection of other elements */
    GstElement *pipeline = nullptr;
    MAKE_PIPE(pipeline, "dstest1-pipeline")

    /* Source element for reading from the file */
    GstElement *source = nullptr;
    MAKE_ELEMENT(source, "filesrc", "file-source")

    /* Since the data format in the input file is elementary h264 stream,
     * we need a h264parser */
    GstElement *h264parser = nullptr;
    MAKE_ELEMENT(h264parser, "h264parse", "file-parser")

    /* Use nvv4l2decoder for hardware accelerated decode on GPU */
    GstElement *decoder = nullptr;
    MAKE_ELEMENT(decoder, "nvv4l2decoder", "nvv4l2-decoder")

    /* Create nvstreammux instance to form batches from one or more sources. */
    GstElement *streammux = nullptr;
    MAKE_ELEMENT(streammux, "nvstreammux", "stream-muxer")

    /* Use nvinfer to run inferencing on decoder's output,
     * behaviour of inferencing is set through config file */
    GstElement *pgie = nullptr;
    MAKE_ELEMENT(pgie, "nvinfer", "primary-nvinference-engine")

    /* Use convertor to convert from NV12 to RGBA as required by nvosd */
    GstElement *nvvidconv = nullptr;
    MAKE_ELEMENT(nvvidconv, "nvvideoconvert", "nvvideo-converter")

    /* Create OSD to draw on the converted RGBA buffer */
    GstElement *nvosd = nullptr;
    MAKE_ELEMENT(nvosd, "nvdsosd", "nv-onscreendisplay")

    /* Finally render the osd output */
    GstElement *nvsink = nullptr;
    MAKE_ELEMENT(nvsink, "nveglglessink", "nvvideo-renderer")



    /* we add a message handler */
    GstBus* bus = gst_pipeline_get_bus(GST_PIPELINE(pipeline));
    guint bus_watch_id = gst_bus_add_watch(bus, bus_call, loop);
    gst_object_unref(bus);

    /* Set up the pipeline */
    /* we add all elements into the pipeline */
    gst_bin_add_many(GST_BIN(pipeline), source, h264parser, decoder, streammux, pgie, nvvidconv, nvosd, nvsink, nullptr);

    gchar pad_name_src[16] = "src";
    GstPad* srcpad = gst_element_get_static_pad(decoder, pad_name_src);
    if (!srcpad)
    {
        HVG_ERROR("Decoder request src pad failed. Exiting.");
        return -1;
    }

    gchar pad_name_sink[16] = "sink_0";
    GstPad* sinkpad = gst_element_get_request_pad(streammux, pad_name_sink);
    if (!sinkpad)
    {
        HVG_ERROR("Streammux request sink pad failed. Exiting.");
        return -1;
    }

    if (gst_pad_link(srcpad, sinkpad) != GST_PAD_LINK_OK)
    {
        HVG_ERROR("Failed to link decoder to stream muxer. Exiting.");
        return -1;
    }

    gst_object_unref(sinkpad);
    gst_object_unref(srcpad);

    /* we link the elements together */
    /* file-source -> h264-parser -> nvh264-decoder ->
     * nvinfer -> nvvidconv -> nvosd -> video-renderer */

    if (!gst_element_link_many(source, h264parser, decoder, nullptr))
    {
        HVG_ERROR("Elements could not be linked: 1. Exiting.");
        return EXIT_FAILURE;
    }

    if (!gst_element_link_many(streammux, pgie, nvvidconv, nvosd, nvsink, nullptr))
    {
        HVG_ERROR("Elements could not be linked: 2. Exiting.");
        return EXIT_FAILURE;
    }

    /* we set the input filename to the source element */
    g_object_set(G_OBJECT(source), "location", FILE_H264.c_str(), nullptr);

    g_object_set(G_OBJECT(streammux), "batch-size", 1, nullptr);

    g_object_set(G_OBJECT(streammux),
                 "width", WIDTH,
                 "height", HEIGHT,
                 "batched-push-timeout", MUXER_BATCH_TIMEOUT_USEC,
                 nullptr);

    g_object_set(G_OBJECT(nvsink), "sync", FALSE, nullptr);

    /* Set all the necessary properties of the nvinfer element, the necessary ones are : */
    g_object_set(G_OBJECT(pgie), "config-file-path", CONFIG_INFER.c_str(), nullptr);

    /* Lets add probe to get informed of the meta data generated, we add probe to
     * the sink pad of the osd element, since by that time, the buffer would have
     * had got all the metadata. */
    GstPad* osd_sink_pad = gst_element_get_static_pad(nvosd, "sink");
    if (!osd_sink_pad)
    {
        HVG_WARNING("Unable to get sink pad");
    }
    else
    {
        gst_pad_add_probe(osd_sink_pad, GST_PAD_PROBE_TYPE_BUFFER, osd_sink_pad_buffer_probe, nullptr, nullptr);
        gst_object_unref(osd_sink_pad); /* only unref the pad if it was actually obtained */
    }

    /* Set the pipeline to "playing" state */
    HVG_INFO("Now playing: " << FILE_H264.c_str());
    gst_element_set_state(pipeline, GST_STATE_PLAYING);

    /* Wait till pipeline encounters an error or EOS */
    HVG_INFO("Running...");
    g_main_loop_run(loop);

    /* Out of the main loop, clean up nicely */
    HVG_INFO("Returned, stopping playback");
    gst_element_set_state(pipeline, GST_STATE_NULL);
    HVG_INFO("Deleting pipeline");
    gst_object_unref(GST_OBJECT(pipeline));
    g_source_remove(bus_watch_id);
    g_main_loop_unref(loop);
    return 0;
}

# config_temp.txt
[property]
gpu-id=0
model-engine-file=model_info/retina-mobile0.25-288x320.engine
batch-size=1
net-scale-factor=1.0
offsets=104.0;117.0;123.0
force-implicit-batch-dim=1
model-color-format=1
## 0=FP32, 1=INT8, 2=FP16 mode
network-mode=2
#process-mode=1
# workaround for getting topk detection
num-detected-classes=4
# number of consecutive batches to skip for inference
interval=0
# custom detection parser
parse-bbox-func-name=NvDsInferParseCustomRetinaFace
custom-lib-path=../../retinaface_parser/build_docker/libretinaface_parser.so
gie-unique-id=1

[class-attrs-all]
# bbox threshold
pre-cluster-threshold=0.6
# nms threshold
post-cluster-threshold=0.4
#include "nvdsinfer_custom_impl.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <iostream>
#include <gst/gst.h>
#include <fstream>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define CLIP(a, min, max) (MAX(MIN(a, max), min))

extern "C" bool NvDsInferParseCustomRetinaFace(std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
                                               NvDsInferNetworkInfo const& networkInfo,
                                               NvDsInferParseDetectionParams const& detectionParams,
                                               std::vector<NvDsInferObjectDetectionInfo>& objectList);
struct Bbox
{
    int x1, y1, x2, y2;
    float score;
};
struct anchorBox
{
    float cx;
    float cy;
    float sx;
    float sy;
};
void postprocessing(float* bbox,
                    float* conf,
                    float bbox_threshold,
                    float nms_threshold,
                    unsigned int topk,
                    int width,
                    int height,
                    std::vector<NvDsInferObjectDetectionInfo>& objectList);
void create_anchor_retinaface(std::vector<anchorBox>& anchor, int w, int h);
bool cmp(NvDsInferObjectDetectionInfo a, NvDsInferObjectDetectionInfo b);
void nms(std::vector<NvDsInferObjectDetectionInfo>& input_boxes, float NMS_THRESH);

void postprocessing(float* bbox,
                    float* conf,
                    float bbox_threshold,
                    float nms_threshold,
                    unsigned int topk,
                    int width,
                    int height,
                    std::vector<NvDsInferObjectDetectionInfo>& objectList)
{
    std::vector<anchorBox> anchor;
    create_anchor_retinaface(anchor, width, height);

    for (unsigned int i = 0; i < anchor.size(); ++i)
    {
        // conf holds two scores per anchor (background, face); conf + 1 is the face score
        if (*(conf + 1) > bbox_threshold)
        {
            anchorBox tmp = anchor[i];
            anchorBox tmp1;
            NvDsInferObjectDetectionInfo result;
            result.classId = 0;

            // decode bbox
            tmp1.cx = tmp.cx + *bbox * 0.1f * tmp.sx;
            tmp1.cy = tmp.cy + *(bbox + 1) * 0.1f * tmp.sy;
            tmp1.sx = tmp.sx * exp(*(bbox + 2) * 0.2f);
            tmp1.sy = tmp.sy * exp(*(bbox + 3) * 0.2f);

            result.left = (tmp1.cx - tmp1.sx / 2) * width;
            result.top = (tmp1.cy - tmp1.sy / 2) * height;
            result.width = (tmp1.cx + tmp1.sx / 2) * width - result.left;
            result.height = (tmp1.cy + tmp1.sy / 2) * height - result.top;

            // Clip object box coordinates to network resolution
            result.left = CLIP(result.left, 0, width - 1);
            result.top = CLIP(result.top, 0, height - 1);
            result.width = CLIP(result.width, 0, width - 1);
            result.height = CLIP(result.height, 0, height - 1);

            result.detectionConfidence = *(conf + 1);
            objectList.push_back(result);
        }
        bbox += 4;
        conf += 2;
    }
    std::sort(objectList.begin(), objectList.end(), cmp);
    nms(objectList, nms_threshold);
    if (objectList.size() > topk)
        objectList.resize(topk);
}

void create_anchor_retinaface(std::vector<anchorBox>& anchor, int w, int h)
{
    anchor.clear();
    std::vector<std::vector<int>> feature_map(3), min_sizes(3);
    float steps[] = {8, 16, 32};
    for (unsigned int i = 0; i < feature_map.size(); ++i)
    {
        feature_map[i].push_back(ceil(h / steps[i]));
        feature_map[i].push_back(ceil(w / steps[i]));
    }
    std::vector<int> minsize1 = {10, 20};
    min_sizes[0] = minsize1;
    std::vector<int> minsize2 = {32, 64};
    min_sizes[1] = minsize2;
    std::vector<int> minsize3 = {128, 256};
    min_sizes[2] = minsize3;

    for (unsigned int k = 0; k < feature_map.size(); ++k)
    {
        std::vector<int> min_size = min_sizes[k];
        for (int i = 0; i < feature_map[k][0]; ++i)
        {
            for (int j = 0; j < feature_map[k][1]; ++j)
            {
                for (unsigned int l = 0; l < min_size.size(); ++l)
                {
                    float s_kx = static_cast<float>(min_size[l]) * 1.0f / static_cast<float>(w);
                    float s_ky = static_cast<float>(min_size[l]) * 1.0f / static_cast<float>(h);

                    float cx = (static_cast<float>(j) + 0.5f) * steps[k] / static_cast<float>(w);
                    float cy = (static_cast<float>(i) + 0.5f) * steps[k] / static_cast<float>(h);

                    anchorBox axil = {cx, cy, s_kx, s_ky};
                    anchor.push_back(axil);
                }
            }
        }
    }
}

bool cmp(NvDsInferObjectDetectionInfo a, NvDsInferObjectDetectionInfo b)
{
    return a.detectionConfidence > b.detectionConfidence;
}

void nms(std::vector<NvDsInferObjectDetectionInfo>& input_boxes, float NMS_THRESH)
{
    std::vector<float> vArea(input_boxes.size());
    for (int i = 0; i < int(input_boxes.size()); ++i)
    {
        vArea[i] = (input_boxes.at(i).width + 1) * (input_boxes.at(i).height + 1);
    }
    for (int i = 0; i < int(input_boxes.size()); ++i)
    {
        for (int j = i + 1; j < int(input_boxes.size());)
        {
            float xx1 = std::max(input_boxes[i].left, input_boxes[j].left);
            float yy1 = std::max(input_boxes[i].top, input_boxes[j].top);
            float xx2 =
                std::min(input_boxes[i].left + input_boxes[i].width, input_boxes[j].left + input_boxes[j].width);
            float yy2 =
                std::min(input_boxes[i].top + input_boxes[i].height, input_boxes[j].top + input_boxes[j].height);
            float w = std::max(float(0), xx2 - xx1 + 1);
            float h = std::max(float(0), yy2 - yy1 + 1);
            float inter = w * h;
            float ovr = inter / (vArea[i] + vArea[j] - inter);
            if (ovr >= NMS_THRESH)
            {
                input_boxes.erase(input_boxes.begin() + j);
                vArea.erase(vArea.begin() + j);
            }
            else
            {
                j++;
            }
        }
    }
}
bool NvDsInferParseCustomRetinaFace(std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
                                    NvDsInferNetworkInfo const& networkInfo,
                                    NvDsInferParseDetectionParams const& detectionParams,
                                    std::vector<NvDsInferObjectDetectionInfo>& objectList)
{
    // Get output indexes
    static int bboxLayerIndex = -1;
    static int confLayerIndex = -1;
    for (unsigned int i = 0; i < outputLayersInfo.size(); i++)
    {
        if (strcmp(outputLayersInfo[i].layerName, "output_det0") == 0)
        {
            bboxLayerIndex = static_cast<int>(i);
        }
        else if (strcmp(outputLayersInfo[i].layerName, "output_det1") == 0)
        {
            confLayerIndex = static_cast<int>(i);
        }
    }
    if ((bboxLayerIndex == -1) || (confLayerIndex == -1))
    {
        std::cerr << "Could not find output layer buffer while parsing" << std::endl;
        return false;
    }

    // Host pointers to the raw output buffers
    auto* bbox = (float*)outputLayersInfo[bboxLayerIndex].buffer;
    auto* conf = (float*)outputLayersInfo[confLayerIndex].buffer;

    // Get thresholds and topk value
    const float bbox_threshold = detectionParams.perClassPreclusterThreshold[0];
    const float nms_threshold = detectionParams.perClassPostclusterThreshold[0];
    // num-detected-classes from the config is repurposed as the top-k limit
    // (see the "workaround for getting topk detection" comment in config_temp.txt)
    const unsigned int topk = detectionParams.numClassesConfigured;

    // Do post processing
    postprocessing(bbox, conf, bbox_threshold, nms_threshold, topk, networkInfo.width, networkInfo.height, objectList);
    return true;
}

/* Check that the custom function has been defined correctly */
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomRetinaFace);

Have you debugged your code? Can you get output in NvDsInferParseCustomRetinaFace()?

Neither the model nor the application is provided by Nvidia. It is not reasonable for us to debug your code for you.

Thank you for your answer! I have debugged only with the help of std::cout. For example:

...
bool NvDsInferParseCustomRetinaFace(std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
                                    NvDsInferNetworkInfo const& networkInfo,
                                    NvDsInferParseDetectionParams const& detectionParams,
                                    std::vector<NvDsInferObjectDetectionInfo>& objectList)
{
    // [DEBUG]
    static int numFrame = 0;
    std::cout << "[DEBUG] NvDsInferParseCustomRetinaFace : " << ++numFrame << std::endl;
    // [DEBUG]

    // Get output indexes
    static int bboxLayerIndex = -1;
    static int confLayerIndex = -1;
    for (unsigned int i = 0; i < outputLayersInfo.size(); i++)
    {
...

But I did not get any logs from NvDsInferParseCustomRetinaFace in the terminal.
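One way to rule out stdout buffering or redirection is to log from the parser to a file instead of std::cout. A minimal sketch, assuming only the C++ standard library (the path /tmp/retinaface_parser.log is an arbitrary choice):

// Append-mode file logger; if this file never appears,
// NvDsInferParseCustomRetinaFace is not being called at all.
#include <fstream>
#include <string>

static void parser_debug_log(const std::string& msg)
{
    std::ofstream log("/tmp/retinaface_parser.log", std::ios::app);
    log << msg << std::endl;
}

If the file stays empty as well, the function is never being called, which points at the custom-lib-path / parse-bbox-func-name wiring or at nvinfer never running, rather than at the logging itself.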

I am also sending the models for this app:

labels.txt (5 Bytes)
mobilenet0.25_Final.pth (1.7 MB)
retina-mobile0.25-288x320.onnx (1.6 MB)
retina-mobile0.25-288x320.engine (1.5 MB)

I converted: mobilenet0.25_Final.pth → retina-mobile0.25-288x320.onnx → retina-mobile0.25-288x320.engine

Do you know the input and output layer names of the model?

config_model.txt (705 Bytes)
I’ve tried the attached nvinfer config file with deepstream-app; there is output from the model in NvDsInferParseCustomRetinaFace().

Yes, I do. The output layers are output_det0 and output_det1.
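To double-check the names actually baked into the serialized engine, the engine can be deserialized and its bindings listed. A minimal sketch, assuming the TensorRT 7 C++ API and the engine path from config_temp.txt:

// List the binding (layer) names of a serialized TensorRT engine (TensorRT 7.x API).
#include "NvInfer.h"
#include <fstream>
#include <iostream>
#include <iterator>
#include <vector>

class PrintLogger : public nvinfer1::ILogger
{
    void log(Severity severity, const char* msg) override
    {
        if (severity <= Severity::kWARNING)
            std::cerr << msg << std::endl;
    }
};

int main()
{
    std::ifstream file("model_info/retina-mobile0.25-288x320.engine", std::ios::binary);
    std::vector<char> blob((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());

    PrintLogger logger;
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(blob.data(), blob.size(), nullptr);
    if (!engine)
    {
        std::cerr << "Failed to deserialize engine" << std::endl;
        return 1;
    }
    for (int i = 0; i < engine->getNbBindings(); ++i)
    {
        std::cout << (engine->bindingIsInput(i) ? "input : " : "output: ")
                  << engine->getBindingName(i) << std::endl;
    }
    engine->destroy();
    runtime->destroy();
    return 0;
}

The printed output names must match the strings compared in NvDsInferParseCustomRetinaFace (output_det0, output_det1); a mismatch makes the parser return false on every frame.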

How can I get outputs from NvDsInferParseCustomRetinaFace with my code? Can I visualize the detection results on the display?

You have already output the bboxes. Please refer to the DeepStream sample code; the nvdsosd plugin will draw the bboxes by default.
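For a quick check alongside the on-screen display, the object loop already present in osd_sink_pad_buffer_probe can print each parsed bbox. A sketch of a helper, using the rect_params field that nvdsosd draws from:

// Sketch: call this on obj_meta inside the existing l_obj loop of
// osd_sink_pad_buffer_probe to print the bbox nvdsosd will draw.
static void print_obj_bbox(NvDsObjectMeta* obj_meta)
{
    const NvOSD_RectParams& rect = obj_meta->rect_params;
    g_print("class %d conf %.2f bbox (left=%.0f top=%.0f w=%.0f h=%.0f)\n",
            obj_meta->class_id,
            obj_meta->confidence,
            rect.left,
            rect.top,
            rect.width,
            rect.height);
}

If these lines print sensible coordinates but nothing appears on screen, the metadata path is fine and the renderer side (nveglglessink) is the remaining suspect.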
