I have double-checked my box-parsing function, which mainly does NMS, but the outputs fetched from the buffer seem to be incorrect. I did not use any parser to create the TRT engine; I built it with the tensorrtx/yolov5 repo instead. Alternatively, you can give me a direction to move in.
@CJR Below is the only config file in the DeepStream app for yolov5 detector ( added cluster-mode=4 ) which did not make any difference. I am using exactly the same method in Bbox parsing to parse the output of TRT engine as I used for TensorRT inference :
[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
workspace-size=1024
model-engine-file=./models/yolov5s.engine
labelfile-path=./labels.txt
batch-size=1
process-mode=1
model-color-format=1
network-mode=2
num-detected-classes=80
interval=0
gie-unique-id=1
is-classifier=0
cluster-mode=4
output-blob-names=prob
parse-bbox-func-name=NvDsInferParseCustomFD
custom-lib-path=./nvdsinfer_custom_parser/libyoloplugin.so
[class-attrs-all]
pre-cluster-threshold=0.3
roi-top-offset=0
roi-bottom-offset=0
detected-min-w=0
detected-min-h=0
detected-max-w=0
detected-max-h=0
This is the parsing function. For NMS to work, the buffer must hold correct values, and I don't think it currently does: the number of bboxes detected before NMS is way too high.
#include <algorithm>
#include <cstring>
#include <iostream>
#include <map>
#include "nvdsinfer_custom_impl.h"
#include <cassert>
// IoU threshold used by nms() to suppress overlapping boxes of the same class.
#define NMS_THRESH 0.5
// Minimum bbox_conf * cls_conf for a candidate box to survive filtering.
#define CONF_THRESH 0.4
// Floats per box: center_x, center_y, w, h.
static constexpr int LOCATIONS = 4;
// One decoded detection exactly as laid out in the raw "prob" output buffer.
// NOTE(review): nms() memcpy()s this straight out of the network output, so
// field order and size must match the engine's yololayer; class_id is kept as
// a float so the whole struct is a flat float array.
struct alignas(float) Detection{
//center_x center_y w h
float bbox[LOCATIONS];
float conf; // bbox_conf * cls_conf
float class_id;
};
// stuff we know about the network and the input/output blobs
// NOTE(review): 608x608 is assumed here — confirm it matches the resolution
// the engine was actually built with (yolov5 commonly uses 640x640).
static const int INPUT_H = 608;
static const int INPUT_W = 608;
// Upper bound on boxes the yololayer emits in one frame.
static const int MAX_OUTPUT_BBOX_COUNT = 1000;
// 1000 * 6 floats + 1 leading count element = 6001 floats, matching the
// engine's reported "prob 6001x1x1" output blob.
static const int OUTPUT_SIZE = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float) + 1; // we assume the yololayer outputs no more than 1000 boxes that conf >= 0.1
/* C-linkage to prevent name-mangling */
// DeepStream custom bbox-parser entry point; the name must match
// parse-bbox-func-name in the nvinfer config. Fills objectList from the raw
// output layer buffers and returns false when parsing fails.
extern "C"
bool NvDsInferParseCustomFD (
std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
NvDsInferNetworkInfo const &networkInfo,
NvDsInferParseDetectionParams const &detectionParams,
std::vector<NvDsInferObjectDetectionInfo> &objectList);
extern "C"
/* Intersection-over-union of two boxes given as (center_x, center_y, w, h).
 * Returns 0 when the boxes do not overlap at all. */
float iou(float lbox[4], float rbox[4]) {
    // Convert both center/size boxes to edge coordinates of the intersection.
    const float xLeft   = std::max(lbox[0] - lbox[2] / 2.f, rbox[0] - rbox[2] / 2.f);
    const float xRight  = std::min(lbox[0] + lbox[2] / 2.f, rbox[0] + rbox[2] / 2.f);
    const float yTop    = std::max(lbox[1] - lbox[3] / 2.f, rbox[1] - rbox[3] / 2.f);
    const float yBottom = std::min(lbox[1] + lbox[3] / 2.f, rbox[1] + rbox[3] / 2.f);
    // Degenerate intersection means the boxes are disjoint.
    if (yTop > yBottom || xLeft > xRight)
        return 0.0f;
    const float interArea = (xRight - xLeft) * (yBottom - yTop);
    // Union area = sum of both areas minus the overlap counted once.
    return interArea / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interArea);
}
/* std::sort comparator: orders detections by descending confidence so the
 * strongest box of each class is considered first during NMS. */
bool cmp(Detection& a, Detection& b) {
    const bool firstIsStronger = (a.conf > b.conf);
    return firstIsStronger;
}
void nms(std::vector<Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5) {
int det_size = sizeof(Detection) / sizeof(float);
std::cout << "detected before nms -> " << output[0] << std::endl;
std::map<float, std::vector<Detection>> m;
for (int i = 0; i < output[0] && i < 1000; i++) {
if (output[1 + det_size * i + 4] <= conf_thresh) continue;
Detection det;
memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Detection>());
m[det.class_id].push_back(det);
}
for (auto it = m.begin(); it != m.end(); it++) {
//std::cout << it->second[0].class_id << " --- " << std::endl;
auto& dets = it->second;
std::sort(dets.begin(), dets.end(), cmp);
for (size_t m = 0; m < dets.size(); ++m) {
auto& item = dets[m];
res.push_back(item);
for (size_t n = m + 1; n < dets.size(); ++n) {
if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
dets.erase(dets.begin()+n);
--n;
}
}
}
}
}
bool NvDsInferParseCustomFD (std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
NvDsInferNetworkInfo const &networkInfo,
NvDsInferParseDetectionParams const &detectionParams,
std::vector<NvDsInferObjectDetectionInfo> &objectList) {
static int decodeIndex = -1;
/* Find the decode layer */
if (decodeIndex == -1) {
for (unsigned int i = 0; i < outputLayersInfo.size(); i++) {
if (strcmp(outputLayersInfo[i].layerName, "prob") == 0) {
decodeIndex = i;
std::cout << "Found decode layer buffer while parsing" << decodeIndex << std::endl;
break;
}
std::cout << outputLayersInfo[i].layerName << " " << std::endl;
}
if (decodeIndex == -1) {
std::cerr << "Could not find decode layer buffer while parsing" << std::endl;
return false;
}
}
// Host memory for "decode"
float* out_decode = (float *) outputLayersInfo[decodeIndex].buffer;
const int batch_id = 0;
const int out_class_size = detectionParams.numClassesConfigured;
const float threshold = detectionParams.perClassThreshold[0];
std::cout<<"out_class_size: "<< out_class_size << std::endl;
std::cout<<"threshold: "<< threshold << std::endl;
std::vector<Detection> res;
nms(res, &out_decode[0], CONF_THRESH, NMS_THRESH);
std::cout << "after nms -> " << res.size() << std::endl;
for (size_t j = 0; j < res.size(); j++){
if (res[j].conf < 0.1) continue;
// std::cout << "class -> " << res[j].class_id;
// std::cout << " conf -> " << res[j].conf << std::endl;
NvDsInferObjectDetectionInfo object;
object.classId = res[j].class_id;
object.detectionConfidence = res[j].conf;
/* Clip object box co-ordinates to network resolution */
float left = res[j].bbox[0] - res[j].bbox[2]/2.f;
float top = res[j].bbox[1] - res[j].bbox[3]/2.f;
object.left = left;
object.top = top;
object.width = res[j].bbox[2];
object.height = res[j].bbox[3];
objectList.push_back(object);
}
return true;
}
/* Check that the custom function has been defined correctly */
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomFD);
Below is the terminal output:
root@91fbcc5a3a74:/opt/nvidia/deepstream/deepstream-5.0/sources/apps/deepstream-yolov5-img# ./deepstream-custom -c yolo_pgie_config.txt -i samples/bus.jpg
Now playing: yolo_pgie_config.txt
WARNING: ../nvdsinfer/nvdsinfer_func_utils.cpp:34 [TRT]: Current optimization profile is: 0. Please ensure there are no enqueued operations pending in this context prior to switching profiles
0:00:02.430127300 14944 0x563e38873360 INFO nvinfer gstnvinfer.cpp:602:gst_nvinfer_logger:<primary-nvinference-engine> NvDsInferContext[UID 1]: Info from NvDsInferContextImpl::deserializeEngineAndBackend() <nvdsinfer_context_impl.cpp:1577> [UID = 1]: deserialized trt engine from :/opt/nvidia/deepstream/deepstream-5.0/sources/apps/deepstream-yolov5-img/models/yolov5s.engine
INFO: ../nvdsinfer/nvdsinfer_model_builder.cpp:685 [Implicit Engine Info]: layers num: 2
0 INPUT kFLOAT data 3x608x608
1 OUTPUT kFLOAT prob 6001x1x1
0:00:02.430234100 14944 0x563e38873360 INFO nvinfer gstnvinfer.cpp:602:gst_nvinfer_logger:<primary-nvinference-engine> NvDsInferContext[UID 1]: Info from NvDsInferContextImpl::generateBackendContext() <nvdsinfer_context_impl.cpp:1681> [UID = 1]: Use deserialized engine model: /opt/nvidia/deepstream/deepstream-5.0/sources/apps/deepstream-yolov5-img/models/yolov5s.engine
0:00:02.431288464 14944 0x563e38873360 INFO nvinfer gstnvinfer_impl.cpp:311:notifyLoadModelStatus:<primary-nvinference-engine> [UID 1]: Load new model:yolo_pgie_config.txt sucessfully
Running...
Found decode layer buffer while parsing0
out_class_size: 80
threshold: 0.3
detected before nms -> 8295
after nms -> 0
End of stream
Returned, stopping playback
Deleting pipeline
Standalone TensorRT inference (for comparison):
detected before NMS => 84
Thanks. Let me know if you have any suggestions!!