Please provide complete information as applicable to your setup.
• Hardware Platform (Jetson / GPU) → dGPU aws T4
• DeepStream Version → 5.0
• TensorRT Version → 7
• NVIDIA GPU Driver Version (valid for GPU only) → 440.82
I am trying to write a custom bounding-box parser. I can locate the output layer with OUTPUT_BLOB_NAME = “prob” correctly, but when I run inference in TensorRT directly on an image I get 207 detections before NMS, while in the parser function below the reported output size for the same image is 28560, and I cannot see why. Any help is welcome. The parser function (it's not complete yet):
#include <algorithm>
#include <cstring>
#include <iostream>
#include "nvdsinfer_custom_impl.h"
#include <cassert>
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#define CLIP(a,min,max) (MAX(MIN(a, max), min))
#define DIVIDE_AND_ROUND_UP(a, b) ((a + b - 1) / b)
//static params gParams;
/* This is a sample bounding box parsing function for the sample ssd
*
* detector model provided with the SDK. */
/* C-linkage to prevent name-mangling */
extern "C"
bool NvDsInferParseCustomFD (
std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
NvDsInferNetworkInfo const &networkInfo,
NvDsInferParseDetectionParams const &detectionParams,
std::vector<NvDsInferObjectDetectionInfo> &objectList);
extern "C"
/* One decoded detection record: 15 contiguous floats, laid out exactly as
 * the network's decode buffer emits them — nms() memcpy()s straight into
 * this struct, so field order and sizes must not change. */
struct alignas(float) Detection{
float bbox[4]; //x1 y1 x2 y2 (corner coordinates)
float class_confidence;
float landmark[10]; // presumably 5 (x, y) facial landmark points — TODO confirm against the model
};
/* Comparator for std::sort: orders detections by descending confidence. */
bool cmp(Detection& a, Detection& b) {
    return b.class_confidence < a.class_confidence;
}
/* Intersection-over-union of two boxes given as [x1, y1, x2, y2].
 * Returns 0 when the boxes do not overlap; the small epsilon guards
 * against division by zero for degenerate boxes. */
float iou(float lbox[4], float rbox[4]) {
    // Corners of the intersection rectangle.
    float left   = std::max(lbox[0], rbox[0]);
    float right  = std::min(lbox[2], rbox[2]);
    float top    = std::max(lbox[1], rbox[1]);
    float bottom = std::min(lbox[3], rbox[3]);
    // Empty intersection -> no overlap.
    if (top > bottom || left > right)
        return 0.0f;
    float interArea = (right - left) * (bottom - top);
    float lArea = (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]);
    float rArea = (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]);
    return interArea / (lArea + rArea - interArea + 0.000001f);
}
/* Greedy non-maximum suppression over the raw decode buffer.
 *
 * Buffer layout (as consumed here): output[0] holds the candidate count;
 * candidate i is 15 consecutive floats starting at output[15*i + 1],
 * matching Detection (4 bbox + 1 confidence + 10 landmark floats).
 *
 * res         : receives surviving detections, ordered by descending confidence.
 * output      : host-side network output buffer.
 * nms_thresh  : IoU above which the lower-scoring box is suppressed.
 * conf_thresh : candidates at or below this confidence are skipped up front
 *               (previously hard-coded to 0.1, kept as the default).
 */
void nms(std::vector<Detection>& res, float *output, float nms_thresh = 0.4, float conf_thresh = 0.1) {
    std::vector<Detection> dets;
    // Copy candidates out of the flat buffer, dropping low-confidence ones.
    const int count = static_cast<int>(output[0]);
    for (int i = 0; i < count; i++) {
        if (output[15 * i + 1 + 4] <= conf_thresh) continue;
        Detection det;
        memcpy(&det, &output[15 * i + 1], sizeof(Detection));
        dets.push_back(det);
    }
    // Sort by confidence, best first.
    std::sort(dets.begin(), dets.end(), cmp);
    // Cap the candidate count so the O(n^2) suppression loop stays bounded.
    if (dets.size() > 5000) dets.erase(dets.begin() + 5000, dets.end());
    // Greedy NMS: keep the best remaining box, erase boxes overlapping it.
    for (size_t m = 0; m < dets.size(); ++m) {
        auto& item = dets[m];
        res.push_back(item);
        for (size_t n = m + 1; n < dets.size(); ++n) {
            if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
                dets.erase(dets.begin() + n);
                --n; // stay on this index after the erase shifted elements left
            }
        }
    }
}
/* Custom DeepStream bbox parsing function for the "prob" decode layer.
 *
 * Expects the layer buffer to start with the candidate count at buffer[0],
 * followed by 15 floats per candidate (see Detection / nms()).
 * NOTE(review): the count printed here (e.g. 28560) is the raw value of
 * buffer[0]; if it disagrees with standalone TensorRT inference on the same
 * image, verify that this layer's dims/size actually match the expected
 * count-prefixed layout — TODO confirm against the engine bindings.
 */
bool NvDsInferParseCustomFD (std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
    NvDsInferNetworkInfo const &networkInfo,
    NvDsInferParseDetectionParams const &detectionParams,
    std::vector<NvDsInferObjectDetectionInfo> &objectList) {
    /* Cache the decode-layer index across calls; layer order is stable for a
     * given engine. */
    static int decodeIndex = -1;
    if (decodeIndex == -1) {
        for (unsigned int i = 0; i < outputLayersInfo.size(); i++) {
            if (strcmp(outputLayersInfo[i].layerName, "prob") == 0) {
                decodeIndex = i;
                break;
            }
        }
        if (decodeIndex == -1) {
            std::cerr << "Could not find decode layer buffer while parsing" << std::endl;
            return false;
        }
    }
    /* Host memory for the "decode" layer output. */
    float* out_decode = (float *) outputLayersInfo[decodeIndex].buffer;
    std::cout << "detected before nms -> " << out_decode[0] << std::endl;
    /* Use the configured per-class threshold instead of a hard-coded 0.1. */
    const float threshold = detectionParams.perClassThreshold[0];
    std::vector<Detection> res;
    nms(res, out_decode);
    std::cout << "after nms -> " << res.size() << std::endl;
    for (size_t j = 0; j < res.size(); j++) {
        if (res[j].class_confidence < threshold) continue;
        NvDsInferObjectDetectionInfo object;
        object.classId = 0;
        object.detectionConfidence = res[j].class_confidence;
        /* Clip object box co-ordinates to network resolution.
         * NOTE(review): the scaling below assumes bbox values are normalized
         * to [0,1]; if the network emits pixel coordinates, drop it — TODO
         * confirm against the decode plugin. */
        object.left = CLIP(res[j].bbox[0] * networkInfo.width, 0, networkInfo.width - 1);
        object.top = CLIP(res[j].bbox[1] * networkInfo.height, 0, networkInfo.height - 1);
        object.width = CLIP((res[j].bbox[2] - res[j].bbox[0]) * networkInfo.width, 0, networkInfo.width - 1);
        object.height = CLIP((res[j].bbox[3] - res[j].bbox[1]) * networkInfo.height, 0, networkInfo.height - 1);
        objectList.push_back(object);
    }
    return true;
}
/* Check that the custom function has been defined correctly */
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomFD);
The output of the above code:
detected before nms -> 28560
after nms -> 2137
Which is quite strange. Obviously, I am not correctly retrieving the output from the outputLayerBuffer which is where I am stuck.
This is how I am doing inference in TensorRT, which shows 207 predictions before NMS (nms is defined above):
doInference(*context, data, prob, 1);
std::vector<Detection> res;
nms(res, prob);
/* Run one synchronous inference pass through the TensorRT engine.
 *
 * input  : host buffer of batchSize * 3 * INPUT_H * INPUT_W floats.
 * output : host buffer of batchSize * OUTPUT_SIZE floats, valid on return.
 *
 * NOTE(review): device buffers are allocated and freed on every call;
 * cudaMalloc is synchronous and slow — for a hot loop, allocate once and
 * reuse. Pinned host memory would also be needed for the async copies to
 * actually overlap.
 */
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();
    // Engine is expected to expose exactly one input and one output binding.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];
    // Binding indices are guaranteed to be < IEngine::getNbBindings().
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    // Create GPU buffers on device.
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
    // Create stream.
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    // DMA input to device, infer, DMA output back to host — all on `stream`.
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    // enqueue() returns false on failure; do not silently ignore it.
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        std::cerr << "TensorRT enqueue failed" << std::endl;
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    // Block until the D2H copy has landed in `output`; the return code was
    // previously unchecked, hiding any async kernel/copy failure.
    CHECK(cudaStreamSynchronize(stream));
    // Release stream and buffers.
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
std::cout << "detected before nms -> " << prob[0] << std::endl; // 207
std::cout << "after nms -> " << res.size() << std::endl; // 4