NvDsInferLayerInfo not giving expected no. of outputs

Please provide complete information as applicable to your setup.

• Hardware Platform (Jetson / GPU) → dGPU aws T4
• DeepStream Version → 5.0
• TensorRT Version → 7
• NVIDIA GPU Driver Version (valid for GPU only) → 440.82

I am writing a custom bbox parser. I am able to find the output layer with OUTPUT_BLOB_NAME = “prob” correctly, but when I run inference in TensorRT on an image I get 207 detections before NMS, while in the parser function given below the reported count is 28560 on the same image, and I am not able to see past that. Any help is welcome. The parser function (not complete yet):

#include <algorithm>
#include <cstring>
#include <iostream>
#include "nvdsinfer_custom_impl.h"
#include <cassert>

#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#define CLIP(a,min,max) (MAX(MIN(a, max), min))
#define DIVIDE_AND_ROUND_UP(a, b) ((a + b - 1) / b)

//static params gParams;

/* Custom bounding box parsing function, adapted from the sample SSD
 * detector parser provided with the SDK. */

/* C-linkage to prevent name-mangling */

extern "C"
bool NvDsInferParseCustomFD (
         std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
         NvDsInferNetworkInfo  const &networkInfo,
         NvDsInferParseDetectionParams const &detectionParams,
         std::vector<NvDsInferObjectDetectionInfo> &objectList);


extern "C"
struct alignas(float) Detection {
    float bbox[4];           // x1, y1, x2, y2
    float class_confidence;
    float landmark[10];
};

bool cmp(Detection& a, Detection& b) {
    return a.class_confidence > b.class_confidence;
}

float iou(float lbox[4], float rbox[4]) {
    float interBox[] = {
        std::max(lbox[0], rbox[0]), //left
        std::min(lbox[2], rbox[2]), //right
        std::max(lbox[1], rbox[1]), //top
        std::min(lbox[3], rbox[3]), //bottom
    };

    if(interBox[2] > interBox[3] || interBox[0] > interBox[1])
        return 0.0f;

    float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]);
    return interBoxS / ((lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) -interBoxS + 0.000001f);
}


// 'output' layout (written by the decode layer): output[0] = detection count,
// then 15 floats per detection: 4 bbox coords, 1 confidence, 10 landmark values.
void nms(std::vector<Detection>& res, float *output, float nms_thresh = 0.4) {
    std::vector<Detection> dets;

    // Copy detections with confidence above 0.1 into a local vector.
    for (int i = 0; i < output[0]; i++) {
        if (output[15 * i + 1 + 4] <= 0.1) continue;
        Detection det;
        memcpy(&det, &output[15 * i + 1], sizeof(Detection));
        dets.push_back(det);
    }

    // Sort detection results based on confidence scores. 
    std::sort(dets.begin(), dets.end(), cmp);
    if (dets.size() > 5000) dets.erase(dets.begin() + 5000, dets.end());

    // main nms operation
    for (size_t m = 0; m < dets.size(); ++m) {
        auto& item = dets[m];
        res.push_back(item);
        //std::cout << item.class_confidence << " bbox " << item.bbox[0] << ", " << item.bbox[1] << ", " << item.bbox[2] << ", " << item.bbox[3] << std::endl;
        for (size_t n = m + 1; n < dets.size(); ++n) {
            if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
                dets.erase(dets.begin()+n);
                --n;
            }
        }
    }
}

bool NvDsInferParseCustomFD (std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
                                   NvDsInferNetworkInfo  const &networkInfo,
                                   NvDsInferParseDetectionParams const &detectionParams,
                                   std::vector<NvDsInferObjectDetectionInfo> &objectList) {
    static int decodeIndex = -1;

    /* Find the decode layer */
    if (decodeIndex == -1) {
        for (unsigned int i = 0; i < outputLayersInfo.size(); i++) {
            if (strcmp(outputLayersInfo[i].layerName, "prob") == 0) {
                decodeIndex = i;
                break;
            }
        }
        if (decodeIndex == -1) {
            std::cerr << "Could not find decode layer buffer while parsing" << std::endl;
            return false;
        }
    }

    // Host memory for "decode"
    float* out_decode = (float *) outputLayersInfo[decodeIndex].buffer;
    std::cout << "detected before nms -> " << out_decode[0] << std::endl;
    const int batch_id = 0;
    const int out_class_size = detectionParams.numClassesConfigured;
    const float threshold = detectionParams.perClassThreshold[0];

    std::vector<Detection> res;
    nms(res, out_decode);
    std::cout << "detected before nms -> " << out_decode[0] << std::endl;
    std::cout << "after nms -> " << res.size() << std::endl;

    for (size_t j = 0; j < res.size(); j++){
        if (res[j].class_confidence < 0.1) continue;
        // std::cout << "class confidence -> " << res[j].class_confidence << std::endl;
        NvDsInferObjectDetectionInfo object;
        object.classId = 0;
        object.detectionConfidence = res[j].class_confidence;
        /* Clip object box co-ordinates to network resolution */
        object.left = CLIP(res[j].bbox[0] * networkInfo.width, 0, networkInfo.width - 1);
        object.top = CLIP(res[j].bbox[1] * networkInfo.height, 0, networkInfo.height - 1);
        object.width = CLIP((res[j].bbox[2] - res[j].bbox[0]) * networkInfo.width, 0, networkInfo.width - 1);
        object.height = CLIP((res[j].bbox[3] - res[j].bbox[1]) * networkInfo.height, 0, networkInfo.height - 1);
        objectList.push_back(object);
    }

    return true;
}

/* Check that the custom function has been defined correctly */
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomFD);
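
For reference, this parser is hooked into gst-nvinfer through the pgie config file; a minimal sketch of the relevant keys (the library path here is just an assumption, not my actual file name):

parse-bbox-func-name=NvDsInferParseCustomFD
custom-lib-path=./libnvdsinfer_custom_impl_retinaface.so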

The output of the parser code above:

detected before nms -> 28560
after nms -> 2137

This is quite strange. Apparently I am not retrieving the output from the output layer buffer correctly, and that is where I am stuck.
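
One check I plan to add (just a sketch; inferDims/numElements are fields of NvDsInferLayerInfo in DeepStream 5.0) is to print the element count of the “prob” layer inside the parser and compare it against the 15-floats-per-detection layout:

// Inside NvDsInferParseCustomFD, right after decodeIndex is resolved:
const NvDsInferLayerInfo &probLayer = outputLayersInfo[decodeIndex];
const NvDsInferDims &dims = probLayer.inferDims;
std::cout << "prob layer numElements = " << dims.numElements << std::endl;
// With the [count, 15 * maxDet] layout the decode layer writes,
// the buffer can hold at most this many detections:
const unsigned int maxDet = (dims.numElements - 1) / 15;
std::cout << "max detections the buffer can hold = " << maxDet << std::endl;
// out_decode[0] (the reported count) should never exceed maxDet.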

This is how I am doing inference in TensorRT, which shows 207 predictions before NMS (the nms function is defined above):

doInference(*context, data, prob, 1);
std::vector<Detection> res;
nms(res, prob);

void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

std::cout << "detected before nms -> " << prob[0] << std::endl; // 207
std::cout << "after nms -> " << res.size() << std::endl;        // 4

@bcao Hey, I have written the custom bbox parsing function but I am facing some issues extracting the output from the network. Can you please have a look, or point me in a direction to look in? Thanks!

Could you try using the TensorRT engine built by DS in your TRT inference and check the output?
Could you try using FP32 for both inferences?
Also, try feeding an image with the same resolution as the network to rule out the effect of resizing.
And please check whether both are using the same scaling and mean values.

Thanks!

@mchi I don’t understand what you mean by a TensorRT engine built from DS, because I am already providing DS with a serialized engine file that I built in a TRT container. So I guess you want me to try inference with the same resolution in both the TRT and DS container environments?

So the TRT inference and DS inference are using the same TRT engine, right? If they are, that is also fine; it is the same as what I asked.

Regarding “I guess you want me to try inference with the same resolution in both TRT and DS container environments?”: they already have the same network input resolution, since they are using the same TRT engine, right?

Regarding “try feeding an image with the same resolution as the network to rule out the effect of resizing”: I mean, for example, if the network input resolution is 640x480, you could first manually resize the image to 640x480 and then feed it to the TRT inference or DS inference.

Thanks!

@mchi So, below is how an image/frame is fed to RetinaFace; this is how it was trained:

cv::Mat preprocess_img(cv::Mat& img) {
    int w, h, x, y;
    // multiplied by 1.0 to convert to float
    float r_w = INPUT_W / (img.cols*1.0); 
    float r_h = INPUT_H / (img.rows*1.0);
    // Maintain aspect ratio.
    if (r_h > r_w) {
        w = INPUT_W;
        h = r_w * img.rows;
        x = 0;
        y = (INPUT_H - h) / 2;
    } else {
        w = r_h* img.cols;
        h = INPUT_H;
        x = (INPUT_W - w) / 2;
        y = 0;
    }

    // Read OpenCV C++ documentation to further understand the process. 
    cv::Mat re(h, w, CV_8UC3);                                         // Aspect-ratio-preserving resize target
    cv::resize(img, re, re.size(), 0, 0, cv::INTER_CUBIC);             // Resize the image
    cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128)); // Letterbox canvas filled with gray (128)
    re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));                  // Copy the resized image into the canvas
    return out;
}


static float data[3 * INPUT_H * INPUT_W];
cv::Mat img = cv::imread("/home/retinaface_tensorrt/download.jpeg");
cv::Mat pr_img = preprocess_img(img);
// Fill a planar (CHW) float buffer, subtracting the per-channel means: B-104, G-117, R-123.
for (int i = 0; i < INPUT_H * INPUT_W; i++) {
    data[i] = pr_img.at<cv::Vec3b>(i)[0] - 104.0;
    data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] - 117.0;
    data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[2] - 123.0;
}

Then “data” is fed to the network for inference. So, is there a way to include this preprocessing step in the DS pipeline?

@mchi I tried writing the input data to a file as you asked, but that does not work, so this is what I did:

cv::Mat preprocesses_img = cv::Mat::zeros(1, 3 * INPUT_H * INPUT_W, CV_32F);
for (int i = 0; i < INPUT_H * INPUT_W; i++) {
    preprocesses_img.at<float>(i) = pr_img.at<cv::Vec3b>(i)[0] - 104.0;
    preprocesses_img.at<float>(i + INPUT_H * INPUT_W) = pr_img.at<cv::Vec3b>(i)[1] - 117.0;
    preprocesses_img.at<float>(i + 2 * INPUT_H * INPUT_W) = pr_img.at<cv::Vec3b>(i)[2] - 123.0;
}
cv::imwrite("preprocessed_img.jpeg", preprocesses_img);

Yes, obviously this didn’t work, because DS expects a (3-channel) image, but what I did here converts the image into the format required by the network, so the JPEG parser used in DS cannot parse it and throws this error:

root@77b0bcb3f77c:/opt/nvidia/deepstream/deepstream-5.0/sources/apps/deepstream-retinaface# ./deepstream-custom -c retinaface_pgie_config.txt -i ./preprocessed_img.jpeg 
Now playing: retinaface_pgie_config.txt
WARNING: ../nvdsinfer/nvdsinfer_func_utils.cpp:34 [TRT]: Current optimization profile is: 0. Please ensure there are no enqueued operations pending in this context prior to switching profiles
0:00:02.735356338   361 0x556cebde5f60 INFO                 nvinfer gstnvinfer.cpp:602:gst_nvinfer_logger:<primary-nvinference-engine> NvDsInferContext[UID 1]: Info from NvDsInferContextImpl::deserializeEngineAndBackend() <nvdsinfer_context_impl.cpp:1577> [UID = 1]: deserialized trt engine from :/opt/nvidia/deepstream/deepstream-5.0/sources/apps/deepstream-retinaface/tensorrt_engines_awsT4/retina_r50.engine
INFO: ../nvdsinfer/nvdsinfer_model_builder.cpp:685 [Implicit Engine Info]: layers num: 2
0   INPUT  kFLOAT data            3x640x1088      
1   OUTPUT kFLOAT prob            428401x1x1      

0:00:02.735458852   361 0x556cebde5f60 INFO                 nvinfer gstnvinfer.cpp:602:gst_nvinfer_logger:<primary-nvinference-engine> NvDsInferContext[UID 1]: Info from NvDsInferContextImpl::generateBackendContext() <nvdsinfer_context_impl.cpp:1681> [UID = 1]: Use deserialized engine model: /opt/nvidia/deepstream/deepstream-5.0/sources/apps/deepstream-retinaface/tensorrt_engines_awsT4/retina_r50.engine
0:00:02.737574910   361 0x556cebde5f60 INFO                 nvinfer gstnvinfer_impl.cpp:311:notifyLoadModelStatus:<primary-nvinference-engine> [UID 1]: Load new model:retinaface_pgie_config.txt sucessfully
Running...
ERROR from element jpeg-parser: No valid frames found before end of stream
Error details: gstbaseparse.c(3603): gst_base_parse_loop (): /GstPipeline:ds-custom-pipeline/GstJpegParse:jpeg-parser
Returned, stopping playback

Hi,
I didn’t ask you to write the input data to a file.

What I suggested checking is listed in my earlier reply: the same TRT engine, FP32 for both, the same input resolution, and the same scaling and mean values.

Regarding “could you try using the TensorRT engine built by DS in your TRT inference and check the output?”: you have confirmed they are using the same TRT engine, so how about the other points?

Also, it seems you are referring to GitHub - NVIDIA-AI-IOT/deepstream_tao_apps (sample apps demonstrating how to deploy models trained with TAO on DeepStream). Is your Retina network different from RetinaNet? Why not integrate NMS into the TRT engine, since TRT supports an NMS plugin?

@mchi I’ll try out the NMS plugin, but this should also work in DS if it works in TRT inference, right?
I tried feeding an image with the same resolution to rule out the effect of resizing, but it made no difference. Maybe it is because the TRT engine requires a 1D float array of size 3 * INPUT_W * INPUT_H as input, and not a cv::Mat? I am a bit confused about the input format.

Yes, for TRT input a cv::Mat does not work directly; here is a sample with OpenCV input processing.

bool SampleInceptionResnetV2::processInput(
    const samplesCommon::BufferManager& buffers, const std::string& inputTensorName, std::string& image, int batchIndex) const
{
    const int inputC = mInputDims.d[0];
    const int inputH = mInputDims.d[1];
    const int inputW = mInputDims.d[2];

    float* hostInputBuffer = static_cast<float*>(buffers.getHostBuffer(inputTensorName));

    hostInputBuffer += batchIndex * (inputC * inputH * inputW);

    // Load image through OpenCV
    cv::Mat src = cv::imread(image, cv::IMREAD_COLOR);

    // Convert uchar to float
    src.convertTo(src, CV_32FC3);

    // Resize to the network input dimensions (note: cv::Size is width x height)
    cv::Mat resized;
    cv::resize(src, resized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);

    // Split interleaved BGR into separate channel planes
    std::vector<cv::Mat> bgr(inputC);
    cv::split(resized, bgr);

    // Copy result into input buffer 
    for (int c = 0, volChannel = inputH * inputW; c < inputC; c++)
    {
        memcpy(&hostInputBuffer[c * volChannel], bgr[c].data,
                sizeof(float) * volChannel);
    }

    return true;
}

You can also refer to the TensorRT samples in the TensorRT package.

So @mchi, my confusion is: when I feed an image to DS, it will automatically convert it to a float array, right?

But how do I do this scaling in DS:

static float data[3 * INPUT_H * INPUT_W];
cv::Mat img = cv::imread("/home/retinaface_tensorrt/download.jpeg");
cv::Mat pr_img = preprocess_img(img);
for (int i = 0; i < INPUT_H * INPUT_W; i++) {
    data[i] = pr_img.at<cv::Vec3b>(i)[0] - 104.0;
    data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] - 117.0;
    data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[2] - 123.0;
}

In TRT I am able to do it this way.

And yes, the difference from RetinaNet is that the RetinaFace network also predicts landmarks besides bboxes.

Regarding “when I feed an image to DS it will automatically convert it to a float array, right?”: it depends on what pre-processing you add in the pipeline. For example, https://github.com/NVIDIA-AI-IOT/deepstream_tlt_apps/blob/master/deepstream_custom.c reads the JPG image, decodes it to YUV (NV12) data, and feeds the NV12 data into the nvinfer (TRT) plugin; nvinfer then does the resize and the conversion to planar RGB for TRT inference.
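
One more note, worth double-checking against your DS version: your preprocess_img() letterboxes the frame to preserve the aspect ratio, while nvinfer’s default scaling stretches the frame to the network resolution. gst-nvinfer has a config key that approximates the letterbox behaviour (though the padding value and placement may differ from the centered gray-128 padding in your code):

maintain-aspect-ratio=1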

@mchi So there is no scaling done here? If there is, how do I integrate the scaling that I mentioned in the previous post? Also, if DS converts to float itself, then I am parsing the output the same way in DS as in TRT, but it still gives the wrong output bboxes. To be precise, it only generates an image with one bbox covering the whole frame. Is there any point in removing the custom parsing and adding the NMS plugin instead, given that the parsing function works perfectly in the TRT container?

Sorry, I don’t understand your questions.

@mchi So, I still don’t know how to proceed from here, so I’ll again list what I have and what I am trying to do:

Regarding the input: conversion to the TRT-acceptable NV12 format is handled correctly, as you said. The only thing that is missing from DS but present in my TRT inference is:

static float data[3 * INPUT_H * INPUT_W];
cv::Mat img = cv::imread("/home/retinaface_tensorrt/download.jpeg");
cv::Mat pr_img = preprocess_img(img);
for (int i = 0; i < INPUT_H * INPUT_W; i++) {
    data[i] = pr_img.at<cv::Vec3b>(i)[0] - 104.0;
    data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] - 117.0;
    data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[2] - 123.0;
}

With that said, the next thing is the TRT engine, which is exactly the same in both TRT and DS. Finally, parsing the output of the decode layer is handled exactly the same way in TRT as in DS, the only difference being this:

In the DS custom bbox parsing function, this output buffer reports 28560 bboxes:

float* out_decode = (float *) outputLayersInfo[decodeIndex].buffer;
std::cout << "detected before nms -> " << out_decode[0] << std::endl;

This outputs → 28560

While the output of the decode layer in TRT has only 207 before nms:

static float prob[OUTPUT_SIZE];

// doInference() is posted in the very first post of this thread:
// https://forums.developer.nvidia.com/t/nvdsinferlayerinfo-not-giving-expected-no-of-outputs/127131?u=y14uc339
std::vector<Detection> res;
doInference(*context, data, prob, 1);
// nms() is also posted in the very first post
nms(res, prob);
std::cout << "detected before nms -> " << prob[0] << std::endl;

This outputs 207!

Now, I don’t understand what is going wrong!
So, I am going to run TRT inference in FP32, keeping all the other things fixed, save the unscaled preprocessed image from TRT, and run inference on that image in DS… Is that what you wanted me to do?

Is it convenient for you to provide your TRT code and DS code?

@mchi Sure, check your PM!

I checked the preprocessing code in retinaface_tensorrt/retina_r50.cpp. You need to set “net-scale-factor” and “offsets” in retinaface_pgie_config.txt as below; DS will then use these values to pre-process the input data as follows:

(B-104.0) * net-scale-factor
(G-117.0) * net-scale-factor
(R-123.0) * net-scale-factor

net-scale-factor=1.0
offsets=104.0;117.0;123.0
model-color-format=1

Note: model-color-format=1 means BGR; please check your B, G, R sequence and change the offsets accordingly.
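
To make the mapping explicit, below is a small host-side sketch (a hypothetical helper, not DeepStream code) of the formula nvinfer applies with these settings. With net-scale-factor = 1.0 and BGR offsets 104/117/123 it reduces to exactly the data[] filling loop shown earlier on the TensorRT side.

#include <opencv2/opencv.hpp>

// Hypothetical reference implementation of gst-nvinfer's per-pixel preprocessing,
//   out[c] = net-scale-factor * (pixel[c] - offsets[c]),
// written to a planar (CHW) float buffer 'out' of size 3 * H * W.
static void dsStylePreprocess(const cv::Mat& bgr, float* out,
                              float scale, const float offsets[3])
{
    const int H = bgr.rows, W = bgr.cols;
    for (int y = 0; y < H; y++) {
        const cv::Vec3b* row = bgr.ptr<cv::Vec3b>(y);
        for (int x = 0; x < W; x++) {
            const int i = y * W + x;
            out[i]             = scale * (row[x][0] - offsets[0]);  // B plane
            out[i + H * W]     = scale * (row[x][1] - offsets[1]);  // G plane
            out[i + 2 * H * W] = scale * (row[x][2] - offsets[2]);  // R plane
        }
    }
}

// Usage sketch:
//   const float offsets[3] = {104.f, 117.f, 123.f};
//   dsStylePreprocess(pr_img, data, 1.0f, offsets);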