OCRNet parse function for DeepStream

Hi,

I’ve successfully implemented the custom_parse_classifier_func for OCRNet and validated that it correctly sends data to the pipeline.
My pipeline is streammux > nvdspreprocess > pgie (triton-server), with a probe function on the PGIE.
I use Gst-nvdspreprocess to send only the text area to the OCR model. Everything works perfectly up to the PGIE, but I can’t retrieve the results in the probe because frame_meta.obj_meta_list is None.
I seem to be missing something in the implementation.

Code snippet

    gst_buffer = info.get_buffer()
    if not gst_buffer:
        print("Unable to get GstBuffer ")
        return

    batch_meta = pyds.gst_buffer_get_nvds_batch_meta(hash(gst_buffer))
    l_frame = batch_meta.frame_meta_list
    

    while l_frame:
        try:
            frame_meta = pyds.NvDsFrameMeta.cast(l_frame.data)
        except StopIteration:
            break

        frame_number = frame_meta.frame_num
        l_obj = frame_meta.obj_meta_list  ## THIS IS NONE

        try:
            l_frame = l_frame.next
        except StopIteration:
            break
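Since output_control sets output_tensor_meta: true, I also look for the raw output tensors as user meta. This is the standard pyds pattern from the deepstream-ssd-parser sample; I’m not sure whether, with nvdspreprocess ROIs feeding the PGIE, the meta lands on frame_user_meta_list or on the preprocess batch meta, so treat this as a sketch:

    # Inside the per-frame loop: look for raw inference output attached as user meta
    l_user = frame_meta.frame_user_meta_list
    while l_user is not None:
        try:
            user_meta = pyds.NvDsUserMeta.cast(l_user.data)
        except StopIteration:
            break
        if user_meta.base_meta.meta_type == pyds.NvDsMetaType.NVDSINFER_TENSOR_OUTPUT_META:
            tensor_meta = pyds.NvDsInferTensorMeta.cast(user_meta.user_meta_data)
            print("tensor output layers:", tensor_meta.num_output_layers)
        try:
            l_user = l_user.next
        except StopIteration:
            break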

Triton Server config

 name: "nvidia-ocrnet"
 platform: "tensorrt_plan"
 max_batch_size: 32
 input [
   {
     name: "input"
     data_type: TYPE_FP32
     format: FORMAT_NCHW
     dims: [ 1, 32, 100 ]
   }
 ]
 output [
   {
     name: "output_id"
     data_type: TYPE_INT32
     dims: [ 26 ]
   },
   {
     name: "output_prob"
     data_type: TYPE_FP32
     dims: [ 26 ]
   },
   {
     name: "798"
     data_type: TYPE_INT32
     dims: [ 26 ]
   }
 ]
 instance_group [
     {
       count: 1
       kind: KIND_GPU
       gpus: [ 0 ]
     }
 ]
 version_policy: { latest: { num_versions: 1}}
 dynamic_batching {
   max_queue_delay_microseconds: 0
 }
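
To rule out a mismatch between this config and the engine that is actually served, I check the deployed I/O names and shapes over gRPC. A minimal sketch with the standard tritonclient package (assumes the server is reachable at 127.0.0.1:8001, as in the pgie config below):

    # pip install tritonclient[grpc]
    import tritonclient.grpc as grpcclient

    client = grpcclient.InferenceServerClient(url="127.0.0.1:8001")
    meta = client.get_model_metadata("nvidia-ocrnet")
    print([(i.name, i.datatype, list(i.shape)) for i in meta.inputs])
    print([(o.name, o.datatype, list(o.shape)) for o in meta.outputs])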

config_preprocess.txt

[property]
enable=1
target-unique-ids=1
process-on-frame=1

# if enabled maintain the aspect ratio while scaling
maintain-aspect-ratio=1

# if enabled pad symmetrically with maintain-aspect-ratio enabled
symmetric-padding=1

# processing width/height at which image scaled
processing-width=100
processing-height=32

scaling-buf-pool-size=6
tensor-buf-pool-size=6

# 0=NCHW, 1=NHWC, 2=CUSTOM
network-input-order=0

# tensor shape based on network-input-order
network-input-shape=32;1;32;100

# 0=RGB, 1=BGR, 2=GRAY
network-color-format=2

# 0=FP32, 1=UINT8, 2=INT8, 3=UINT32, 4=INT32, 5=FP16
tensor-data-type=0

tensor-name=input

# 0=NVBUF_MEM_DEFAULT 1=NVBUF_MEM_CUDA_PINNED 2=NVBUF_MEM_CUDA_DEVICE 3=NVBUF_MEM_CUDA_UNIFIED
scaling-pool-memory-type=0

# 0=NvBufSurfTransformCompute_Default 1=NvBufSurfTransformCompute_GPU 2=NvBufSurfTransformCompute_VIC
scaling-pool-compute-hw=0

# Scaling Interpolation method
# 0=NvBufSurfTransformInter_Nearest 1=NvBufSurfTransformInter_Bilinear 2=NvBufSurfTransformInter_Algo1
# 3=NvBufSurfTransformInter_Algo2 4=NvBufSurfTransformInter_Algo3 5=NvBufSurfTransformInter_Algo4
# 6=NvBufSurfTransformInter_Default
scaling-filter=0

custom-lib-path=/opt/nvidia/deepstream/deepstream/lib/gst-plugins/libcustom2d_preprocess.so
custom-tensor-preparation-function=CustomTensorPreparation

output-tensor-meta=1

[user-configs]
pixel-normalization-factor=0.00784313
#mean-file=
offsets=127.5

[group-0]
src-ids=0
custom-input-transformation-function=CustomAsyncTransformation
process-on-roi=1
roi-params-src-0=85;121;235;61
draw-roi=1
roi-color=1;1;1;1
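
Two sanity checks on this config: network-input-shape=32;1;32;100 matches the Triton side (max_batch_size: 32, dims [1, 32, 100]), and the [user-configs] values normalize pixels to roughly [-1, 1], assuming nvdspreprocess applies the same y = factor * (x - offset) convention as nvinfer’s net-scale-factor/offsets:

    # Quick check of the normalization: y = factor * (x - offset)
    factor, offset = 0.00784313, 127.5
    print(factor * (0 - offset), factor * (255 - offset))  # ~ -1.0, ~ +1.0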

pgie_conf.txt

infer_config {
  unique_id: 1
  gpu_ids: [0]
  max_batch_size: 32
  backend {

    triton {
      model_name: "nvidia-ocrnet"
      version: -1
      grpc {
        url: "127.0.0.1:8001"
        enable_cuda_buffer_sharing: true
      }
    }
  }

  input_tensor_from_meta { 
      is_first_dim_batch : true 
  }

  #preprocess {
  #  network_format: IMAGE_FORMAT_GRAY 
  #  tensor_order: TENSOR_ORDER_NONE
  #  normalize {
  #    scale_factor: 0.00784313
  #  }
  #}

  postprocess {
     classification {
      threshold:0.2
      custom_parse_classifier_func: "NvDsInferParseOCRNetCTC"
    }
  }
  extra {
    copy_input_to_host_buffers: false
    output_buffer_pool_size: 6
  }
  
  custom_lib {
    path: "/apps/custom_lib/nvocr/nvinfer_ocrnet_parser.so"
  }
}

input_control {
  process_mode : PROCESS_MODE_FULL_FRAME
  interval : 0
}

output_control {
  output_tensor_meta: true
}
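
For reference, once the classification result is attached somewhere reachable, I’d read the decoded text with the standard pyds classifier-meta pattern (sketch only; in my case there is no obj_meta to walk yet):

    # Sketch: read decoded OCR text from a classifier meta list
    def read_labels(classifier_meta_list):
        l_cls = classifier_meta_list
        while l_cls is not None:
            cls_meta = pyds.NvDsClassifierMeta.cast(l_cls.data)
            l_label = cls_meta.label_info_list
            while l_label is not None:
                label_info = pyds.NvDsLabelInfo.cast(l_label.data)
                print("decoded text:", label_info.result_label)
                try:
                    l_label = l_label.next
                except StopIteration:
                    break
            try:
                l_cls = l_cls.next
            except StopIteration:
                break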

custom_parse_classifier_func

#include <string>
#include <vector>
#include <iostream>
#include <locale>
#include <cstring>
#include <type_traits>  // std::extent
#include "nvdsinfer_custom_impl.h"

using std::string;
using std::vector;

static bool ocr_dict_ready = false;
static std::vector<std::string> ocr_dict_table;

/* C-linkage to prevent name-mangling */
extern "C"
bool NvDsInferParseOCRNetCTC(std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
                                NvDsInferNetworkInfo const &networkInfo, float classifierThreshold,
                                std::vector<NvDsInferAttribute> &attrList, std::string &attrString);

extern "C" 
bool NvDsInferParseOCRNetCTC(std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
                                NvDsInferNetworkInfo const &networkInfo, float classifierThreshold,
                                std::vector<NvDsInferAttribute> &attrList, std::string &attrString)
{
    NvDsInferAttribute OCR_attr;

    if (!ocr_dict_ready) {
        static const char* hardcodedOCRDict[] = {
            "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
            "k", "l", "m", "n", "o", "p", "q", "r", "s", "t",
            "u", "v", "w", "x", "y", "z"
        };
        ocr_dict_table.emplace_back("CTCBlank");
        for (size_t i = 0; i < std::extent<decltype(hardcodedOCRDict)>::value; ++i) {
            ocr_dict_table.emplace_back(hardcodedOCRDict[i]);
        } 
        ocr_dict_ready = true;
    }
    

    if (outputLayersInfo.size() != 3)
    {
        std::cerr << "Mismatch in the number of output buffers."
                  << "Expected 3 output buffers, detected in the network: "
                  << outputLayersInfo.size() << std::endl;
        return false;
    }

    auto layerFinder = [&outputLayersInfo](const std::string &name)
        -> const NvDsInferLayerInfo *{
        for (auto &layer : outputLayersInfo) {
            if (layer.layerName && name == layer.layerName) {
                return &layer;
            }
        }
        return nullptr;
    };

    const NvDsInferLayerInfo *output_id = layerFinder("output_id");
    const NvDsInferLayerInfo *output_prob = layerFinder("output_prob");
    const NvDsInferLayerInfo *_798 = layerFinder("798");


    if (!output_id || !output_prob || !_798) {
        std::cerr << "Could not find the expected output layers:" << std::endl;
        if (!output_id) {
            std::cerr << "  - output_id: Missing or unsupported data type." << std::endl;
        }

        if (!output_prob) {
            std::cerr << "  - output_prob: Missing or unsupported data type." << std::endl;
        }

        if (!_798) {
            std::cerr << "  - 798: Missing or unsupported data type." << std::endl;
        }
        return false;
    }

    if (output_id->inferDims.numDims != 1U) {
        std::cerr << "Network output_id has " <<
            output_id->inferDims.numDims << " dims, expected 1" << std::endl;
        return false;
    }
    if (output_prob->inferDims.numDims != 1U) {
        std::cerr << "Network output_prob has " <<
            output_prob->inferDims.numDims << " dims, expected 1" << std::endl;
        return false;
    }
    if (_798->inferDims.numDims != 1U) {
        std::cerr << "Network 798 has " <<
            _798->inferDims.numDims << " dims, expected 1" << std::endl;
        return false;
    }

    int batch_size = 1;
    int output_len = output_prob->inferDims.d[0];

    //std::cout << "Batch size: " << batch_size << std::endl;
    //std::cout << "Output length: " << output_len << std::endl;
    //std::cout << "networkInfo.width: " << networkInfo.width << std::endl;
    
    std::vector<std::pair<std::string, float>> temp_de_texts;

    int *output_id_data = reinterpret_cast<int*>(output_id->buffer);
    float *output_prob_data = reinterpret_cast<float*>(output_prob->buffer);

    for (int batch_idx = 0; batch_idx < batch_size; ++batch_idx)
    {
        int b_offset = batch_idx * output_len;
        int prev = output_id_data[b_offset];
        std::vector<int> temp_seq_id = {prev};
        std::vector<float> temp_seq_prob = {output_prob_data[b_offset]};
        for (int i = 1; i < output_len; ++i)
        {
            if (output_id_data[b_offset + i] != prev)
            {
                temp_seq_id.push_back(output_id_data[b_offset + i]);
                temp_seq_prob.push_back(output_prob_data[b_offset + i]);
                prev = output_id_data[b_offset + i];
            }
        }
        std::string de_text = "";
        float prob = 1.0;
        for (size_t i = 0; i < temp_seq_id.size(); ++i)
        {
            if (temp_seq_id[i] != 0)
            {
                if (temp_seq_id[i] <= static_cast<int>(ocr_dict_table.size()) - 1)
                {
                    de_text += ocr_dict_table[temp_seq_id[i]];
                    prob *= temp_seq_prob[i];
                }
                else
                {
                    std::cerr << "[ERROR] Character dict is not compatible with OCRNet TRT engine." << std::endl;
                }
            }
        }
        temp_de_texts.emplace_back(std::make_pair(de_text, prob));
    }

    attrString = "";
    for (const auto& temp_text : temp_de_texts) {
        if (temp_text.second >= classifierThreshold) {
            attrString += temp_text.first;
        }
        //std::cout << "Decoded text: " << temp_text.first << ", Probability: " << temp_text.second <<  ", Threshold: " << classifierThreshold << std::endl;
    }

    OCR_attr.attributeIndex = 0;
    OCR_attr.attributeValue = 1;
    OCR_attr.attributeLabel = strdup(attrString.c_str()); 
    OCR_attr.attributeConfidence = 1.0;
    
    for (const auto& temp_text : temp_de_texts) {
        OCR_attr.attributeConfidence *= temp_text.second;
    }

    // std::cout << "attributeIndex: " << OCR_attr.attributeIndex << std::endl;
    // std::cout << "attributeValue: " << OCR_attr.attributeValue << std::endl;
    // std::cout << "attributeLabel: " << OCR_attr.attributeLabel << std::endl;
    // std::cout << "attributeConfidence: " << OCR_attr.attributeConfidence << std::endl;

    attrList.push_back(OCR_attr);

    return true;
}

CHECK_CUSTOM_CLASSIFIER_PARSE_FUNC_PROTOTYPE(NvDsInferParseOCRNetCTC);
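
Finally, to sanity-check the parser logic in isolation, the same greedy CTC decode is easy to reproduce in Python (illustrative only; index 0 is the CTCBlank entry, matching the dict built in the parser):

    def ctc_greedy_decode(ids, probs, table):
        text, conf, prev = "", 1.0, None
        for i, p in zip(ids, probs):
            if i != prev and i != 0:  # collapse repeats, drop CTC blanks
                text += table[i]
                conf *= p
            prev = i
        return text, conf

    table = ["CTCBlank"] + list("0123456789abcdefghijklmnopqrstuvwxyz")
    print(ctc_greedy_decode([2, 2, 0, 2, 12], [0.9, 0.9, 0.99, 0.8, 0.7], table))
    # -> ('11b', ~0.504)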