OCRNet parse function for DeepStream

Hi,

I’ve successfully implemented the custom_parse_classifier_func for OCRNet and validated that it correctly sends data to the pipeline.
My pipeline is streammux > nvdspreprocess > pgie (triton-server), with a probe function on the PGIE.
I use Gst-nvdspreprocess to send only the text area to the OCR model. Everything works perfectly up to the PGIE, but I can’t retrieve the results in the probe because frame_meta.obj_meta_list is None.
I seem to be missing something in the implementation.

Code snippet

    gst_buffer = info.get_buffer()
    if not gst_buffer:
        print("Unable to get GstBuffer ")
        return

    batch_meta = pyds.gst_buffer_get_nvds_batch_meta(hash(gst_buffer))
    l_frame = batch_meta.frame_meta_list
    

    while l_frame:
        try:
            frame_meta = pyds.NvDsFrameMeta.cast(l_frame.data)
        except StopIteration:
            break

        frame_number = frame_meta.frame_num
        l_obj = frame_meta.obj_meta_list  ## THIS IS NONE

        try:
            l_frame = l_frame.next
        except StopIteration:
            break
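Since output_control sets output_tensor_meta: true, I also look for the raw output tensors as user meta. This is the standard pyds pattern from the deepstream-ssd-parser sample; I’m not sure whether, with nvdspreprocess ROIs feeding the PGIE, the meta lands on frame_user_meta_list or on the preprocess batch meta, so treat this as a sketch:

    # Inside the per-frame loop: look for raw inference output attached as user meta
    l_user = frame_meta.frame_user_meta_list
    while l_user is not None:
        try:
            user_meta = pyds.NvDsUserMeta.cast(l_user.data)
        except StopIteration:
            break
        if user_meta.base_meta.meta_type == pyds.NvDsMetaType.NVDSINFER_TENSOR_OUTPUT_META:
            tensor_meta = pyds.NvDsInferTensorMeta.cast(user_meta.user_meta_data)
            print("tensor output layers:", tensor_meta.num_output_layers)
        try:
            l_user = l_user.next
        except StopIteration:
            break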

Triton Server config

 name: "nvidia-ocrnet"
 platform: "tensorrt_plan"
 max_batch_size: 32
 input [
   {
     name: "input"
     data_type: TYPE_FP32
     format: FORMAT_NCHW
     dims: [ 1, 32, 100 ]
   }
 ]
 output [
   {
     name: "output_id"
     data_type: TYPE_INT32
     dims: [ 26 ]
   },
   {
     name: "output_prob"
     data_type: TYPE_FP32
     dims: [ 26 ]
   },
   {
     name: "798"
     data_type: TYPE_INT32
     dims: [ 26 ]
   }
 ]
 instance_group [
     {
       count: 1
       kind: KIND_GPU
       gpus: [ 0 ]
     }
 ]
 version_policy: { latest: { num_versions: 1}}
 dynamic_batching {
   max_queue_delay_microseconds: 0
 }
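
To rule out a mismatch between this config and the engine that is actually served, I check the deployed I/O names and shapes over gRPC. A minimal sketch with the standard tritonclient package (assumes the server is reachable at 127.0.0.1:8001, as in the pgie config below):

    # pip install tritonclient[grpc]
    import tritonclient.grpc as grpcclient

    client = grpcclient.InferenceServerClient(url="127.0.0.1:8001")
    meta = client.get_model_metadata("nvidia-ocrnet")
    print([(i.name, i.datatype, list(i.shape)) for i in meta.inputs])
    print([(o.name, o.datatype, list(o.shape)) for o in meta.outputs])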

config_preprocess.txt

[property]
enable=1
target-unique-ids=1
process-on-frame=1

# if enabled maintain the aspect ratio while scaling
maintain-aspect-ratio=1

# if enabled pad symmetrically with maintain-aspect-ratio enabled
symmetric-padding=1

# processing width/height at which image scaled
processing-width=100
processing-height=32

scaling-buf-pool-size=6
tensor-buf-pool-size=6

# 0=NCHW, 1=NHWC, 2=CUSTOM
network-input-order=0

# tensor shape based on network-input-order
network-input-shape=32;1;32;100

# 0=RGB, 1=BGR, 2=GRAY
network-color-format=2

# 0=FP32, 1=UINT8, 2=INT8, 3=UINT32, 4=INT32, 5=FP16
tensor-data-type=0

tensor-name=input

# 0=NVBUF_MEM_DEFAULT 1=NVBUF_MEM_CUDA_PINNED 2=NVBUF_MEM_CUDA_DEVICE 3=NVBUF_MEM_CUDA_UNIFIED
scaling-pool-memory-type=0

# 0=NvBufSurfTransformCompute_Default 1=NvBufSurfTransformCompute_GPU 2=NvBufSurfTransformCompute_VIC
scaling-pool-compute-hw=0

# Scaling Interpolation method
# 0=NvBufSurfTransformInter_Nearest 1=NvBufSurfTransformInter_Bilinear 2=NvBufSurfTransformInter_Algo1
# 3=NvBufSurfTransformInter_Algo2 4=NvBufSurfTransformInter_Algo3 5=NvBufSurfTransformInter_Algo4
# 6=NvBufSurfTransformInter_Default
scaling-filter=0

custom-lib-path=/opt/nvidia/deepstream/deepstream/lib/gst-plugins/libcustom2d_preprocess.so
custom-tensor-preparation-function=CustomTensorPreparation

output-tensor-meta=1

[user-configs]
pixel-normalization-factor=0.00784313
#mean-file=
offsets=127.5

[group-0]
src-ids=0
custom-input-transformation-function=CustomAsyncTransformation
process-on-roi=1
roi-params-src-0=85;121;235;61
draw-roi=1
roi-color=1;1;1;1
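
Two sanity checks on this config: network-input-shape=32;1;32;100 matches the Triton side (max_batch_size: 32, dims [1, 32, 100]), and the [user-configs] values normalize pixels to roughly [-1, 1], assuming nvdspreprocess applies the same y = factor * (x - offset) convention as nvinfer’s net-scale-factor/offsets:

    # Quick check of the normalization: y = factor * (x - offset)
    factor, offset = 0.00784313, 127.5
    print(factor * (0 - offset), factor * (255 - offset))  # ~ -1.0, ~ +1.0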

pgie_conf.txt

infer_config {
  unique_id: 1
  gpu_ids: [0]
  max_batch_size: 32
  backend {

    triton {
      model_name: "nvidia-ocrnet"
      version: -1
      grpc {
        url: "127.0.0.1:8001"
        enable_cuda_buffer_sharing: true
      }
    }
  }

  input_tensor_from_meta { 
      is_first_dim_batch : true 
  }

  #preprocess {
  #  network_format: IMAGE_FORMAT_GRAY 
  #  tensor_order: TENSOR_ORDER_NONE
  #  normalize {
  #    scale_factor: 0.00784313
  #  }
  #}

  postprocess {
     classification {
      threshold:0.2
      custom_parse_classifier_func: "NvDsInferParseOCRNetCTC"
    }
  }
  extra {
    copy_input_to_host_buffers: false
    output_buffer_pool_size: 6
  }
  
  custom_lib {
    path: "/apps/custom_lib/nvocr/nvinfer_ocrnet_parser.so"
  }
}

input_control {
  process_mode : PROCESS_MODE_FULL_FRAME
  interval : 0
}

output_control {
  output_tensor_meta: true
}
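
For reference, once the classification result is attached somewhere reachable, I’d read the decoded text with the standard pyds classifier-meta pattern (sketch only; in my case there is no obj_meta to walk yet):

    # Sketch: read decoded OCR text from a classifier meta list
    def read_labels(classifier_meta_list):
        l_cls = classifier_meta_list
        while l_cls is not None:
            cls_meta = pyds.NvDsClassifierMeta.cast(l_cls.data)
            l_label = cls_meta.label_info_list
            while l_label is not None:
                label_info = pyds.NvDsLabelInfo.cast(l_label.data)
                print("decoded text:", label_info.result_label)
                try:
                    l_label = l_label.next
                except StopIteration:
                    break
            try:
                l_cls = l_cls.next
            except StopIteration:
                break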

custom_parse_classifier_func

#include <string>
#include <vector>
#include <iostream>
#include <locale>
#include <cstring>
#include <type_traits>  // std::extent
#include "nvdsinfer_custom_impl.h"

using std::string;
using std::vector;

static bool ocr_dict_ready = false;
static std::vector<std::string> ocr_dict_table;

/* C-linkage to prevent name-mangling */
extern "C"
bool NvDsInferParseOCRNetCTC(std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
                                NvDsInferNetworkInfo const &networkInfo, float classifierThreshold,
                                std::vector<NvDsInferAttribute> &attrList, std::string &attrString);

extern "C" 
bool NvDsInferParseOCRNetCTC(std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
                                NvDsInferNetworkInfo const &networkInfo, float classifierThreshold,
                                std::vector<NvDsInferAttribute> &attrList, std::string &attrString)
{
    NvDsInferAttribute OCR_attr;

    if (!ocr_dict_ready) {
        static const char* hardcodedOCRDict[] = {
            "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
            "k", "l", "m", "n", "o", "p", "q", "r", "s", "t",
            "u", "v", "w", "x", "y", "z"
        };
        ocr_dict_table.emplace_back("CTCBlank");
        for (size_t i = 0; i < std::extent<decltype(hardcodedOCRDict)>::value; ++i) {
            ocr_dict_table.emplace_back(hardcodedOCRDict[i]);
        } 
        ocr_dict_ready = true;
    }
    

    if (outputLayersInfo.size() != 3)
    {
        std::cerr << "Mismatch in the number of output buffers."
                  << "Expected 3 output buffers, detected in the network: "
                  << outputLayersInfo.size() << std::endl;
        return false;
    }

    auto layerFinder = [&outputLayersInfo](const std::string &name)
        -> const NvDsInferLayerInfo *{
        for (auto &layer : outputLayersInfo) {
            if (layer.layerName && name == layer.layerName) {
                return &layer;
            }
        }
        return nullptr;
    };

    const NvDsInferLayerInfo *output_id = layerFinder("output_id");
    const NvDsInferLayerInfo *output_prob = layerFinder("output_prob");
    const NvDsInferLayerInfo *_798 = layerFinder("798");


    if (!output_id || !output_prob || !_798) {
        std::cerr << "Could not find the expected output layers:" << std::endl;
        if (!output_id) {
            std::cerr << "  - output_id: Missing or unsupported data type." << std::endl;
        }

        if (!output_prob) {
            std::cerr << "  - output_prob: Missing or unsupported data type." << std::endl;
        }

        if (!_798) {
            std::cerr << "  - 798: Missing or unsupported data type." << std::endl;
        }
        return false;
    }

    if (output_id->inferDims.numDims != 1U) {
        std::cerr << "Network output_id has " <<
            output_id->inferDims.numDims << " dims, expected 1" << std::endl;
        return false;
    }
    if (output_prob->inferDims.numDims != 1U) {
        std::cerr << "Network output_prob has " <<
            output_prob->inferDims.numDims << " dims, expected 1" << std::endl;
        return false;
    }
    if (_798->inferDims.numDims != 1U) {
        std::cerr << "Network 798 has " <<
            _798->inferDims.numDims << " dims, expected 1" << std::endl;
        return false;
    }

    int batch_size = 1;
    int output_len = output_prob->inferDims.d[0];

    //std::cout << "Batch size: " << batch_size << std::endl;
    //std::cout << "Output length: " << output_len << std::endl;
    //std::cout << "networkInfo.width: " << networkInfo.width << std::endl;
    
    std::vector<std::pair<std::string, float>> temp_de_texts;

    int *output_id_data = reinterpret_cast<int*>(output_id->buffer);
    float *output_prob_data = reinterpret_cast<float*>(output_prob->buffer);

    for (int batch_idx = 0; batch_idx < batch_size; ++batch_idx)
    {
        int b_offset = batch_idx * output_len;
        int prev = output_id_data[b_offset];
        std::vector<int> temp_seq_id = {prev};
        std::vector<float> temp_seq_prob = {output_prob_data[b_offset]};
        for (int i = 1; i < output_len; ++i)
        {
            if (output_id_data[b_offset + i] != prev)
            {
                temp_seq_id.push_back(output_id_data[b_offset + i]);
                temp_seq_prob.push_back(output_prob_data[b_offset + i]);
                prev = output_id_data[b_offset + i];
            }
        }
        std::string de_text = "";
        float prob = 1.0;
        for (size_t i = 0; i < temp_seq_id.size(); ++i)
        {
            if (temp_seq_id[i] != 0)
            {
                if (temp_seq_id[i] <= static_cast<int>(ocr_dict_table.size()) - 1)
                {
                    de_text += ocr_dict_table[temp_seq_id[i]];
                    prob *= temp_seq_prob[i];
                }
                else
                {
                    std::cerr << "[ERROR] Character dict is not compatible with OCRNet TRT engine." << std::endl;
                }
            }
        }
        temp_de_texts.emplace_back(std::make_pair(de_text, prob));
    }

    attrString = "";
    for (const auto& temp_text : temp_de_texts) {
        if (temp_text.second >= classifierThreshold) {
            attrString += temp_text.first;
        }
        //std::cout << "Decoded text: " << temp_text.first << ", Probability: " << temp_text.second <<  ", Threshold: " << classifierThreshold << std::endl;
    }

    OCR_attr.attributeIndex = 0;
    OCR_attr.attributeValue = 1;
    OCR_attr.attributeLabel = strdup(attrString.c_str()); 
    OCR_attr.attributeConfidence = 1.0;
    
    for (const auto& temp_text : temp_de_texts) {
        OCR_attr.attributeConfidence *= temp_text.second;
    }

    // std::cout << "attributeIndex: " << OCR_attr.attributeIndex << std::endl;
    // std::cout << "attributeValue: " << OCR_attr.attributeValue << std::endl;
    // std::cout << "attributeLabel: " << OCR_attr.attributeLabel << std::endl;
    // std::cout << "attributeConfidence: " << OCR_attr.attributeConfidence << std::endl;

    attrList.push_back(OCR_attr);

    return true;
}

CHECK_CUSTOM_CLASSIFIER_PARSE_FUNC_PROTOTYPE(NvDsInferParseOCRNetCTC);
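
Finally, to sanity-check the parser logic in isolation, the same greedy CTC decode is easy to reproduce in Python (illustrative only; index 0 is the CTCBlank entry, matching the dict built in the parser):

    def ctc_greedy_decode(ids, probs, table):
        text, conf, prev = "", 1.0, None
        for i, p in zip(ids, probs):
            if i != prev and i != 0:  # collapse repeats, drop CTC blanks
                text += table[i]
                conf *= p
            prev = i
        return text, conf

    table = ["CTCBlank"] + list("0123456789abcdefghijklmnopqrstuvwxyz")
    print(ctc_greedy_decode([2, 2, 0, 2, 12], [0.9, 0.9, 0.99, 0.8, 0.7], table))
    # -> ('11b', ~0.504)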