Face alignment: Extract landmarks from the YOLO model and dump them into user_meta

I am working on a face recognition project built around this pipeline:
face detection → face alignment → face recognition

Currently, I am stuck on face alignment, which requires the landmarks produced by the face detection model (YOLO).

How can I transfer the landmarks into user_meta on the C++/CUDA side and then retrieve them through the Python bindings?

Please suggest a solution for adding this feature.

Thank you,

PS: This is the code I use to extract landmarks from the YOLO-face model.


static std::vector<NvDsInferParseObjectInfo> decodeTensorYoloUM(const float* detection, const uint& outputSize, const uint& netW, const uint& netH,
    const std::vector<float>& preclusterThreshold, NvDsUserMetaList *obj_user_meta_list)
{
  std::vector<NvDsInferParseObjectInfo> binfo;

  // One set of 5 (x, y) facial landmarks per accepted detection.
  std::vector<std::array<float, 10>> blmk;

  for (uint b = 0; b < outputSize; ++b) {
    float maxProb = detection[b * ARRAY_SIZE + 4];        // objectness score
    int maxIndex = (int) detection[b * ARRAY_SIZE + 15];  // class id

    if (maxProb < preclusterThreshold[maxIndex])
      continue;

    float bxc = detection[b * ARRAY_SIZE + 0];
    float byc = detection[b * ARRAY_SIZE + 1];
    float bw = detection[b * ARRAY_SIZE + 2];
    float bh = detection[b * ARRAY_SIZE + 3];

    float bx1 = bxc - bw / 2;
    float by1 = byc - bh / 2;
    float bx2 = bx1 + bw;
    float by2 = by1 + bh;

    addBBoxProposal(bx1, by1, bx2, by2, netW, netH, maxIndex, maxProb, binfo);

    // The landmarks occupy offsets 5..14: 5 points x (x, y).
    std::array<float, 10> face_landmarks;
    for (uint i = 0; i < 10; ++i)
      face_landmarks[i] = detection[b * ARRAY_SIZE + 5 + i];
    blmk.push_back(face_landmarks);

  }

  // binfo and blmk are filled in lockstep, so both vectors have one entry
  // per accepted detection.
  for (uint m = 0; m < blmk.size(); ++m) {
    std::array<float, 10> lmks = blmk[m];

    // set_metadata_ptr() is a helper (defined elsewhere) that copies the
    // landmarks into heap storage suitable for user_meta_data.
    NvDsUserMeta* user_meta = g_new0(NvDsUserMeta, 1);
    user_meta->user_meta_data = set_metadata_ptr(lmks);
    obj_user_meta_list = g_list_append(obj_user_meta_list, user_meta);
  }
  return binfo;
}


static bool
NvDsInferParseCustomYoloUM(std::vector<NvDsInferLayerInfo> const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo,
    NvDsInferParseDetectionParams const& detectionParams, std::vector<NvDsInferParseObjectInfo>& objectList)
{
  if (outputLayersInfo.empty()) {
    std::cerr << "ERROR: Could not find output layer in bbox parsing" << std::endl;
    return false;
  }

  std::vector<NvDsInferParseObjectInfo> objects;

  const NvDsInferLayerInfo& layer = outputLayersInfo[0];

  const uint outputSize = layer.inferDims.d[0];

  // NOTE: this list is local to the parser and is never attached to any
  // NvDsObjectMeta downstream; this is the step I am missing.
  NvDsUserMetaList *obj_user_meta_list = g_new0(NvDsUserMetaList, 1);

  std::vector<NvDsInferParseObjectInfo> outObjs =
      decodeTensorYoloUM((const float*) layer.buffer, outputSize, networkInfo.width,
          networkInfo.height, detectionParams.perClassPreclusterThreshold,
          obj_user_meta_list);

  objects.insert(objects.end(), outObjs.begin(), outObjs.end());

  objectList = objects;

  // Placeholder for what I think should happen here: acquire a NvDsUserMeta
  // from the pool and attach it to each object, e.g.
  //
  //   NvDsUserMeta* userMeta = nvds_acquire_user_meta_from_pool(batch_meta);
  //   userMeta->user_meta_data = (void*) landmarksData;
  //   userMeta->base_meta.meta_type = NVDSINFER_TLT_PGIE_USER_META;
  //   nvds_add_user_meta_to_obj(objMeta, userMeta);
  //
  // But nvds_acquire_user_meta_from_pool() takes a NvDsBatchMeta, and neither
  // the batch meta nor the object meta is accessible from inside this parser.

  return true;
}

extern "C" bool
NvDsInferParseYoloUM(std::vector<NvDsInferLayerInfo> const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo,
    NvDsInferParseDetectionParams const& detectionParams, std::vector<NvDsInferParseObjectInfo>& objectList)
{
  return NvDsInferParseCustomYoloUM(outputLayersInfo, networkInfo, detectionParams, objectList);
}

CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseYoloUM);

Please provide complete information as applicable to your setup.

• Hardware Platform (Jetson / GPU) Jetson
• DeepStream Version 6.2
• JetPack Version (valid for Jetson only) 5.1.2
• TensorRT Version 8.2

There are two solutions.

  1. Here is a complete deepstream-yolo-face solution, which uses NvDsInferInstanceMaskInfo to store the landmark data. The disadvantage is that an instance-segmentation network can then no longer be used in the sgie. A sketch of this approach follows below.
  2. Add a probe function to the pgie src pad, process the NVDSINFER_TENSOR_OUTPUT_META there, and attach your own custom metadata; see the second sketch below.
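For reference, a minimal sketch of solution 1, modeled on the deepstream-yolo-face parser. It assumes the same output layout as decodeTensorYoloUM() above (box at offsets 0-3, objectness at 4, landmarks at 5-14, class id at 15, ARRAY_SIZE as the per-detection stride); the function name NvDsInferParseYoloFace is illustrative.

#include <algorithm>
#include <vector>

#include "nvdsinfer_custom_impl.h"

extern "C" bool NvDsInferParseYoloFace(
    std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
    NvDsInferNetworkInfo const& networkInfo,
    NvDsInferParseDetectionParams const& detectionParams,
    std::vector<NvDsInferInstanceMaskInfo>& objectList)
{
  const NvDsInferLayerInfo& layer = outputLayersInfo[0];
  const float* detection = (const float*) layer.buffer;
  const uint outputSize = layer.inferDims.d[0];

  for (uint b = 0; b < outputSize; ++b) {
    const float* det = detection + b * ARRAY_SIZE;
    const float prob = det[4];
    const int classId = (int) det[15];
    if (prob < detectionParams.perClassPreclusterThreshold[classId])
      continue;

    NvDsInferInstanceMaskInfo obj{};
    obj.classId = classId;
    obj.detectionConfidence = prob;
    obj.left = det[0] - det[2] / 2;  // center-x/width -> left
    obj.top = det[1] - det[3] / 2;   // center-y/height -> top
    obj.width = det[2];
    obj.height = det[3];

    // Clamp to the network input size, as addBBoxProposal() does above.
    obj.left = std::max(obj.left, 0.0f);
    obj.top = std::max(obj.top, 0.0f);
    obj.width = std::min(obj.width, networkInfo.width - obj.left);
    obj.height = std::min(obj.height, networkInfo.height - obj.top);

    // Reuse the instance-mask fields to carry the 10 landmark floats
    // (5 points x (x, y)). new[] matches the deepstream-yolo-face sample;
    // the nvinfer plugin copies the buffer into NvDsObjectMeta.mask_params.
    obj.mask = new float[10];
    obj.mask_width = 10;
    obj.mask_height = 1;
    obj.mask_size = sizeof(float) * 10;
    for (uint i = 0; i < 10; ++i)
      obj.mask[i] = det[5 + i];

    objectList.push_back(obj);
  }
  return true;
}

CHECK_CUSTOM_INSTANCE_MASK_PARSE_FUNC_PROTOTYPE(NvDsInferParseYoloFace);

In the nvinfer config, a parser like this is selected with parse-bbox-instance-mask-func-name and output-instance-mask=1 (the deepstream-yolo-face sample also sets cluster-mode=4). On the Python side the landmarks then arrive in obj_meta.mask_params, which recent pyds builds expose as a NumPy array via NvOSD_MaskParams.get_mask_array().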
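And a minimal sketch of solution 2, under these assumptions: output-tensor-meta=1 is set on the pgie so that the raw output is attached as NVDSINFER_TENSOR_OUTPUT_META, and lookup_landmarks_for_object() is a hypothetical helper (not shown) that parses that tensor meta and returns a heap-allocated float[10] for a given detection. The acquire/copy/release pattern is the part that answers the user_meta question.

#include <cstring>

#include "gstnvdsmeta.h"
#include "gstnvdsinfer.h"  // NVDSINFER_TENSOR_OUTPUT_META, used by the real helper

// Hypothetical helper: parse the NVDSINFER_TENSOR_OUTPUT_META attached to the
// frame and return the 10 landmark floats matching this detection (e.g. by
// bbox overlap). Implementation omitted in this sketch.
extern float* lookup_landmarks_for_object(NvDsFrameMeta* frame_meta,
                                          NvDsObjectMeta* obj_meta);

// Deep-copy / free callbacks so the metadata survives buffer copies.
static gpointer copy_landmarks(gpointer data, gpointer user_data) {
  NvDsUserMeta* um = (NvDsUserMeta*) data;
  gpointer dst = g_malloc(sizeof(float) * 10);
  memcpy(dst, um->user_meta_data, sizeof(float) * 10);
  return dst;
}

static void release_landmarks(gpointer data, gpointer user_data) {
  NvDsUserMeta* um = (NvDsUserMeta*) data;
  g_free(um->user_meta_data);
  um->user_meta_data = nullptr;
}

static GstPadProbeReturn
pgie_src_pad_probe(GstPad* pad, GstPadProbeInfo* info, gpointer user_data)
{
  GstBuffer* buf = (GstBuffer*) info->data;
  NvDsBatchMeta* batch_meta = gst_buffer_get_nvds_batch_meta(buf);
  if (!batch_meta)
    return GST_PAD_PROBE_OK;

  for (NvDsMetaList* lf = batch_meta->frame_meta_list; lf; lf = lf->next) {
    NvDsFrameMeta* frame_meta = (NvDsFrameMeta*) lf->data;
    for (NvDsMetaList* lo = frame_meta->obj_meta_list; lo; lo = lo->next) {
      NvDsObjectMeta* obj_meta = (NvDsObjectMeta*) lo->data;

      float* lm = lookup_landmarks_for_object(frame_meta, obj_meta);
      if (!lm)
        continue;

      // This is the part the custom parser cannot do: the pool lives in
      // batch_meta. The meta-type descriptor string is illustrative.
      NvDsUserMeta* um = nvds_acquire_user_meta_from_pool(batch_meta);
      um->user_meta_data = lm;  // heap float[10], freed by release_landmarks
      um->base_meta.meta_type =
          nvds_get_user_meta_type((gchar*) "NVIDIA.FACE.LANDMARKS");
      um->base_meta.copy_func = copy_landmarks;
      um->base_meta.release_func = release_landmarks;
      nvds_add_user_meta_to_obj(obj_meta, um);
    }
  }
  return GST_PAD_PROBE_OK;
}

Reading this custom user_meta_data back in Python needs a small custom binding, since stock pyds cannot cast an arbitrary void pointer; the deepstream_python_apps repository documents how to add such bindings. That is why solution 1 is usually the simpler route when the consumer is Python.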
