The following code is the original NvDsInferParseCustomMrcnnUff
function from the DeepStream 4.x sources:
extern "C"
bool NvDsInferParseCustomMrcnnUff (std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
        NvDsInferNetworkInfo const &networkInfo,
        NvDsInferParseDetectionParams const &detectionParams,
        std::vector<NvDsInferObjectDetectionInfo> &objectList) {
    /* Parse UFF Mask R-CNN output: decode the "mrcnn_detection" and
     * "mrcnn_mask/Sigmoid" tensors via decodeOutput() and append clipped
     * box detections to objectList. Note: masks are decoded but NOT
     * propagated (NvDsInferObjectDetectionInfo carries no mask), so OSD
     * cannot display them.
     *
     * Fix: the original cached the layer indices in function-local statics.
     * That is not thread-safe and returns stale indices if this parser is
     * shared by engines whose output-layer order differs, so the layers are
     * looked up on every call (the layer list is tiny, cost is negligible). */
    int detIndex = -1;
    int maskIndex = -1;
    for (unsigned int i = 0; i < outputLayersInfo.size(); i++) {
        if (strcmp(outputLayersInfo[i].layerName, "mrcnn_detection") == 0) {
            detIndex = i;
        } else if (strcmp(outputLayersInfo[i].layerName, "mrcnn_mask/Sigmoid") == 0) {
            maskIndex = i;
        }
    }
    if (detIndex == -1) {
        std::cerr << "Could not find detection layer buffer while parsing" << std::endl;
        return false;
    }
    if (maskIndex == -1) {
        std::cerr << "Could not find mask layer buffer while parsing" << std::endl;
        return false;
    }
    float *out_det = (float *) outputLayersInfo[detIndex].buffer;
    float *out_mask = (float *) outputLayersInfo[maskIndex].buffer;
    /* decodeOutput() is a project helper that unpacks the raw tensors into
     * per-ROI label/probability/box records. */
    std::vector<MRCNNBBoxInfo> binfo = decodeOutput(out_det, out_mask);
    for (unsigned int roi_id = 0; roi_id < binfo.size(); roi_id++) {
        NvDsInferObjectDetectionInfo object;
        object.classId = binfo[roi_id].label;
        object.detectionConfidence = binfo[roi_id].prob;
        /* Scale box corners to network resolution (coords are presumably
         * normalized to [0,1] — implied by the *width/*height scaling) and
         * clip so OSD coordinates stay inside the frame. */
        object.left = CLIP(binfo[roi_id].box.x1 * networkInfo.width, 0, networkInfo.width - 1);
        object.top = CLIP(binfo[roi_id].box.y1 * networkInfo.height, 0, networkInfo.height - 1);
        object.width = CLIP((binfo[roi_id].box.x2 - binfo[roi_id].box.x1) * networkInfo.width, 0, networkInfo.width - 1);
        object.height = CLIP((binfo[roi_id].box.y2 - binfo[roi_id].box.y1) * networkInfo.height, 0, networkInfo.height - 1);
        objectList.push_back(object);
    }
    return true;
}
This code successfully parses the output from the MRCNN .uff file but, as we can see, it does not store the mask data, and hence nvosd cannot display it.
On the other hand, the following code belongs to the function NvDsInferParseCustomMrcnnTLTV2
from the file /opt/nvidia/deepstream/deepstream-6.0/sources/libs/nvdsinfer_customparser/nvdsinfer_custombboxparser.cpp:
/* NVIDIA reference parser for TLT/TAO MaskRCNN models (e.g. PeopleSegNetV2).
 * Locates the "generate_detections" and "mask_fcn_logits/BiasAdd" output
 * tensors and converts every detection above threshold into an
 * NvDsInferInstanceMaskInfo (box + a heap copy of its per-class mask),
 * which is what allows nvdsosd to render instance masks. */
extern "C"
bool NvDsInferParseCustomMrcnnTLTV2 (std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
NvDsInferNetworkInfo const &networkInfo,
NvDsInferParseDetectionParams const &detectionParams,
std::vector<NvDsInferInstanceMaskInfo> &objectList) {
/* Return the first FLOAT output layer whose name matches, else nullptr. */
auto layerFinder = [&outputLayersInfo](const std::string &name)
-> const NvDsInferLayerInfo *{
for (auto &layer : outputLayersInfo) {
if (layer.dataType == FLOAT &&
(layer.layerName && name == layer.layerName)) {
return &layer;
}
}
return nullptr;
};
const NvDsInferLayerInfo *detectionLayer = layerFinder("generate_detections");
const NvDsInferLayerInfo *maskLayer = layerFinder("mask_fcn_logits/BiasAdd");
if (!detectionLayer || !maskLayer) {
std::cerr << "ERROR: some layers missing or unsupported data types "
<< "in output tensors" << std::endl;
return false;
}
/* The mask tensor is expected as 4-D: [max_instances, num_classes, H, W]. */
if(maskLayer->inferDims.numDims != 4U) {
std::cerr << "Network output number of dims is : " <<
maskLayer->inferDims.numDims << " expect is 4"<< std::endl;
return false;
}
const unsigned int det_max_instances = maskLayer->inferDims.d[0];
const unsigned int num_classes = maskLayer->inferDims.d[1];
if(num_classes != detectionParams.numClassesConfigured) {
std::cerr << "WARNING: Num classes mismatch. Configured:" <<
detectionParams.numClassesConfigured << ", detected by network: " <<
num_classes << std::endl;
}
const unsigned int mask_instance_height= maskLayer->inferDims.d[2];
const unsigned int mask_instance_width = maskLayer->inferDims.d[3];
auto out_det = reinterpret_cast<MrcnnRawDetection*>( detectionLayer->buffer);
/* Cast to pointer-to-array-of-(W*H) floats so out_mask[k] steps one whole
 * per-class mask at a time. NOTE: a runtime array bound here is a compiler
 * extension (variably-modified type), not standard C++. */
auto out_mask = reinterpret_cast<float(*)[mask_instance_width *
mask_instance_height]>(maskLayer->buffer);
for(auto i = 0U; i < det_max_instances; i++) {
MrcnnRawDetection &rawDec = out_det[i];
/* Threshold on class 0's precluster threshold for all classes. */
if(rawDec.score < detectionParams.perClassPreclusterThreshold[0])
continue;
NvDsInferInstanceMaskInfo obj;
/* TLT boxes are already in pixels; clip corners, then derive width/height.
 * width/height can come out <= 0 (e.g. x1 beyond the clip range) — such
 * degenerate boxes are dropped just below. */
obj.left = CLIP(rawDec.x1, 0, networkInfo.width - 1);
obj.top = CLIP(rawDec.y1, 0, networkInfo.height - 1);
obj.width = CLIP(rawDec.x2, 0, networkInfo.width - 1) - rawDec.x1;
obj.height = CLIP(rawDec.y2, 0, networkInfo.height - 1) - rawDec.y1;
if(obj.width <= 0 || obj.height <= 0)
continue;
obj.classId = static_cast<int>(rawDec.class_id);
obj.detectionConfidence = rawDec.score;
/* Raw new[] — ownership transfers downstream with the object; presumably
 * released by DeepStream after the mask is attached (TODO confirm). */
obj.mask_size = sizeof(float)*mask_instance_width*mask_instance_height;
obj.mask = new float[mask_instance_width*mask_instance_height];
obj.mask_width = mask_instance_width;
obj.mask_height = mask_instance_height;
/* Select instance i's mask for its detected class: masks are laid out as
 * instance-major, then class. Stride uses numClassesConfigured, which
 * assumes it equals the tensor's class dim (warned about above). */
float *rawMask = reinterpret_cast<float *>(out_mask + i
* detectionParams.numClassesConfigured + obj.classId);
memcpy (obj.mask, rawMask, sizeof(float)*mask_instance_width*mask_instance_height);
objectList.push_back(obj);
}
return true;
}
This function successfully parses the output from peoplesegnetv2.etlt and stores the detection+mask data, so nvosd is able to display it perfectly.
Now what I would like to do is to mix these two functions so that I can parse the MRCNN .uff file and output the correct detection+mask data.
So far, this is what I have:
extern "C"
bool AureCustomMrcnn (std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
NvDsInferNetworkInfo const &networkInfo,
NvDsInferParseDetectionParams const &detectionParams,
std::vector<NvDsInferInstanceMaskInfo> &objectList) {
auto layerFinder = [&outputLayersInfo](const std::string &name)
-> const NvDsInferLayerInfo *{
for (auto &layer : outputLayersInfo) {
if (layer.dataType == FLOAT &&
(layer.layerName && name == layer.layerName)) {
return &layer;
}
}
return nullptr;
};
const NvDsInferLayerInfo *detectionLayer = layerFinder("mrcnn_detection");
const NvDsInferLayerInfo *maskLayer = layerFinder("mrcnn_mask/Sigmoid");
if (!detectionLayer || !maskLayer) {
std::cerr << "ERROR: some layers missing or unsupported data types "
<< "in output tensors" << std::endl;
return false;
}
if(maskLayer->inferDims.numDims != 4U) {
std::cerr << "Network output number of dims is : " <<
maskLayer->inferDims.numDims << " expect is 4"<< std::endl;
return false;
}
const unsigned int det_max_instances = maskLayer->inferDims.d[0];
const unsigned int num_classes = maskLayer->inferDims.d[1];
if(num_classes != detectionParams.numClassesConfigured) {
std::cerr << "WARNING: Num classes mismatch. Configured:" <<
detectionParams.numClassesConfigured << ", detected by network: " <<
num_classes << std::endl;
}
const unsigned int mask_instance_height= maskLayer->inferDims.d[2];
const unsigned int mask_instance_width = maskLayer->inferDims.d[3];
std::cout << "det_max_instances: "<<det_max_instances << "num_classes: "<<num_classes << "mask_instance_height: "<<mask_instance_height << "mask_instance_height: "<<mask_instance_height << std::endl;
auto out_det = reinterpret_cast<RawDetection*>( detectionLayer->buffer);
auto out_mask = reinterpret_cast<float(*)[mask_instance_width *
mask_instance_height]>(maskLayer->buffer);
for(auto i = 0U; i < det_max_instances; i++) {
RawDetection &rawDec = out_det[i];
if(rawDec.score < detectionParams.perClassPreclusterThreshold[0])
continue;
NvDsInferInstanceMaskInfo obj;
obj.left = CLIP(rawDec.x1*networkInfo.width, 0, networkInfo.width - 1);
obj.top = CLIP(rawDec.y1*networkInfo.height, 0, networkInfo.height - 1);
obj.width = CLIP((rawDec.x2-rawDec.x1)*networkInfo.width, 0, networkInfo.width - 1);
obj.height = CLIP((rawDec.y2-rawDec.y1)*networkInfo.height, 0, networkInfo.height - 1);
if(obj.width <= 0 || obj.height <= 0)
continue;
obj.classId = static_cast<int>(rawDec.class_id);
obj.detectionConfidence = rawDec.score;
obj.mask_size = sizeof(float)*mask_instance_width*mask_instance_height;
obj.mask = new float[mask_instance_width*mask_instance_height];
obj.mask_width = mask_instance_width;
obj.mask_height = mask_instance_height;
std::cout << "id:" << i << "l: "<<obj.left << "t: "<<obj.top << "w: "<<obj.width << "h: "<<obj.height << "clid:" << obj.classId << std::endl;
std::cout << "mask_width: " << obj.mask_width << "mask_height: " << obj.mask_height << std::endl;
float *rawMask = reinterpret_cast<float *>(out_mask + i
* detectionParams.numClassesConfigured + obj.classId);
memcpy (obj.mask, rawMask, sizeof(float)*mask_instance_width*mask_instance_height);
for(unsigned int k=0;k<mask_instance_height;k++){
for(unsigned int j=0;j<mask_instance_width;j++){
std::cout << obj.mask[k*mask_instance_height+j] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
objectList.push_back(obj);
}
return true;
}
This does successfully display the boxes, but the mask it displays is the whole detection rectangle rather than the desired object shape. Can you give me any hint on how to continue?