I am using the TLT pre-trained model DashCamNet cache file (the weights and config file from the DeepStream SDK) for inference with TensorRT. The issue is that the predicted bounding boxes are offset from the regions of interest.
Here’s the code snippet that I am using for preprocessing and parsing the output:
// Pre-processing and inference.
//
// Converts the OpenCV frame into the planar (CHW) RGB float tensor expected by
// the network, scales every value by net_scale_factor, uploads the tensor to
// the device and runs the TensorRT execution context.
//
// NOTE(review): assumes `frame` is an 8-bit, 3-channel BGR image that has
// already been resized to mWidth x mHeight (960x544) -- confirm at the call
// site; no resize is performed here.
float net_scale_factor = 0.0039215697906911373; // ~1/255: maps [0,255] to [0,1]

// After this call the interleaved pixel order in `frame` is r,g,b,r,g,b,...
cv::cvtColor(frame,frame,CV_BGR2RGB);

int kINPUT_C = 3, mHeight = 544, mWidth = 960;

// Element counts of one full CHW image and of one channel plane. Both are
// derived from the network dimensions so the two strides always agree.
const int volImg = kINPUT_C * mHeight * mWidth;
const int volChl = mHeight * mWidth;

for (int i = 0; i < mMaxBatchSize; ++i)
{
    for (int c = 0; c < kINPUT_C; ++c)
    {
        for (int y = 0; y < mHeight; ++y)
        {
            // Row pointers are valid whether or not the Mat is continuous, so
            // no intermediate staging buffer is needed.
            const uchar* row = frame.ptr<uchar>(y);
            for (int x = 0; x < mWidth; ++x)
            {
                // The frame is already RGB (see cvtColor above), so plane c
                // takes interleaved channel c directly. Indexing with `2 - c`
                // here would swap R and B a second time and feed BGR planes.
                data[i * volImg + c * volChl + y * mWidth + x] =
                    float(row[x * kINPUT_C + c]) * net_scale_factor;
            }
        }
    }
}
cudaMemcpy(mInputCUDA, data, mInputSize, cudaMemcpyHostToDevice);
// process with GIE
void* inferenceBuffers[] = { mInputCUDA, mOutputs[0].CUDA, mOutputs[1].CUDA };
if( !mContext->execute(1, inferenceBuffers) )
{
    printf(LOG_GIE "Failed to execute tensorRT context\n");
    numBoundingBoxes = 0;
}
Please refer to the post-processing code, which is available in C++ at /opt/nvidia/deepstream/deepstream/sources/libs/nvdsinfer_customparser/nvdsinfer_custombboxparser.cpp .
I have referred to that code for parsing the output, and I don’t think the issue is with the output parsing. It has more to do with the pre-processing of the input image. Can you check whether the channel order and the net-scale-factor in my code are correct?
Please double check your code.
For postprocess, refer to /opt/nvidia/deepstream/deepstream/sources/libs/nvdsinfer_customparser/nvdsinfer_custombboxparser.cpp .
For preprocess, refer to Run PeopleNet with tensorrt - #5 by steventel
I have double-checked both the output parsing (post-processing) and the input pre-processing, and made sure that I am doing exactly what is described in the reference links you pointed out.
Here’s the postprocess code snippet:
// Post-processing: decode the coverage and bounding-box grids into candidate
// rectangles, one list per class, keeping only cells whose coverage reaches
// preClusterThreshold.
//
// NOTE(review): the DeepStream reference parser uses the *network input*
// width/height for the stride and clipping computations below. Confirm that
// imgW/imgH hold the network input size (boxes are produced in that coordinate
// space and must be rescaled separately if the displayed frame differs).
float *outputCoverageBuffer = mOutputs[OUTPUT_CVG].CPU;
float *outputBboxBuffer = mOutputs[OUTPUT_BBOX].CPU;
const int ow = mOutputs[OUTPUT_CVG].dims.d[2]; // number of columns in bbox grid in X dimension
const int oh = mOutputs[OUTPUT_CVG].dims.d[1]; // number of rows in bbox grid in Y dimension
const int owh = ow * oh; // total number of bbox in grid
const int cls = GetNumClasses(); // number of object classes in coverage map
// Each class owns four consecutive planes (x1, y1, x2, y2) in the bbox layer.
const int coordsPerBox = 4;
float bboxNorm[2] = { 35.0, 35.0 };
int strideX = DIVIDE_AND_ROUND_UP(imgW, mOutputs[OUTPUT_BBOX].dims.d[2]);
int strideY = DIVIDE_AND_ROUND_UP(imgH, mOutputs[OUTPUT_BBOX].dims.d[1]);

// Normalized grid-cell centres along X (one per column) and Y (one per row).
// Computed once; std::vector replaces the non-standard variable-length arrays.
std::vector<float> gcCenters0(ow);
std::vector<float> gcCenters1(oh);
for (int i = 0; i < ow; i++)
{
    gcCenters0[i] = (float)(i * strideX + 0.5);
    gcCenters0[i] /= (float)bboxNorm[0];
}
for (int i = 0; i < oh; i++)
{
    gcCenters1[i] = (float)(i * strideY + 0.5);
    gcCenters1[i] /= (float)bboxNorm[1];
}

float preClusterThreshold = 0.10; // passed as config-file parameter in deepstream SDK
std::vector< std::vector<float6> > rects; // not populated in this stage
std::vector<float6> objectList;
std::vector<std::vector<cv::Rect>> PreMergeBoundingBoxes(cls);
std::vector<std::vector<float>> ConfidenceScores(cls);
std::vector<std::vector<int>> ResultIndices(cls); // not populated in this stage

for (unsigned int classIndex = 0; classIndex < static_cast<unsigned int>(cls); classIndex++)
{
    /* Pointers to memory regions containing the (x1,y1) and (x2,y2) coordinates
     * of rectangles in the output bounding box layer. Pointer arithmetic on
     * float* is already in elements, so the per-class offset is the number of
     * coordinate planes times the grid size (not a byte count). */
    float *outputX1 = outputBboxBuffer + classIndex * coordsPerBox * owh;
    float *outputY1 = outputX1 + owh;
    float *outputX2 = outputY1 + owh;
    float *outputY2 = outputX2 + owh;

    /* Iterate through each point in the grid and check if the rectangle at that
     * point meets the minimum threshold criteria. */
    for (int h = 0; h < oh; h++)
    {
        for (int w = 0; w < ow; w++)
        {
            const int i = w + h * ow;
            float confidence = outputCoverageBuffer[classIndex * owh + i];
            if (confidence < preClusterThreshold)
                continue;

            float rectX1f, rectY1f, rectX2f, rectY2f;

            /* Centering and normalization of the rectangle. */
            rectX1f = outputX1[i] - gcCenters0[w];
            rectY1f = outputY1[i] - gcCenters1[h];
            rectX2f = outputX2[i] + gcCenters0[w];
            rectY2f = outputY2[i] + gcCenters1[h];

            rectX1f *= -bboxNorm[0];
            rectY1f *= -bboxNorm[1];
            rectX2f *= bboxNorm[0];
            rectY2f *= bboxNorm[1];

            /* Clip parsed rectangles to frame bounds. Each coordinate is
             * clipped from its own value: X against the width, Y against the
             * height. Mixing them up here displaces every box. */
            rectX1f = CLIP(rectX1f, 0, imgW - 1);
            rectY1f = CLIP(rectY1f, 0, imgH - 1);
            rectX2f = CLIP(rectX2f, 0, imgW - 1);
            rectY2f = CLIP(rectY2f, 0, imgH - 1);

            //Prevent underflows
            if(((rectX2f - rectX1f) < 0) || ((rectY2f - rectY1f) < 0))
                continue;

            objectList.push_back(make_float6(classIndex, rectX1f,
                                             rectY1f, (rectX2f - rectX1f),
                                             (rectY2f - rectY1f), confidence));
            PreMergeBoundingBoxes[classIndex].push_back(cv::Rect(cv::Point2f(rectX1f, rectY1f),cv::Point2f(rectX2f,rectY2f)));
            ConfidenceScores[classIndex].push_back(confidence);
        }
    }
}
Can you please let me know if I’m missing anything?
Here are some comments about RGB and CHW ordering; I hope they help you implement the code.
With OpenCV, after imread, the Mat data is laid out as:
b0, g0, r0, b1, g1, r1, b2, g2, r2, …
DetectNet needs:
r0, r1, r2, … g0, g1, g2, … b0, b1, b2, …
I understand the input requirement, and the code snippet I posted in my previous reply does exactly this RGB + CHW conversion (r0, r1, r2, … g0, g1, g2, … b0, b1, b2, …).
The pixels are copied into data in RGB + CHW order, and data is then copied to device memory.
Hi. Did you resolve the pre- and post-processing issues? Could you please share an example of DetectNet inference using C++ and TensorRT? Thanks in advance.