DetectNetV2 C++ TensorRT inference, no result

I generated the TensorRT engine file with the following command:
./tlt-converter resnet34_peoplenet_pruned.etlt -k tlt_encode -o output_cov/Sigmoid,output_bbox/BiasAdd -d 3,544,960 -i nchw -t fp16 -e resnet34_peoplenet_pruned.etlt_b1_gpu0_fp16.engine -m 1 -b 1
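(Here -k is the TLT model encryption key, -o the comma-separated output node names, -d the input dimensions in CHW order, -i the input layout, -t the engine precision, -e the output engine path, and -m the maximum batch size.)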

I am using the following code to run inference, but I get no detections:

//g++ inferPeopleNet.cpp `pkg-config --cflags --libs opencv4` -I /usr/local/cuda-10.2/include -I /usr/include/aarch64-linux-gnu/ -L /usr/lib/aarch64-linux-gnu -lnvinfer -lnvinfer_plugin -L /usr/local/cuda-10.2/lib64 -lcudart -lcublas -lcurand

/* OpenCV headers */
#include <opencv2/core/core.hpp>
#include <opencv2/dnn/dnn.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/imgcodecs/imgcodecs.hpp>

#include <iostream>
#include <fstream>
#include <sstream>   // std::stringstream
#include <vector>
#include <cassert>

#include <cuda_runtime_api.h>  // cudaMalloc / cudaMemcpyAsync / cudaStream*
#include "NvInfer.h"
#include "NvInferPlugin.h"

// MIN/MAX are provided by the OpenCV headers (opencv2/core/cvdef.h)
#define CLIP(a,min,max) (MAX(MIN(a, max), min))
#define DIVIDE_AND_ROUND_UP(a, b) ((a + b - 1) / b)
  
using namespace cv;
using namespace std;

class Logger : public nvinfer1::ILogger
{
public:
	void log(nvinfer1::ILogger::Severity severity, const char* msg) override
	{
		// suppress info-level messages
		if (severity == Severity::kINFO) return;

		switch (severity)
		{
			case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
			case Severity::kERROR: std::cerr << "ERROR: "; break;
			case Severity::kWARNING: std::cerr << "WARNING: "; break;
			case Severity::kINFO: std::cerr << "INFO: "; break;
			default: std::cerr << "UNKNOWN: "; break;
		}
		std::cerr << msg << std::endl;
	}
};


int main()
{
	std::string engineFilePath = "/opt/nvidia/deepstream/deepstream-5.0/samples/models/tlt_pretrained_models/peoplenet/resnet34_peoplenet_pruned.etlt_b1_gpu0_fp16.engine";
	std::string imagePath = "test.jpg";

	// General parameters
	uint16_t m_InputH, m_OutputW;
	uint16_t m_InputW, m_OutputH;
	uint16_t m_InputC;
	uint64_t m_InputSize, m_OutputBBoxSize, m_OutputConfidenceSize;
	uint16_t m_NumOutputClasses;

	// TRT specific parameters
	uint16_t m_maxBatchSize = 1;
	int m_InputIndex = -1;
	int m_OutputBboxIndex = -1, m_OutputClassIndex = -1;

	Logger m_Logger;
	nvinfer1::ICudaEngine* m_Engine;
	nvinfer1::IExecutionContext* m_Context;
	nvinfer1::IRuntime* runtime;

	std::vector<void*> m_Bindings;
	std::vector<float*> m_TrtOutputBuffers;  
	cudaStream_t m_CudaStream;


	m_InputW = 960;
	m_InputH = 544;
	m_InputC = 3;
	m_OutputW = 60;
	m_OutputH = 34;
	m_NumOutputClasses = 3;

	m_InputSize = m_InputW * m_InputH * m_InputC;
	m_OutputBBoxSize = m_OutputW * m_OutputH * m_NumOutputClasses * 4;
	m_OutputConfidenceSize = m_OutputW * m_OutputH * m_NumOutputClasses;
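	// DetectNetV2 output layout: output_cov/Sigmoid is numClasses x gridH x gridW
	// (per-cell confidence), output_bbox/BiasAdd is (numClasses*4) x gridH x gridW
	// (per-cell box coordinates).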
	
	
	// Deserializing engine	
	// reading the model in memory
	std::cout << "[Info] Loading TRT Engine...\n";
	std::ifstream cache(engineFilePath, std::ios::binary);
	assert(cache.good());
	std::stringstream trtModelStream;
	trtModelStream << cache.rdbuf();
	cache.close();

	// calculating model size
	trtModelStream.seekg(0, std::ios::end);
	const int modelSize = trtModelStream.tellg();
	trtModelStream.seekg(0, std::ios::beg);
	void* modelMem = malloc(modelSize);
	trtModelStream.read((char*) modelMem, modelSize);

	runtime = nvinfer1::createInferRuntime(m_Logger);
	m_Engine = runtime->deserializeCudaEngine(modelMem, modelSize, nullptr);
	free(modelMem);
	runtime->destroy();
	std::cout << "[Info] Loading Complete!\n";

	if(m_Engine == nullptr)
	{
		std::cout << "[Error] TensorRT engine loading failed\n";
		return -1;	
	}

	m_Context = m_Engine->createExecutionContext();
	if(m_Context == nullptr)
	{
		std::cout << "[Error] TensorRT getting context failed\n";
		return -2;	
	}

	// Get the bindings
	std::cout << "[Info] Getting the Bindings...\n";
	m_Bindings.resize(m_Engine->getNbBindings(), nullptr);
	m_TrtOutputBuffers.resize(m_Engine->getNbBindings() - 1, nullptr);	
	m_InputIndex = m_Engine->getBindingIndex("input_1");
	m_OutputBboxIndex = m_Engine->getBindingIndex("output_bbox/BiasAdd");
	m_OutputClassIndex = m_Engine->getBindingIndex("output_cov/Sigmoid");
	if (m_InputIndex == -1 || m_OutputBboxIndex == -1 || m_OutputClassIndex == -1)
	{
		std::cout << "[Error] TensorRT binding not found\n";
		return -3;
	}
	std::cout << "[Info] Bindings size : " << m_Engine->getNbBindings() << "\n";
	std::cout << "[Info] Bindings " << m_InputIndex << " " << m_OutputBboxIndex << " " << m_OutputClassIndex << "\n";
	
	// Allocate device buffers for the bindings and pinned host buffers for the
	// outputs (note: CUDA return codes are not checked here)
	cudaMalloc(&m_Bindings.at(m_InputIndex), m_maxBatchSize * m_InputSize * sizeof(float));
	cudaMalloc(&m_Bindings.at(m_OutputBboxIndex), m_maxBatchSize * m_OutputBBoxSize * sizeof(float));
	cudaMalloc(&m_Bindings.at(m_OutputClassIndex), m_maxBatchSize * m_OutputConfidenceSize * sizeof(float));
	cudaMallocHost(&m_TrtOutputBuffers[0], m_OutputBBoxSize * m_maxBatchSize * sizeof(float));
	cudaMallocHost(&m_TrtOutputBuffers[1], m_OutputConfidenceSize * m_maxBatchSize * sizeof(float));
	cudaStreamCreate(&m_CudaStream);


	// Loading input image to device
	std::cout << "[Info] Loading input image\n";
	Mat inputImage = imread(imagePath);
	//cv::cvtColor(inputImage, inputImage, cv::COLOR_BGR2RGB);
	Mat inferImage = cv::dnn::blobFromImage(inputImage, 0.0039215697906911373, cv::Size(m_InputW, m_InputH), cv::Scalar(0.0, 0.0, 0.0), true, false);
	//Mat inferImage = cv::dnn::blobFromImage(inputImage, 1.0, cv::Size(m_InputW, m_InputH), cv::Scalar(0.0, 0.0, 0.0), false, false);
	cudaMemcpyAsync(m_Bindings.at(m_InputIndex), inferImage.data,
								  m_maxBatchSize * m_InputSize * sizeof(float), cudaMemcpyHostToDevice,
								  m_CudaStream);

	// Running Inference
	std::cout << "[Info] Running Inference\n";
	m_Context->enqueue(m_maxBatchSize, m_Bindings.data(), m_CudaStream, nullptr);
	//m_Context->execute(m_maxBatchSize, m_Bindings.data());
	cudaMemcpyAsync(m_TrtOutputBuffers.at(0), m_Bindings.at(m_OutputBboxIndex),
								  m_maxBatchSize * m_OutputBBoxSize * sizeof(float),
								  cudaMemcpyDeviceToHost, m_CudaStream);
	cudaMemcpyAsync(m_TrtOutputBuffers.at(1), m_Bindings.at(m_OutputClassIndex),
								  m_maxBatchSize * m_OutputConfidenceSize * sizeof(float),
								  cudaMemcpyDeviceToHost, m_CudaStream);

	// Decoding output buffers
	std::cout << "[Info] Decoding the output Buffers\n";
	int gridW = m_OutputW;
	int gridH = m_OutputH;
	int gridSize = gridW * gridH;
	std::vector<float> gcCentersX(gridW);
	std::vector<float> gcCentersY(gridH);
	float bboxNormX = 35.0;
	float bboxNormY = 35.0;
	float* outputBboxBuf = &m_TrtOutputBuffers.at(0)[0];
	float* outputCovBuf = &m_TrtOutputBuffers.at(1)[0];

	int strideX = DIVIDE_AND_ROUND_UP(m_InputW, gridW);
	int strideY = DIVIDE_AND_ROUND_UP(m_InputH, gridH);
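	// Each grid cell covers strideX x strideY input pixels; cell centers are
	// scaled by the bbox normalization factor (35.0). These constants match the
	// DeepStream DetectNetV2 sample parser.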

	for (int i = 0; i < gridW; i++)
	{
		gcCentersX[i] = (float)(i * strideX + 0.5);
		gcCentersX[i] /= (float)bboxNormX;
	}
	for (int i = 0; i < gridH; i++)
	{
		gcCentersY[i] = (float)(i * strideY + 0.5);
		gcCentersY[i] /= (float)bboxNormY;
	}

	for (int c = 0; c < m_NumOutputClasses; c++)
	{
		float *outputX1 = outputBboxBuf + (c * 4 * gridW * gridH);

		float *outputY1 = outputX1 + gridSize;
		float *outputX2 = outputY1 + gridSize;
		float *outputY2 = outputX2 + gridSize;

		float threshold = 0.1;//detectionParams.perClassPreclusterThreshold[c];
		for (int h = 0; h < gridH; h++)
		{
			for (int w = 0; w < gridW; w++)
			{
				int i = w + h * gridW;
				float confidence = outputCovBuf[c * gridSize + i];
				if (confidence >= threshold)
				{
					//NvDsInferObjectDetectionInfo object;
					float rectX1f, rectY1f, rectX2f, rectY2f;

					rectX1f = (outputX1[w + h * gridW] - gcCentersX[w]) * -bboxNormX;
					rectY1f = (outputY1[w + h * gridW] - gcCentersY[h]) * -bboxNormY;
					rectX2f = (outputX2[w + h * gridW] + gcCentersX[w]) * bboxNormX;
					rectY2f = (outputY2[w + h * gridW] + gcCentersY[h]) * bboxNormY;

					//object.classId = c;
					//object.detectionConfidence = outputCovBuf[c * gridSize + i];

					/* Clip object box co-ordinates to network resolution */
					rectX1f = CLIP(rectX1f, 0, m_InputW - 1);
					rectY1f = CLIP(rectY1f, 0, m_InputH - 1);
					rectX2f = CLIP(rectX2f, 0, m_InputW - 1);
					rectY2f = CLIP(rectY2f, 0, m_InputH - 1);

					//Prevent underflows
					if(((rectX2f - rectX1f) < 0) || ((rectY2f - rectY1f) < 0))
						continue;

					// Detected boxes
					std::cout << "[Info] ClassIdx : " << c << " BBox : " << rectX1f << "," << rectY1f << "," << (rectX2f) << "," << (rectY2f) << "," << confidence << "\n";          
				}
			}
		}
	}
	//imshow("Display", inputImage);
	//waitKey();

	return 0;
}

Can you first debug your code to check at which step you still get output and at which step you don't?
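For example, after the device-to-host copies you can dump the maximum coverage value to see whether the network produced anything at all (a minimal debug sketch using the buffers from your code):

cudaStreamSynchronize(m_CudaStream); // make sure the async copies have finished
float maxCov = 0.f;
for (uint64_t i = 0; i < m_OutputConfidenceSize; i++)
    if (m_TrtOutputBuffers[1][i] > maxCov)
        maxCov = m_TrtOutputBuffers[1][i];
std::cout << "[Debug] max coverage = " << maxCov << "\n";

If this prints 0, the problem is upstream of the decoding loop.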

What's the difference between enqueue and execute? execute works. I'm using TensorRT 7.1.3.4.

See TensorRT: nvinfer1::IExecutionContext Class Reference
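In short: execute() is synchronous and returns only after inference has finished, while enqueue() is asynchronous and only queues the work on the given CUDA stream before returning immediately. The code above decodes m_TrtOutputBuffers right after the cudaMemcpyAsync calls, without a cudaStreamSynchronize, so with enqueue() the host buffers are most likely still empty when they are parsed; that would explain why execute() works. A minimal sketch of a corrected enqueue() sequence, using the names from the code above:

// enqueue() only queues inference and the copies on m_CudaStream.
m_Context->enqueue(m_maxBatchSize, m_Bindings.data(), m_CudaStream, nullptr);
cudaMemcpyAsync(m_TrtOutputBuffers.at(0), m_Bindings.at(m_OutputBboxIndex),
                m_maxBatchSize * m_OutputBBoxSize * sizeof(float),
                cudaMemcpyDeviceToHost, m_CudaStream);
cudaMemcpyAsync(m_TrtOutputBuffers.at(1), m_Bindings.at(m_OutputClassIndex),
                m_maxBatchSize * m_OutputConfidenceSize * sizeof(float),
                cudaMemcpyDeviceToHost, m_CudaStream);
cudaStreamSynchronize(m_CudaStream); // wait before touching the host buffers
// ... now it is safe to decode m_TrtOutputBuffers ...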