DetectNetV2 C++ TensorRT inference, no result

I generated the TensorRT engine file with the following command:
./tlt-converter resnet34_peoplenet_pruned.etlt -k tlt_encode -o output_cov/Sigmoid,output_bbox/BiasAdd -d 3,544,960 -i nchw -t fp16 -e resnet34_peoplenet_pruned.etlt_b1_gpu0_fp16.engine -m 1 -b 1
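(Here -k is the TLT model encryption key, -o the comma-separated output node names, -d the input dimensions in CHW order, -i the input layout, -t the engine precision, -e the output engine path, and -m the maximum batch size.)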

I am using the following code to run inference, but I get no detections:

//g++ inferPeopleNet.cpp `pkg-config --cflags --libs opencv4` -I /usr/local/cuda-10.2/include -I /usr/include/aarch64-linux-gnu/ -L /usr/lib/aarch64-linux-gnu -lnvinfer -lnvinfer_plugin -L /usr/local/cuda-10.2/lib64 -lcudart -lcublas -lcurand

/* OpenCV headers */
#include <opencv2/core/core.hpp>
#include <opencv2/dnn/dnn.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/imgcodecs/imgcodecs.hpp>

#include <iostream>
#include <fstream>
#include <sstream>   // std::stringstream
#include <vector>
#include <cassert>

#include <cuda_runtime_api.h>  // cudaMalloc / cudaMemcpyAsync / cudaStream*
#include "NvInfer.h"
#include "NvInferPlugin.h"

// MIN/MAX are provided by the OpenCV headers (opencv2/core/cvdef.h)
#define CLIP(a,min,max) (MAX(MIN(a, max), min))
#define DIVIDE_AND_ROUND_UP(a, b) ((a + b - 1) / b)
  
using namespace cv;
using namespace std;

class Logger : public nvinfer1::ILogger
{
public:
	void log(nvinfer1::ILogger::Severity severity, const char* msg) override
	{
		// suppress info-level messages
		if (severity == Severity::kINFO) return;

		switch (severity)
		{
			case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
			case Severity::kERROR: std::cerr << "ERROR: "; break;
			case Severity::kWARNING: std::cerr << "WARNING: "; break;
			case Severity::kINFO: std::cerr << "INFO: "; break;
			default: std::cerr << "UNKNOWN: "; break;
		}
		std::cerr << msg << std::endl;
	}
};


int main()
{
	std::string engineFilePath = "/opt/nvidia/deepstream/deepstream-5.0/samples/models/tlt_pretrained_models/peoplenet/resnet34_peoplenet_pruned.etlt_b1_gpu0_fp16.engine";
	std::string imagePath = "test.jpg";

	// General parameters
	uint16_t m_InputH, m_OutputW;
	uint16_t m_InputW, m_OutputH;
	uint16_t m_InputC;
	uint64_t m_InputSize, m_OutputBBoxSize, m_OutputConfidenceSize;
	uint16_t m_NumOutputClasses;

	// TRT specific parameters
	uint16_t m_maxBatchSize = 1;
	int m_InputIndex = -1;
	int m_OutputBboxIndex = -1, m_OutputClassIndex = -1;

	Logger m_Logger;
	nvinfer1::ICudaEngine* m_Engine;
	nvinfer1::IExecutionContext* m_Context;
	nvinfer1::IRuntime* runtime;

	std::vector<void*> m_Bindings;
	std::vector<float*> m_TrtOutputBuffers;  
	cudaStream_t m_CudaStream;


	m_InputW = 960;
	m_InputH = 544;
	m_InputC = 3;
	m_OutputW = 60;
	m_OutputH = 34;
	m_NumOutputClasses = 3;

	m_InputSize = m_InputW * m_InputH * m_InputC;
	m_OutputBBoxSize = m_OutputW * m_OutputH * m_NumOutputClasses * 4;
	m_OutputConfidenceSize = m_OutputW * m_OutputH * m_NumOutputClasses;
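	// DetectNetV2 output layout: output_cov/Sigmoid is numClasses x gridH x gridW
	// (per-cell confidence), output_bbox/BiasAdd is (numClasses*4) x gridH x gridW
	// (per-cell box coordinates).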
	
	
	// Deserializing engine	
	// reading the model in memory
	std::cout << "[Info] Loading TRT Engine...\n";
	std::ifstream cache(engineFilePath, std::ios::binary);
	assert(cache.good());
	std::stringstream trtModelStream;
	trtModelStream << cache.rdbuf();
	cache.close();

	// calculating model size
	trtModelStream.seekg(0, std::ios::end);
	const int modelSize = trtModelStream.tellg();
	trtModelStream.seekg(0, std::ios::beg);
	void* modelMem = malloc(modelSize);
	trtModelStream.read((char*) modelMem, modelSize);

	runtime = nvinfer1::createInferRuntime(m_Logger);
	m_Engine = runtime->deserializeCudaEngine(modelMem, modelSize, nullptr);
	free(modelMem);
	runtime->destroy();
	std::cout << "[Info] Loading Complete!\n";

	if(m_Engine == nullptr)
	{
		std::cout << "[Error] TensorRT engine loading failed\n";
		return -1;	
	}

	m_Context = m_Engine->createExecutionContext();
	if(m_Context == nullptr)
	{
		std::cout << "[Error] TensorRT getting context failed\n";
		return -2;	
	}

	// Get the bindings
	std::cout << "[Info] Getting the Bindings...\n";
	m_Bindings.resize(m_Engine->getNbBindings(), nullptr);
	m_TrtOutputBuffers.resize(m_Engine->getNbBindings() - 1, nullptr);	
	m_InputIndex = m_Engine->getBindingIndex("input_1");
	m_OutputBboxIndex = m_Engine->getBindingIndex("output_bbox/BiasAdd");
	m_OutputClassIndex = m_Engine->getBindingIndex("output_cov/Sigmoid");
	if (m_InputIndex == -1 || m_OutputBboxIndex == -1 || m_OutputClassIndex == -1)
	{
		std::cout << "[Error] TensorRT binding not found\n";
		return -3;
	}
	std::cout << "[Info] Bindings size : " << m_Engine->getNbBindings() << "\n";
	std::cout << "[Info] Bindings " << m_InputIndex << " " << m_OutputBboxIndex << " " << m_OutputClassIndex << "\n";
	
	// Allocate device buffers for the bindings and pinned host buffers for the
	// outputs (note: CUDA return codes are not checked here)
	cudaMalloc(&m_Bindings.at(m_InputIndex), m_maxBatchSize * m_InputSize * sizeof(float));
	cudaMalloc(&m_Bindings.at(m_OutputBboxIndex), m_maxBatchSize * m_OutputBBoxSize * sizeof(float));
	cudaMalloc(&m_Bindings.at(m_OutputClassIndex), m_maxBatchSize * m_OutputConfidenceSize * sizeof(float));
	cudaMallocHost(&m_TrtOutputBuffers[0], m_OutputBBoxSize * m_maxBatchSize * sizeof(float));
	cudaMallocHost(&m_TrtOutputBuffers[1], m_OutputConfidenceSize * m_maxBatchSize * sizeof(float));
	cudaStreamCreate(&m_CudaStream);


	// Loading input image to device
	std::cout << "[Info] Loading input image\n";
	Mat inputImage = imread(imagePath);
	//cv::cvtColor(inputImage, inputImage, cv::COLOR_BGR2RGB);
	Mat inferImage = cv::dnn::blobFromImage(inputImage, 0.0039215697906911373, cv::Size(m_InputW, m_InputH), cv::Scalar(0.0, 0.0, 0.0), true, false);
	//Mat inferImage = cv::dnn::blobFromImage(inputImage, 1.0, cv::Size(m_InputW, m_InputH), cv::Scalar(0.0, 0.0, 0.0), false, false);
	cudaMemcpyAsync(m_Bindings.at(m_InputIndex), inferImage.data,
								  m_maxBatchSize * m_InputSize * sizeof(float), cudaMemcpyHostToDevice,
								  m_CudaStream);

	// Running Inference
	std::cout << "[Info] Running Inference\n";
	m_Context->enqueue(m_maxBatchSize, m_Bindings.data(), m_CudaStream, nullptr);
	//m_Context->execute(m_maxBatchSize, m_Bindings.data());
	cudaMemcpyAsync(m_TrtOutputBuffers.at(0), m_Bindings.at(m_OutputBboxIndex),
								  m_maxBatchSize * m_OutputBBoxSize * sizeof(float),
								  cudaMemcpyDeviceToHost, m_CudaStream);
	cudaMemcpyAsync(m_TrtOutputBuffers.at(1), m_Bindings.at(m_OutputClassIndex),
								  m_maxBatchSize * m_OutputConfidenceSize * sizeof(float),
								  cudaMemcpyDeviceToHost, m_CudaStream);

	// Decoding output buffers
	std::cout << "[Info] Decoding the output Buffers\n";
	int gridW = m_OutputW;
	int gridH = m_OutputH;
	int gridSize = gridW * gridH;
	std::vector<float> gcCentersX(gridW);
	std::vector<float> gcCentersY(gridH);
	float bboxNormX = 35.0;
	float bboxNormY = 35.0;
	float* outputBboxBuf = &m_TrtOutputBuffers.at(0)[0];
	float* outputCovBuf = &m_TrtOutputBuffers.at(1)[0];

	int strideX = DIVIDE_AND_ROUND_UP(m_InputW, gridW);
	int strideY = DIVIDE_AND_ROUND_UP(m_InputH, gridH);
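	// Each grid cell covers strideX x strideY input pixels; cell centers are
	// scaled by the bbox normalization factor (35.0). These constants match the
	// DeepStream DetectNetV2 sample parser.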

	for (int i = 0; i < gridW; i++)
	{
		gcCentersX[i] = (float)(i * strideX + 0.5);
		gcCentersX[i] /= (float)bboxNormX;
	}
	for (int i = 0; i < gridH; i++)
	{
		gcCentersY[i] = (float)(i * strideY + 0.5);
		gcCentersY[i] /= (float)bboxNormY;
	}

	for (int c = 0; c < m_NumOutputClasses; c++)
	{
		float *outputX1 = outputBboxBuf + (c * 4 * gridW * gridH);

		float *outputY1 = outputX1 + gridSize;
		float *outputX2 = outputY1 + gridSize;
		float *outputY2 = outputX2 + gridSize;

		float threshold = 0.1;//detectionParams.perClassPreclusterThreshold[c];
		for (int h = 0; h < gridH; h++)
		{
			for (int w = 0; w < gridW; w++)
			{
				int i = w + h * gridW;
				float confidence = outputCovBuf[c * gridSize + i];
				if (confidence >= threshold)
				{
					//NvDsInferObjectDetectionInfo object;
					float rectX1f, rectY1f, rectX2f, rectY2f;

					rectX1f = (outputX1[w + h * gridW] - gcCentersX[w]) * -bboxNormX;
					rectY1f = (outputY1[w + h * gridW] - gcCentersY[h]) * -bboxNormY;
					rectX2f = (outputX2[w + h * gridW] + gcCentersX[w]) * bboxNormX;
					rectY2f = (outputY2[w + h * gridW] + gcCentersY[h]) * bboxNormY;

					//object.classId = c;
					//object.detectionConfidence = outputCovBuf[c * gridSize + i];

					/* Clip object box co-ordinates to network resolution */
					rectX1f = CLIP(rectX1f, 0, m_InputW - 1);
					rectY1f = CLIP(rectY1f, 0, m_InputH - 1);
					rectX2f = CLIP(rectX2f, 0, m_InputW - 1);
					rectY2f = CLIP(rectY2f, 0, m_InputH - 1);

					//Prevent underflows
					if(((rectX2f - rectX1f) < 0) || ((rectY2f - rectY1f) < 0))
						continue;

					// Detected boxes
					std::cout << "[Info] ClassIdx : " << c << " BBox : " << rectX1f << "," << rectY1f << "," << (rectX2f) << "," << (rectY2f) << "," << confidence << "\n";          
				}
			}
		}
	}
	//imshow("Display", inputImage);
	//waitKey();

	return 0;
}

Can you first debug your code to check at which step you still get output and at which step you don't?
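For example, after the device-to-host copies you can dump the maximum coverage value to see whether the network produced anything at all (a minimal debug sketch using the buffers from your code):

cudaStreamSynchronize(m_CudaStream); // make sure the async copies have finished
float maxCov = 0.f;
for (uint64_t i = 0; i < m_OutputConfidenceSize; i++)
    if (m_TrtOutputBuffers[1][i] > maxCov)
        maxCov = m_TrtOutputBuffers[1][i];
std::cout << "[Debug] max coverage = " << maxCov << "\n";

If this prints 0, the problem is upstream of the decoding loop.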

What's the difference between enqueue and execute? execute works. I'm using TensorRT 7.1.3.4.

See TensorRT: nvinfer1::IExecutionContext Class Reference
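In short: execute() is synchronous and returns only after inference has finished, while enqueue() is asynchronous and only queues the work on the given CUDA stream before returning immediately. The code above decodes m_TrtOutputBuffers right after the cudaMemcpyAsync calls, without a cudaStreamSynchronize, so with enqueue() the host buffers are most likely still empty when they are parsed; that would explain why execute() works. A minimal sketch of a corrected enqueue() sequence, using the names from the code above:

// enqueue() only queues inference and the copies on m_CudaStream.
m_Context->enqueue(m_maxBatchSize, m_Bindings.data(), m_CudaStream, nullptr);
cudaMemcpyAsync(m_TrtOutputBuffers.at(0), m_Bindings.at(m_OutputBboxIndex),
                m_maxBatchSize * m_OutputBBoxSize * sizeof(float),
                cudaMemcpyDeviceToHost, m_CudaStream);
cudaMemcpyAsync(m_TrtOutputBuffers.at(1), m_Bindings.at(m_OutputClassIndex),
                m_maxBatchSize * m_OutputConfidenceSize * sizeof(float),
                cudaMemcpyDeviceToHost, m_CudaStream);
cudaStreamSynchronize(m_CudaStream); // wait before touching the host buffers
// ... now it is safe to decode m_TrtOutputBuffers ...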