Runtime error of Tensorrt 7.1.3 on Jetson Xavier AGX

We are developing a deep learning application with Nvidia Jetson AGX Xavier. There has a run-time error occured, Segmentation fault (core dump),when we destruct an object containing ICUDAEngine with the TensorRT SDK C++ API.

The runtime enviroments of Xavier is,
TensorRT 7.1.3
cuda 10.2.89
libcudnn 8.0.0.180
opencv 4.1.1
JetPack 4.4.1

#include <dirent.h>

#include <iostream>
#include <vector>
#include <opencv2/opencv.hpp>

#include "buffers.h"
#include "common.h"
#include "NvInfer.h"
#include "NvUffParser.h"
#include <cuda_runtime_api.h>

static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int INPUT_C = 3;

template <typename T>
using SampleUniquePtr = std::unique_ptr<T, samplesCommon::InferDeleter>;
struct Detection
{
	SampleUniquePtr<nvinfer1::IExecutionContext> context;
	std::shared_ptr<nvinfer1::ICudaEngine> engine;
	cudaStream_t stream;
	std::string pathToModelBin;
	std::string input_name;
	std::string output_name;
	int batchSize;
	int class_num;

	Detection(const std::string &pathToModelBin);
	Detection(const Detection& bdobject);
	~Detection();

	void read();
	void infer(const cv::Mat &frame);
	bool processInput(const samplesCommon::BufferManager& buffers, const cv::Mat &frame);
};

Detection::Detection(const std::string &pathToModelBin)
	: pathToModelBin(pathToModelBin), engine(nullptr), context(nullptr)
{
	CHECK(cudaStreamCreate(&stream));
	input_name = "input";
	output_name = "MobilenetV2/Predictions/Reshape_1";
	batchSize = 1;
	class_num = 15;
}

Detection::Detection(const Detection& bdObject)
{
	this->pathToModelBin = bdObject.pathToModelBin;
	this->engine = bdObject.engine;
	this->stream = bdObject.stream;
	// this->context = bdObject.context;
}

Detection::~Detection() 
{
	cudaStreamSynchronize(stream);
	cudaStreamDestroy(stream);
	nvuffparser::shutdownProtobufLibrary();
	std::cout << "~Detection()" << std::endl;
}

void Detection::read()
{
	std::fstream file;
	std::cout << "loading filename from:" << pathToModelBin << std::endl;
	nvinfer1::IRuntime* trtRuntime;
	file.open(pathToModelBin, std::ios::binary | std::ios::in);
	file.seekg(0, std::ios::end);
	int length = file.tellg();
	file.seekg(0, std::ios::beg);
	std::unique_ptr<char[]> data(new char[length]);
	file.read(data.get(), length);
	file.close();
	trtRuntime = createInferRuntime(sample::gLogger.getTRTLogger());
	
	ICudaEngine* engine1 = trtRuntime->deserializeCudaEngine(data.get(), length, nullptr);
	engine = std::shared_ptr<nvinfer1::ICudaEngine>(engine1, samplesCommon::InferDeleter());
	context = SampleUniquePtr<nvinfer1::IExecutionContext>(engine->createExecutionContext());
	std::cout << "deserialize done" << std::endl;
}

bool Detection::processInput(const samplesCommon::BufferManager& buffers, const cv::Mat &frame)
    {
        cv::Mat dst = cv::Mat::zeros(INPUT_H, INPUT_W, CV_8UC3);
        cv::resize(frame, dst, dst.size());
        cv::cvtColor(dst, dst, cv::COLOR_BGR2RGB);

        float* hostDataBuffer = static_cast<float*>(buffers.getHostBuffer(input_name));
        for (int c = 0; c < INPUT_C; ++c)
        {
            for (int i = 0; i < INPUT_H; ++i)
            {
                cv::Vec3b *p1 = dst.ptr<cv::Vec3b>(i);
                for (int j = 0; j < INPUT_W; ++j)
                {
                    hostDataBuffer[c * INPUT_W * INPUT_H + i * INPUT_W + j] = (p1[j][c] - 127.5) / 127.5;
                }
            }
        }
        return true;
    }

void Detection::infer(const cv::Mat &frame)
{
	context = SampleUniquePtr<nvinfer1::IExecutionContext>(engine->createExecutionContext());
	samplesCommon::BufferManager buffers(engine, batchSize);
	if (!processInput(buffers, frame))
	{
		return;
	}

	buffers.copyInputToDeviceAsync(stream);

	const bool status = context->execute(batchSize, buffers.getDeviceBindings().data());
	if (!status)
	{
		return;
	}

	buffers.copyOutputToHostAsync(stream);
	const float *results = static_cast<const float*>(buffers.getHostBuffer(output_name));
	std::cout << "results: ";
	for (size_t i = 0; i < batchSize * class_num; i++)
	{
		std::cout << results[i] << ", ";
	}
	std::cout << std::endl;
}


void printHelpInfo()
{
    std::cout << "Usage: ./test data_path\n";
}

int parseInputFiles(const std::string file_path, std::vector<std::string>& filenames)
{
    DIR *pDir;
    struct dirent* ptr;
    if(!(pDir = opendir(file_path.c_str())))
        return -1;
    while((ptr = readdir(pDir))!=0) {
        if (strcmp(ptr->d_name, ".") != 0 && strcmp(ptr->d_name, "..") != 0)
            filenames.push_back(file_path + "/" + ptr->d_name);
    }
    closedir(pDir);
    return 0;
}

int main(int argc, char** argv)
{
    if (argc < 2)
    {
        printHelpInfo();
        return 0;
    }
    
    try {
        std::string file_path = argv[1];

		Detection *detector = nullptr;
		detector = new Detection("./model/seg_model_jetson_trt7.engine");
		detector->read();

		std::vector<std::string> imageNames;
		parseInputFiles(file_path, imageNames);
		if (imageNames.empty()) throw std::logic_error("No suitable images were found");

		for (size_t i = 0; i < imageNames.size(); i++)
		{
			cv::Mat image = cv::imread(imageNames[i]);
			detector->infer(image);
		}
		
		delete detector; // RUNTIME ERROR here ! "Segmentation fault (core dumped)"
	}
	catch (const std::exception& error) {
		std::cout << error.what() << std::endl;
		return 1;
	}
	catch (...) {
		std::cout << "Unknown/internal exception happened." << std::endl;
		return 1;
	}

	return 0;
}

Hi,

Please add a synchronization (ex. cudaStreamSynchronize(stream)) before accessing the output buffer with CPU.
You will need to make sure all the GPU tasks is done and the buffer is ready.

Thanks.

hi,
The application is crashed when delete detector object,not in inference.

delete detector; // RUNTIME ERROR here ! "Segmentation fault (core dumped)"

And it has called synchronization function before output CPU buffer in the follow code.

buffers.copyOutputToHostAsync(stream);

hi,
we check the codes again.we found that IExecutionContext should not be set as a class member variant, if we use it as a temp variant in the function, there is not any error in the runtime .

Good to know this!
Also thanks for the update.