TensorRT 7.0 memory leak

Description

I train my model with PyTorch, convert the *.pth model to *.onnx, and run inference with the TensorRT C++ API. My program first initializes TensorRT, then runs inference, and finally releases the resources. All three steps run inside one loop, as required by my task, so engine initialization, model inference, and resource release are repeated on every pass through the loop.
However, I found a memory leak: memory usage increases by 2M. When I comment out the following C++ line, the leak disappears:
context->enqueue(1, buffers_, stream, nullptr);
Could you help me to solve this problem?

Environment

TensorRT Version: TensorRT-7.0.0.11-18.04
GPU Type: GTX 1070 notebook
Nvidia Driver Version: 450.66
CUDA Version: V10.0.130
CUDNN Version: 7.6.5
Operating System + Version: Ubuntu 18.04
Python Version (if applicable): python 3.5
PyTorch Version (if applicable): 1.4.0

Relevant Files

Initial Process:
// Initialization: load a serialized TensorRT engine, create the runtime,
// engine and execution context, allocate one device buffer per binding and
// create the CUDA stream used for inference.
nvinfer1::IHostMemory* model_stream_{nullptr};
// The path literal uses plain ASCII quotes; the typographic quotes in the
// original paste are not valid C++ string delimiters.
// ReadTrtEngine presumably fills model_stream_ with the serialized engine
// read from the *.trt file -- TODO confirm against its definition.
ReadTrtEngine(path0 + meter_type + "/" + meter_type + ".trt", model_stream_);
// NOTE(review): runtime, engine and context are used without null checks;
// confirm how failures should be reported in the enclosing function.
runtime = nvinfer1::createInferRuntime(logger_);
engine = runtime->deserializeCudaEngine(model_stream_->data(), model_stream_->size(), nullptr);
// The serialized blob is released right after deserialization.
model_stream_->destroy();
context = engine->createExecutionContext();
// Swap with an empty temporary so buffer_size_ starts from a clean state
// on every re-initialization.
std::vector<int64_t>().swap(buffer_size_);
int nbBindings = engine->getNbBindings();
buffer_size_.resize(nbBindings);
for (int i = 0; i < nbBindings; ++i)
{
    nvinfer1::Dims dims = engine->getBindingDimensions(i);
    nvinfer1::DataType dtype = engine->getBindingDataType(i);
    // Size in bytes: element count * batch size (1) * bytes per element.
    int64_t totalSize = Volume(dims) * 1 * GetElementSize(dtype);
    buffer_size_[i] = totalSize;
    // NOTE(review): buffers_ must have room for nbBindings entries -- its
    // declaration is not visible here; verify.
    CHECK(cudaMalloc(&buffers_[i], totalSize));
}
// Element counts of bindings 1 and 2 when read as float.
// NOTE(review): assumes the engine has at least three bindings and that
// bindings 1 and 2 hold float data -- confirm.
outSize1 = buffer_size_[1] / sizeof(float);
outSize2 = buffer_size_[2] / sizeof(float);
CHECK(cudaStreamCreate(&stream));

inference:
// Inference: preprocess one image, copy it to device binding 0, run the
// engine on `stream`, and copy bindings 1 and 2 back into host arrays.
// Number of float elements in bindings 1 and 2 (buffer_size_ holds byte
// sizes, as the cudaMemcpyAsync calls below use them directly).
outSize1 = buffer_size_[1] / sizeof(float);
outSize2 = buffer_size_[2] / sizeof(float);
// NOTE(review): these host arrays are allocated with new[] on every call and
// no matching delete[] is visible in this snippet -- confirm they are
// released after use, otherwise host memory grows on each inference.
float* out1 = new float[outSize1];
float* out2 = new float[outSize2];
org_image_width_ = img.cols;
org_image_height_ = img.rows;
// NOTE(review): if kInputModelWidth / kInputModelHeight are not compile-time
// constants this is a variable-length array (a compiler extension in C++)
// placed on the stack -- verify.
float image_data[3 * detect_parameters_.kInputModelWidth * detect_parameters_.kInputModelHeight];
PreProcess(img, image_data);
CHECK(cudaMemcpyAsync(buffers_[0], image_data, buffer_size_[0], cudaMemcpyHostToDevice, stream));
// Batch size 1; the return value of enqueue is not checked here.
context->enqueue(1, buffers_, stream, nullptr);
CHECK(cudaMemcpyAsync(out1, buffers_[1], buffer_size_[1], cudaMemcpyDeviceToHost, stream));
CHECK(cudaMemcpyAsync(out2, buffers_[2], buffer_size_[2], cudaMemcpyDeviceToHost, stream));
// Wait for the stream so out1/out2 are complete before they are read.
// NOTE(review): unlike the calls above, this result is not wrapped in CHECK.
cudaStreamSynchronize(stream);

release:
// Release: destroy the CUDA stream, free the device buffers, then destroy
// the TensorRT objects in the order context -> engine -> runtime.
// NOTE(review): the result of cudaStreamDestroy is not wrapped in CHECK,
// unlike the cudaFree calls below.
cudaStreamDestroy(stream);
// NOTE(review): only indices 0..2 are freed; confirm this matches the number
// of device buffers that were allocated for the engine's bindings.
CHECK(cudaFree(buffers_[0]));
CHECK(cudaFree(buffers_[1]));
CHECK(cudaFree(buffers_[2]));
context->destroy();
engine->destroy();
runtime->destroy();

Hi @jiangnan,
This is a known issue in TRT 7 that has been fixed in the latest available release.
Could you please try the latest release?

Thanks!

Thanks very much. I will try TensorRT 7.2.1 with CUDA 10.2 today.

I also have another problem:
I train my model with PyTorch, convert the *.pth file to *.onnx, and then use TensorRT C++ code to parse the ONNX model and save the engine as a *.trt file. However, I found that the size of the *.trt file differs between runs for the same ONNX file: in the code below, trtModelStream->size() is different every time for the same ONNX file. Is that a problem? And why does this happen?

This is my TensorRT code that parses the ONNX model and saves it as a TRT model file:
/// Parses an ONNX model, builds a TensorRT engine from it, serializes the
/// engine and writes the serialized bytes to a file.
///
/// @param modelFile       Path of the ONNX file to parse.
/// @param trtFilename     Path of the file that receives the serialized engine.
/// @param trtModelStream  Set to the serialized engine as soon as
///                        serialization succeeds; left untouched if an earlier
///                        step fails. The function does not destroy it.
/// @return true if the engine was built, serialized and written to
///         trtFilename; false if any step failed.
///
/// Every TensorRT object created here is destroyed on all exit paths,
/// including the failure paths.
bool onnxToTRTModel(string modelFile, string trtFilename, IHostMemory*& trtModelStream)
{
    IBuilder* builder = nullptr;
    nvinfer1::INetworkDefinition* network = nullptr;
    nvonnxparser::IParser* parser = nullptr;
    ICudaEngine* engine = nullptr;

    // Single cleanup routine shared by every exit path; destruction order
    // is parser, engine, network, builder.
    auto releaseAll = [&]() {
        if (parser != nullptr)
        {
            parser->destroy();
            parser = nullptr;
        }
        if (engine != nullptr)
        {
            engine->destroy();
            engine = nullptr;
        }
        if (network != nullptr)
        {
            network->destroy();
            network = nullptr;
        }
        if (builder != nullptr)
        {
            builder->destroy();
            builder = nullptr;
        }
    };

    builder = createInferBuilder(gLogger.getTRTLogger());
    if (builder == nullptr)
    {
        cout << "Failed to create the TensorRT builder" << std::endl;
        return false;
    }

    const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    network = builder->createNetworkV2(explicitBatch);
    if (network == nullptr)
    {
        cout << "Failed to create the network definition" << std::endl;
        releaseAll();
        return false;
    }

    parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger());
    if (parser == nullptr)
    {
        cout << "Failed to create the ONNX parser" << std::endl;
        releaseAll();
        return false;
    }

    if (!parser->parseFromFile(modelFile.c_str(), static_cast<int>(gLogger.getReportableSeverity())))
    {
        cout << "Failure while parsing ONNX file" << std::endl;
        // Release builder, network and parser instead of leaking them.
        releaseAll();
        return false;
    }

    builder->setMaxBatchSize(1);
    builder->setMaxWorkspaceSize(1 << 30);
    // getMaxWorkspaceSize() returns std::size_t, so %zu is the matching
    // conversion; %d with a size_t argument is undefined behavior.
    printf("builder->getMaxWorkspaceSize() = %zu\n", builder->getMaxWorkspaceSize());

    engine = builder->buildCudaEngine(*network);
    // Runtime check rather than assert(): assert is compiled out under
    // NDEBUG and a null engine would then be dereferenced.
    if (engine == nullptr)
    {
        cout << "Failed to build the TensorRT engine" << std::endl;
        releaseAll();
        return false;
    }

    IHostMemory* serialized = engine->serialize();
    if (serialized == nullptr)
    {
        cout << "Failed to serialize the TensorRT engine" << std::endl;
        releaseAll();
        return false;
    }
    trtModelStream = serialized;

    std::ofstream file;
    file.open(trtFilename.c_str(), std::ios::binary);
    if (file.is_open())
    {
        file.write(static_cast<const char*>(trtModelStream->data()),
                   static_cast<std::streamsize>(trtModelStream->size()));
    }
    const bool written = file.is_open() && file.good();
    file.close();

    releaseAll();

    if (!written)
    {
        cout << "Failed to write the *.trt file" << std::endl;
        return false;
    }

    cout << "*.trt file has been saved." << endl;
    return true;
}