Description
I train my model with PyTorch and convert the *.pth model to *.onnx, then run inference with the TensorRT C++ API. My task requires the whole pipeline to run in a loop: first the TensorRT engine is initialized, then inference is performed, and finally all resources are released. This init/infer/release cycle repeats on every iteration.
However, I found a memory leak that grows by about 2 MB per loop iteration. When I comment out the following C++ line, the leak disappears:
context->enqueue(1, buffers_, stream, nullptr);
Could you help me solve this problem?
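The loop looks roughly like this. InitEngine / Infer / ReleaseEngine are placeholder names for the three blocks listed under "Relevant Files" below, and the cudaMemGetInfo logging is just one way to watch the leak, not part of my actual program:

#include <cuda_runtime_api.h>
#include <cstdio>

// Hypothetical wrappers around the three blocks shown under "Relevant Files".
void InitEngine();
void Infer();
void ReleaseEngine();

int main()
{
    for (int i = 0; i < 1000; ++i)
    {
        InitEngine();     // deserialize engine, create context, allocate buffers
        Infer();          // the leak disappears when context->enqueue() is commented out
        ReleaseEngine();  // destroy context/engine/runtime, free buffers

        size_t freeMem = 0, totalMem = 0;
        cudaMemGetInfo(&freeMem, &totalMem);  // if the leak is device-side, freeMem shrinks each iteration
        std::printf("iter %d: free device memory = %zu bytes\n", i, freeMem);
    }
    return 0;
}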
Environment
TensorRT Version: TensorRT-7.0.0.11-18.04
GPU Type: GTX 1070 notebook
Nvidia Driver Version: 450.66
CUDA Version: V10.0.130
CUDNN Version: 7.6.5
Operating System + Version: Ubuntu 18.04
Python Version (if applicable): Python 3.5
PyTorch Version (if applicable): 1.4.0
Relevant Files
Initialization:
// Read the serialized engine from disk and deserialize it
nvinfer1::IHostMemory* model_stream_{nullptr};
ReadTrtEngine(path0 + meter_type + "/" + meter_type + ".trt", model_stream_);
runtime = nvinfer1::createInferRuntime(logger_);
engine = runtime->deserializeCudaEngine(model_stream_->data(), model_stream_->size(), nullptr);
model_stream_->destroy();
context = engine->createExecutionContext();

// Allocate one device buffer per binding (one input, two outputs)
std::vector<int64_t>().swap(buffer_size_);
int nbBindings = engine->getNbBindings();
buffer_size_.resize(nbBindings);
for (int i = 0; i < nbBindings; ++i)
{
    nvinfer1::Dims dims = engine->getBindingDimensions(i);
    nvinfer1::DataType dtype = engine->getBindingDataType(i);
    int64_t totalSize = Volume(dims) * 1 * GetElementSize(dtype);
    buffer_size_[i] = totalSize;
    CHECK(cudaMalloc(&buffers_[i], totalSize));
}
outSize1 = buffer_size_[1] / sizeof(float);
outSize2 = buffer_size_[2] / sizeof(float);
CHECK(cudaStreamCreate(&stream));
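For reference, Volume and GetElementSize are my own small helpers; sketches consistent with how they are called above (treat the exact bodies as approximate):

#include <NvInfer.h>
#include <cstdint>

// Product of all dimensions of a binding.
inline int64_t Volume(const nvinfer1::Dims& dims)
{
    int64_t v = 1;
    for (int i = 0; i < dims.nbDims; ++i)
        v *= dims.d[i];
    return v;
}

// Size in bytes of one element of the given TensorRT data type.
inline int GetElementSize(nvinfer1::DataType dtype)
{
    switch (dtype)
    {
        case nvinfer1::DataType::kFLOAT: return 4;
        case nvinfer1::DataType::kINT32: return 4;
        case nvinfer1::DataType::kHALF:  return 2;
        case nvinfer1::DataType::kINT8:  return 1;
        default:                         return 0;
    }
}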
Inference:
// Host-side output buffers (note: these must be delete[]'d each iteration;
// post-processing is not shown here). outSize1/outSize2 were computed during initialization.
float* out1 = new float[outSize1];
float* out2 = new float[outSize2];

org_image_width_ = img.cols;
org_image_height_ = img.rows;

// Pack the image into the 3 x H x W float input buffer
float image_data[3 * detect_parameters_.kInputModelWidth * detect_parameters_.kInputModelHeight];
PreProcess(img, image_data);

// Upload the input, run the network, download both outputs, then wait on the stream
CHECK(cudaMemcpyAsync(buffers_[0], image_data, buffer_size_[0], cudaMemcpyHostToDevice, stream));
context->enqueue(1, buffers_, stream, nullptr);
CHECK(cudaMemcpyAsync(out1, buffers_[1], buffer_size_[1], cudaMemcpyDeviceToHost, stream));
CHECK(cudaMemcpyAsync(out2, buffers_[2], buffer_size_[2], cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
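One thing I am not sure about: if the .trt engine was built from the ONNX model with TensorRT 7's ONNX parser, the network is explicit-batch, and enqueueV2 rather than enqueue is the intended call. The same step would then look like this (whether this changes the leak is part of my question):

// Explicit-batch variant: the batch size comes from the network, not the call.
context->enqueueV2(buffers_, stream, nullptr);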
Release:
// Release everything created during initialization
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers_[0]));
CHECK(cudaFree(buffers_[1]));
CHECK(cudaFree(buffers_[2]));
// Destroy TensorRT objects in reverse order of creation: context, then engine, then runtime
context->destroy();
engine->destroy();
runtime->destroy();
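(For completeness: a possible hardening of the release step, not in my current code, is an RAII deleter so the destroy() calls run even on early exits. A sketch, similar in spirit to the InferDeleter pattern in the TensorRT samples:)

#include <memory>

// Calls destroy() on any TensorRT object when the unique_ptr goes out of scope.
struct TrtDestroyer
{
    template <typename T>
    void operator()(T* obj) const { if (obj) obj->destroy(); }
};
template <typename T>
using TrtUniquePtr = std::unique_ptr<T, TrtDestroyer>;

// Usage:
// TrtUniquePtr<nvinfer1::IExecutionContext> context{engine->createExecutionContext()};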