Hello
I was running a test suite on an amd64 machine and got a GPU out-of-memory error. I was able to pinpoint the problem to nvinfer1::IExecutionContext::execute() and enqueue(). It is perhaps related to this other issue: https://devtalk.nvidia.com/default/topic/1071670/jetson-agx-xavier/tensorrt-6-memory-leak/post/5430568/#5430568
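In short, the device memory in use keeps growing across passes of the following pattern, condensed from the full reproducer further down (logger, model and bindings are as in the full listing):

for (int loop = 0; loop < 10; ++loop) {
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
    nvinfer1::ICudaEngine* engine =
        runtime->deserializeCudaEngine(model.data(), model.size(), nullptr);
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();
    context->execute(1, bindings.data());  // commenting this call out stops the growth
    context->destroy();                    // everything is destroyed each pass,
    engine->destroy();                     // yet device memory is not fully released
    runtime->destroy();
}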
My system is:
Ubuntu 18.04.4 LTS (Linux 5.3.0-40-generic #32~18.04.1-Ubuntu x86_64 GNU/Linux)
g++ (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0
Cuda compilation tools, release 10.0, V10.0.326
tensorrt 6.0.1.5-1+cuda10.0

The minimal reproducer (tensorrt_leak.cu):
#include <array>
#include <cstdlib>
#include <exception>
#include <experimental/filesystem>
#include <fstream>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

#include <cuda_runtime_api.h>
#include <NvOnnxParser.h>
template<typename T>
struct NvInferDestroyDelete {
    void operator()(T* t) {
        std::cout << "Destroying object" << std::endl;
        t->destroy();
    }
};

template<typename T>
using NvInferUniquePtr = std::unique_ptr<T, NvInferDestroyDelete<T>>;
class TRTLogger : public nvinfer1::ILogger {
public:
    void log(Severity severity, const char* msg) override
    {
        static const std::array<const char*, 5> type{
            {"Internal Error", "Error", "Warning", "Info", "Verbose"}};
        std::cout << '[' << type.at(static_cast<size_t>(severity)) << "] "
                  << msg << '\n';
    }
};
std::streamoff stream_size(std::istream& f)
{
    std::istream::pos_type current_pos = f.tellg();
    if (-1 == current_pos) {
        return -1;
    }
    f.seekg(0, std::istream::end);
    std::istream::pos_type end_pos = f.tellg();
    f.seekg(current_pos);
    return end_pos - current_pos;
}

bool stream_read_string(std::istream& f, std::string& result)
{
    std::streamoff len = stream_size(f);
    if (len == -1) {
        return false;
    }
    result.resize(static_cast<std::string::size_type>(len));
    f.read(&result[0], len);
    return true;
}
std::string read_file(const std::experimental::filesystem::path& path)
{
    std::ifstream file(path, std::ios::binary);
    // disable skipping new lines in binary mode
    file.unsetf(std::ios::skipws);
    std::string result;
    if (!stream_read_string(file, result)) {
        throw std::runtime_error("Failed to read file");
    }
    return result;
}
auto get_gpuinfo()
{
    size_t free_byte;
    size_t total_byte;
    cudaMemGetInfo(&free_byte, &total_byte);
    double free_db = (double)free_byte;
    double total_db = (double)total_byte;
    double used_db = total_db - free_db;
    return used_db / 1024.0 / 1024.0;  // device memory in use, in MiB
}
constexpr inline void checkCuda(int status)
{
    if (status != 0) {
        std::abort();
    }
}

template<typename T>
struct CudaMemoryDeleter {
    void operator()(T* buf) {
        checkCuda(cudaFree(buf));
    }
};

template<typename T>
using CudaMemoryUniquePtr = std::unique_ptr<T, CudaMemoryDeleter<T>>;

template<typename T>
CudaMemoryUniquePtr<T> cuda_alloc(size_t number_of_elements)
{
    T* gpu_ptr{nullptr};
    auto bytes = number_of_elements * sizeof(T);
    auto status = cudaMalloc(&gpu_ptr, bytes);
    if (status != cudaSuccess) {
        std::cerr << "Failed to allocate " << bytes
                  << " bytes of GPU memory. Shutting down." << std::endl;
        std::terminate();
    }
    return CudaMemoryUniquePtr<T>(gpu_ptr);
}
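
// Owns one device buffer per engine binding and runs execute()/enqueue() on them.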
struct Inference {
    explicit Inference(nvinfer1::ICudaEngine* engine)
    {
        auto nb_bindings = engine->getNbBindings();
        for (int i = 0; i < nb_bindings; ++i) {
            auto dim = engine->getBindingDimensions(i);
            int num_elements{1};
            for (int j = 0; j < dim.nbDims; ++j) {
                num_elements *= dim.d[static_cast<size_t>(j)];
            }
            memory_ptrs.push_back(
                cuda_alloc<float>(static_cast<size_t>(num_elements)));
            bindings.push_back(memory_ptrs.back().get());
        }
    }

    void forward(NvInferUniquePtr<nvinfer1::IExecutionContext>& context)
    {
        auto batch_size = 1;
        // Commenting this line avoids leaking
        context->execute(static_cast<int>(batch_size), bindings.data());
        // Also happens with enqueue
        // context->enqueue(static_cast<int>(batch_size), bindings.data(), 0, nullptr);
    }

    std::vector<CudaMemoryUniquePtr<float>> memory_ptrs;
    std::vector<void*> bindings;
};
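
// Each pass prints the device memory in use, deserializes the engine and runs
// 100 executions; ten passes in total.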
int main()
{
    static TRTLogger logger;
    for (auto loop = 0; loop < 10; ++loop)
    {
        std::cout << "Memory used: " << get_gpuinfo() << std::endl;
        std::string model = read_file("santis.onnx.engine");
        auto infer = NvInferUniquePtr<nvinfer1::IRuntime>(
            nvinfer1::createInferRuntime(logger));
        auto engine = NvInferUniquePtr<nvinfer1::ICudaEngine>(
            infer->deserializeCudaEngine(model.data(), model.size(), nullptr));
        if (engine == nullptr) {
            throw std::runtime_error("Could not initialize engine");
        }
        auto context = NvInferUniquePtr<nvinfer1::IExecutionContext>(
            engine->createExecutionContext());
        Inference inference(engine.get());
        for (int i = 0; i < 100; ++i) {
            std::cout << "\rIteration " << i;
            std::cout.flush();
            inference.forward(context);
        }
        std::cout << std::endl;
    }
    return 0;
}
Compile command: nvcc tensorrt_leak.cu -o tensorrt_leak -lnvinfer -lstdc++fs
Execution command: ./tensorrt_leak
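The per-pass memory figures printed by the reproducer come from cudaMemGetInfo; the same growth should also be visible externally, e.g. in nvidia-smi, while ./tensorrt_leak is running.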
model.zip (81.8 MB)