Memory leak in IExecutionContext (TensorRT 6)

Hello

I was running a test suite on amd64 and hit a GPU out-of-memory error. I was able to pinpoint the problem to nvinfer1::IExecutionContext::execute() and enqueue(). It may be related to this other issue: https://devtalk.nvidia.com/default/topic/1071670/jetson-agx-xavier/tensorrt-6-memory-leak/post/5430568/#5430568

My system is:
Ubuntu 18.04.4 LTS (Linux 5.3.0-40-generic #32~18.04.1-Ubuntu x86_64 GNU/Linux)
g++ (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0
Cuda compilation tools, release 10.0, V10.0.326
tensorrt 6.0.1.5-1+cuda10.0
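
In a nutshell, the leaking pattern boils down to the sketch below (illustrative only: run_pass and its parameters are mine, not part of the repro; the full, compilable repro follows).

#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <iostream>
#include <string>
#include <vector>

// Each pass deserializes the engine, runs a few execute() calls and destroys
// everything again, yet the "used" figure reported by cudaMemGetInfo() keeps
// climbing between passes. With the execute() call commented out it stays flat.
void run_pass(nvinfer1::IRuntime& runtime, const std::string& plan,
              std::vector<void*>& bindings, int iterations)
{
    nvinfer1::ICudaEngine* engine =
        runtime.deserializeCudaEngine(plan.data(), plan.size(), nullptr);
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();
    for (int i = 0; i < iterations; ++i) {
        context->execute(1, bindings.data());  // same growth with enqueue()
    }
    context->destroy();
    engine->destroy();

    size_t free_bytes = 0, total_bytes = 0;
    cudaMemGetInfo(&free_bytes, &total_bytes);
    std::cout << "used MiB after pass: "
              << (total_bytes - free_bytes) / (1024.0 * 1024.0) << std::endl;
}

The full repro is below.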

#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <array>
#include <cstdlib>
#include <exception>
#include <experimental/filesystem>
#include <fstream>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

template<typename T>
struct NvInferDestroyDelete {
    void operator()(T* t) {
        std::cout << "Destroying object" << std::endl;
        t->destroy();
    }
};

template<typename T>
using NvInferUniquePtr = std::unique_ptr<T, NvInferDestroyDelete<T>>;

class TRTLogger : public nvinfer1::ILogger {
public:
  void log(Severity severity, const char* msg) override
  {
      static const std::array<const char*, 5> type{
          {"Internal Error", "Error", "Warning", "Info", "Verbose"}};
      std::cout << '[' << type.at(static_cast<size_t>(severity)) << "] "
                << msg << '\n';
  }
};

std::streamoff stream_size(std::istream& f)
{
    std::istream::pos_type current_pos = f.tellg();
    if (current_pos == std::istream::pos_type(-1)) {
        return -1;
    }

    f.seekg(0, std::istream::end);
    std::istream::pos_type end_pos = f.tellg();
    f.seekg(current_pos);
    return end_pos - current_pos;
}

bool stream_read_string(std::istream& f, std::string& result)
{
    std::streamoff len = stream_size(f);
    if (len == -1) {
        return false;
    }
    result.resize(static_cast<std::string::size_type>(len));
    f.read(&result[0], len);
    return true;
}

std::string read_file(const std::experimental::filesystem::path& path)
{
    std::ifstream file(path, std::ios::binary);
    // disable skipping new lines in binary mode
    file.unsetf(std::ios::skipws);
    std::string result;
    if (!stream_read_string(file, result)) {
        throw std::runtime_error("Failed to read file");
    }

    return result;
}

auto get_gpuinfo()
{
    size_t free_byte = 0;
    size_t total_byte = 0;

    cudaMemGetInfo(&free_byte, &total_byte);

    // Report used device memory in MiB.
    const double used_bytes =
        static_cast<double>(total_byte) - static_cast<double>(free_byte);
    return used_bytes / 1024.0 / 1024.0;
}

inline void checkCuda(cudaError_t status)
{
    if (status != cudaSuccess) {
        std::abort();
    }
}

template<typename T>
struct CudaMemoryDeleter {
    void operator()(T* buf) { 
        checkCuda(cudaFree(buf)); 
    }
};
template<typename T>
using CudaMemoryUniquePtr = std::unique_ptr<T, CudaMemoryDeleter<T>>;

template<typename T>
CudaMemoryUniquePtr<T> cuda_alloc(size_t number_of_elements)
{
    T* gpu_ptr{nullptr};
    auto bytes = number_of_elements * sizeof(T);
    auto status = cudaMalloc(&gpu_ptr, bytes);
    if (status != cudaSuccess) {
        std::cerr << "Failed to allocate " << bytes
                  << " bytes of GPU memory. Shutting down." << std::endl;
        std::terminate();
    }

    return CudaMemoryUniquePtr<T>(gpu_ptr);
}

struct Inference {
    explicit Inference(nvinfer1::ICudaEngine* engine)
    {
        auto nb_bindings = engine->getNbBindings();
        for (int i = 0; i < nb_bindings; ++i) {
            auto dim = engine->getBindingDimensions(i);
            int num_elements{1};
            for (int j = 0; j < dim.nbDims; ++j) {
                num_elements *= dim.d[static_cast<size_t>(j)];
            }
            memory_ptrs.push_back(
                cuda_alloc<float>(static_cast<size_t>(num_elements)));
            bindings.push_back(memory_ptrs.back().get());
        }
    }
    void forward(NvInferUniquePtr<nvinfer1::IExecutionContext>& context)
    {
        auto batch_size = 1;
        // Commenting this line avoids leaking
        context->execute(static_cast<int>(batch_size), bindings.data());
        // Also happens with enqueue
        // context->enqueue(static_cast<int>(batch_size), bindings.data(), 0, nullptr);
    }

    std::vector<CudaMemoryUniquePtr<float>> memory_ptrs;
    std::vector<void*> bindings;
};

int main()
{
    static TRTLogger logger;

    for(auto loop = 0; loop < 10; ++loop)
    {
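        // The runtime, engine, context and device buffers below are all
        // created inside this scope and destroyed at its end, yet the
        // "Memory used" figure printed next keeps growing between iterations.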
        std::cout << "Memory used: " << get_gpuinfo() << std::endl;
        std::string model = read_file("santis.onnx.engine");

        auto infer = NvInferUniquePtr<nvinfer1::IRuntime>(
            nvinfer1::createInferRuntime(logger));
        auto engine = NvInferUniquePtr<nvinfer1::ICudaEngine>(
            infer->deserializeCudaEngine(model.data(), model.size(), nullptr));
        if (engine == nullptr) {
            throw std::runtime_error("Could not initialize engine");
        }

        auto context = NvInferUniquePtr<nvinfer1::IExecutionContext>(
            engine->createExecutionContext());

        Inference inference(engine.get());
        for (int i = 0; i < 100; ++i) {
            std::cout << "\rIteration " << i;
            std::cout.flush();
            inference.forward(context);
        }
        std::cout << std::endl;
    }
    
    return 0;
}

Compile command: nvcc tensorrt_leak.cu -o tensorrt_leak -lnvinfer -lstdc++fs
Execution command: ./tensorrt_leak
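
For an external cross-check of the figures printed by get_gpuinfo(), the device can also be watched from a second terminal; the command below uses standard nvidia-smi options and is only a suggestion, not part of the repro:
nvidia-smi --query-gpu=memory.used --format=csv -l 1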
Attachment: model.zip (81.8 MB)

Hi,

I would recommend following the issue below for further updates, to avoid duplicate reports:
https://devtalk.nvidia.com/default/topic/1071670/jetson-agx-xavier/tensorrt-6-memory-leak/post/5430568/#5430568

Thanks