TensorRT 7 runs an explicit-shape model much slower than TensorRT 5

In TensorRT 7 I use an ONNX file, so I must build an explicit-shape model; I run it like an implicit-shape model by setting the min, opt, and max dims in the optimization profile to the same values. It runs much slower than TensorRT 5 using the UFF file.

How can I make it run as fast as possible?

Hi,

Could you please share the model file and platform details so we can help better?
TensorRT version:
Cuda version:
Cudnn version:
GPU:
Ubuntu/Windows OS:
Python:
Nvidia driver version:
TensorFlow/PyTorch version:

Thanks

@SunilJB I use a GPU 960, driver 440.48.02, and Ubuntu 18.04. Both models were converted from the same tensorflow_model.

I tested the UFF file in the Docker image nvcr.io/nvidia/tensorrt:19.07-py3 (TensorRT 5, CUDA 10.1). My code:

#include "NvInfer.h"
#include <iostream>
#include "NvUtils.h"
using namespace nvinfer1;
// #include "NvOnnxParser.h"
#include "NvUffParser.h"
using namespace nvuffparser;
#include <thread>
#include <mutex>
#include <chrono>
#include <list>

#include "common/logger.h"
#include "common/buffers.h"
// Path of the UFF model file that main() hands to the UFF parser.
std::string model_path = "model_run_with_TRT5.uff";

// Copies the first `dims.nbDims` extents of `dims` into `v`.
// `v` is resized to exactly `dims.nbDims` elements; previous contents are overwritten.
void convert_dims_to_vect(const nvinfer1::Dims& dims, std::vector<int>& v){
    const int rank = dims.nbDims;
    v.resize(rank);
    int axis = 0;
    for (int& extent : v)
        extent = dims.d[axis++];
}
// Fills `explicit_shapes` with one dimension vector per entry of `tensorNames`.
// Each name is resolved to a binding index on the context's engine, and the
// dimensions the engine reports for that binding are stored at the same position.
// `explicit_shapes` is resized to `tensorNames.size()`.
void make_explicit_shapes(IExecutionContext* context,const  std::vector<std::string>& tensorNames, std::vector<std::vector<int>>& explicit_shapes){
    const auto& engine = context->getEngine();
    const int count = tensorNames.size();
    explicit_shapes.resize(count);
    for (int k = 0; k < count; ++k){
        const char* bindingName = tensorNames[k].c_str();
        const int bindingIndex = engine.getBindingIndex(bindingName);
        // Write straight into the destination slot instead of going through a temporary.
        convert_dims_to_vect(engine.getBindingDimensions(bindingIndex), explicit_shapes[k]);
    }
}

// Name of the input tensor that main() registers with the UFF parser.
std::string input_name = "fts_input_images";
// Shape constants. In this listing only d[0] is read: dims2.d[0] is the batch
// size passed to enqueue() in run() and the per-batch image count in main();
// dims3.d[0] is passed to setMaxBatchSize(). dims1 is not referenced elsewhere
// in this listing.
Dims4 dims1(5,40,140,1);
Dims4 dims2(5,40,140,1);
Dims4 dims3(5,40,140,1);
// Binding names collected in main() (names containing "[profile" are skipped).
std::vector<std::string> tensorNames;
// Number of execution contexts / streams; main() overrides it from the
// environment variable "num_context" when that variable is set.
int num_context = 3;

// deviceBuffers[i] holds the device buffers for context slot i; main() sizes the
// outer vector and run() appends buffers to a slot the first time it is used.
std::vector<std::vector<samplesCommon::DeviceBuffer>> deviceBuffers;
// streams[i] is the CUDA stream for context slot i, created in main().
std::vector<cudaStream_t> streams;
// Runs one inference pass on `context` using the per-slot resources at index `i`.
//
// For every entry of `explicit_shapes_` a host buffer of
// product(shape) * element size * dims2.d[0] bytes is allocated; the matching
// device buffer in deviceBuffers[i] is allocated the first time the slot is used
// and reused on later calls. All host buffers are copied to the device, the
// network is enqueued with batch size dims2.d[0], all buffers are copied back,
// and the call blocks until streams[i] has drained.
//
// context           execution context to enqueue on.
// i                 index into the global `deviceBuffers` and `streams`.
// explicit_shapes_  per-binding dimensions, indexed by binding index.
//
// Terminates the process with exit(-1) if enqueue() fails; CUDA errors are
// handled by the CHECK macro.
void run(IExecutionContext* context, int i, std::vector<std::vector<int>> explicit_shapes_){
    std::vector<samplesCommon::DeviceBuffer>& slotDeviceBuffers = deviceBuffers[i];
    cudaStream_t& stream = streams[i];
    const int batchSize = dims2.d[0];

    // Loop counters are named `b` so they no longer shadow the slot index `i`.
    std::vector<samplesCommon::HostBuffer> hostBuffers;
    for (size_t b = 0; b < explicit_shapes_.size(); ++b){
        const std::vector<int>& shape = explicit_shapes_[b];
        auto elementSize = samplesCommon::getElementSize(context->getEngine().getBindingDataType(static_cast<int>(b)));
        size_t allocationSize = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) * elementSize * batchSize;
        hostBuffers.emplace_back(allocationSize);
        // Device buffers persist across calls; only allocate the ones this slot is missing.
        if (slotDeviceBuffers.size() <= b)
            slotDeviceBuffers.emplace_back(allocationSize);
    }

    std::vector<void*> mDeviceBindings;
    mDeviceBindings.reserve(slotDeviceBuffers.size());
    for (auto& buffer : slotDeviceBuffers)
        mDeviceBindings.emplace_back(buffer.data());

    // Host -> device for every binding.
    for (size_t b = 0; b < hostBuffers.size(); ++b){
        void* dstPtr = slotDeviceBuffers[b].data();
        void* srcPtr = hostBuffers[b].data();
        size_t byteSize = hostBuffers[b].size();
        CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, cudaMemcpyHostToDevice, stream));
    }

    if (!context->enqueue(batchSize, mDeviceBindings.data(), stream, nullptr)){
        std::cout<<"error when run graph TensorRT\n";
        exit(-1);
    }

    // Device -> host for every binding.
    for (size_t b = 0; b < hostBuffers.size(); ++b){
        void* srcPtr = slotDeviceBuffers[b].data();
        void* dstPtr = hostBuffers[b].data();
        size_t byteSize = hostBuffers[b].size();
        CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, cudaMemcpyDeviceToHost, stream));
    }

    // Errors from the asynchronous work surface here, so the result must be
    // checked; otherwise a failed batch would be counted as a successful one.
    CHECK(cudaStreamSynchronize(stream));
}

int main(int argc, char** argv) {
  if (getenv("num_context")!=NULL) num_context = std::stoi(std::string(getenv("num_context")));
  deviceBuffers.resize(num_context);
//   setReportableSeverity(Logger::Severity::kVERBOSE);// log of TensorRT

  auto builder = createInferBuilder(gLogger);
  builder->setMaxBatchSize(dims3.d[0]);
  auto network = builder->createNetwork();

    auto parser = createUffParser();
    auto order = UffInputOrder::kNHWC;
    nvinfer1::Dims3 dims(40,140,1);
    parser->registerInput(input_name.c_str(), dims, order);
    parser->registerOutput("fts_output");
	parser->parse(model_path.c_str(), *network, DataType::kFLOAT);

  std::cout<<"parse done\n";
  auto engine = builder->buildCudaEngine(*network);

    for (int i=0; i<engine->getNbBindings(); ++i){
        std::string name(engine->getBindingName(i));
        std::cout<<name<<"\n";
        if (name.find("[profile")==-1){
            tensorNames.emplace_back(name);
        }
    }

  std::vector<IExecutionContext*> contexts;
  std::vector<std::vector<int>> explicit_shapes;
  for (int i=0; i<num_context; ++i){
    contexts.emplace_back(engine->createExecutionContext());

    streams.emplace_back();
    CHECK(cudaStreamCreateWithPriority(&streams.back(), cudaStreamDefault, 0));

    explicit_shapes.clear();
    make_explicit_shapes(contexts.back(), tensorNames, explicit_shapes);
  }
    
    for (int i=0;i<num_context;++i)
        run(contexts[i], i, explicit_shapes);
    int n2 = 0;
    auto begin_time = std::chrono::high_resolution_clock::now();
    std::list<std::thread> v_thread;
    for (int i=0;i<num_context;++i)
        v_thread.emplace_back(run, contexts[i], i, explicit_shapes);
    for (;;){
        for (int i=0;i<num_context;++i){
            v_thread.front().join();
            n2 += dims2.d[0];
            if (n2>=5000){
                auto total_time = (std::chrono::high_resolution_clock::now() - begin_time);
                std::cout<< "process "<<n2<<" images in " << double(std::chrono::duration_cast<std::chrono::milliseconds>(total_time).count())/1000. << "s\n";
                n2 = 0;
                begin_time = std::chrono::high_resolution_clock::now();
            }
            v_thread.pop_front();
            v_thread.emplace_back(run, contexts[i], i, explicit_shapes);
        }
    }

}

And I tested the ONNX file with TensorRT 7, CUDA 10.2, and cuDNN 7.6.5. My code:

#include "NvInfer.h"
#include <iostream>
#include "NvUtils.h"
#include "NvOnnxParser.h"
using namespace nvinfer1;
#include <thread>
#include <mutex>
#include <chrono>
#include <list>

#include "common/logger.h"
#include "common/buffers.h"
// Path of the ONNX model file that main() hands to the ONNX parser.
std::string model_path = "model_run_with_TRT7.onnx";

// Copies the first `dims.nbDims` extents of `dims` into `v`.
// `v` is resized to exactly `dims.nbDims` elements; previous contents are overwritten.
void convert_dims_to_vect(const nvinfer1::Dims& dims, std::vector<int>& v){
    const int rank = dims.nbDims;
    v.resize(rank);
    int axis = 0;
    for (int& extent : v)
        extent = dims.d[axis++];
}
// Fills `explicit_shapes` with one dimension vector per entry of `tensorNames`.
// When the context's optimization profile index is non-zero, " [profile N]" is
// appended to each name before it is resolved to a binding index on the engine.
// The dimensions come from the context (not the engine) and are stored at the
// same position as the name. `explicit_shapes` is resized to `tensorNames.size()`.
void make_explicit_shapes(IExecutionContext* context,const  std::vector<std::string>& tensorNames, std::vector<std::vector<int>>& explicit_shapes){
    const int count = tensorNames.size();
    explicit_shapes.resize(count);

    const int profile = context->getOptimizationProfile();
    const std::string suffix = (profile != 0)
        ? " [profile " + std::to_string(profile) + "]"
        : std::string();

    for (int k = 0; k < count; ++k){
        const std::string bindingName = tensorNames[k] + suffix;
        const int bindingIndex = context->getEngine().getBindingIndex(bindingName.c_str());
        // Write straight into the destination slot instead of going through a temporary.
        convert_dims_to_vect(context->getBindingDimensions(bindingIndex), explicit_shapes[k]);
    }
}

// Name of the network input whose dimensions main() sets in every optimization
// profile and on every execution context.
std::string input_name = "fts_input_images:0";
// Input shapes given to each optimization profile in main(): dims1 as kMIN,
// dims2 as kOPT, dims3 as kMAX (all identical here). dims2 is also the shape set
// on each context, and dims2.d[0] is the per-batch image count in main().
Dims4 dims1(5,40,140,1);
Dims4 dims2(5,40,140,1);
Dims4 dims3(5,40,140,1);
// Binding names collected in main() (names containing "[profile" are skipped).
std::vector<std::string> tensorNames;
// Number of execution contexts / profiles / streams; main() overrides it from
// the environment variable "num_context" when that variable is set.
int num_context = 3;

// Author's note: meant to work around a memory leak when several contexts run in
// parallel. The lock/unlock calls around enqueueV2 in run() are commented out,
// so the mutex is currently unused.
std::mutex mtx;// fix issue leak memory when run multi context parallel

// deviceBuffers[i] holds the device buffers for context slot i; main() sizes the
// outer vector and run() appends buffers to a slot the first time it is used.
std::vector<std::vector<samplesCommon::DeviceBuffer>> deviceBuffers;
// streams[i] is the CUDA stream for context slot i, created in main().
std::vector<cudaStream_t> streams;
// Runs one inference pass on `context` using the per-slot resources at index `i`.
//
// For every entry of `explicit_shapes_` a host buffer of
// product(shape) * element size bytes is allocated; the matching device buffer
// in deviceBuffers[i] is allocated the first time the slot is used and reused on
// later calls. All host buffers are copied to the device, the network is
// enqueued with enqueueV2(), all buffers are copied back, and the call blocks
// until streams[i] has drained.
//
// context           execution context to enqueue on.
// i                 index into the global `deviceBuffers` and `streams`; also
//                   used to offset this slot's pointers in the bindings array.
// explicit_shapes_  per-binding dimensions, in binding order.
//
// An enqueueV2() failure is only reported on stdout and the function carries on;
// CUDA errors are handled by the CHECK macro.
void run(IExecutionContext* context, int i, std::vector<std::vector<int>> explicit_shapes_){
    std::vector<samplesCommon::DeviceBuffer>& slotDeviceBuffers = deviceBuffers[i];
    cudaStream_t& stream = streams[i];

    // Loop counters are named `b` so they no longer shadow the slot index `i`.
    std::vector<samplesCommon::HostBuffer> hostBuffers;
    for (size_t b = 0; b < explicit_shapes_.size(); ++b){
        const std::vector<int>& shape = explicit_shapes_[b];
        auto elementSize = samplesCommon::getElementSize(context->getEngine().getBindingDataType(static_cast<int>(b)));
        size_t allocationSize = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) * elementSize;
        hostBuffers.emplace_back(allocationSize);
        // Device buffers persist across calls; only allocate the ones this slot is missing.
        if (slotDeviceBuffers.size() <= b)
            slotDeviceBuffers.emplace_back(allocationSize);
    }

    // The bindings array starts with i * (bindings per slot) null entries, so this
    // slot's device pointers sit at the binding indices of optimization profile i.
    std::vector<void*> mDeviceBindings(i*hostBuffers.size(), nullptr);
    for (auto& buffer : slotDeviceBuffers)
        mDeviceBindings.emplace_back(buffer.data());

    // Host -> device for every binding.
    for (size_t b = 0; b < hostBuffers.size(); ++b){
        void* dstPtr = slotDeviceBuffers[b].data();
        void* srcPtr = hostBuffers[b].data();
        size_t byteSize = hostBuffers[b].size();
        CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, cudaMemcpyHostToDevice, stream));
    }

    // mtx.lock();
    if (!context->enqueueV2(mDeviceBindings.data(), stream, nullptr)){
        std::cout<<"error when run graph TensorRT\n";
    }
    // mtx.unlock();

    // Device -> host for every binding.
    for (size_t b = 0; b < hostBuffers.size(); ++b){
        void* srcPtr = slotDeviceBuffers[b].data();
        void* dstPtr = hostBuffers[b].data();
        size_t byteSize = hostBuffers[b].size();
        CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, cudaMemcpyDeviceToHost, stream));
    }

    // Errors from the asynchronous work surface here, so the result must be
    // checked; otherwise a failed batch would be counted as a successful one.
    CHECK(cudaStreamSynchronize(stream));
}

int main(int argc, char** argv) {
  if (getenv("num_context")!=NULL) num_context = std::stoi(std::string(getenv("num_context")));
  deviceBuffers.resize(num_context);
//   setReportableSeverity(Logger::Severity::kVERBOSE);// log of TensorRT
  
  auto builder = createInferBuilder(gLogger);
//   builder->setMaxBatchSize(dims3.d[0]);

  auto config = builder->createBuilderConfig();
  for (int i=0; i<num_context; ++i){
    auto profile = builder->createOptimizationProfile();
    profile->setDimensions(input_name.c_str(), OptProfileSelector::kMIN, dims1);
    profile->setDimensions(input_name.c_str(), OptProfileSelector::kOPT, dims2);
    profile->setDimensions(input_name.c_str(), OptProfileSelector::kMAX, dims3);
    config->addOptimizationProfile(profile);
  }

  auto network = builder->createNetworkV2(1U << static_cast<int>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
  auto parser = nvonnxparser::createParser(*network, gLogger);
  parser->parseFromFile(model_path.c_str(), 3);
  auto engine = builder->buildEngineWithConfig(*network,*config);

    for (int i=0; i<engine->getNbBindings(); ++i){
        std::string name(engine->getBindingName(i));
        std::cout<<name<<"\n";
        if (name.find("[profile")==-1){
            tensorNames.emplace_back(name);
        }
    }

  std::vector<IExecutionContext*> contexts;
  std::vector<std::vector<int>> explicit_shapes;
  for (int i=0; i<num_context; ++i){
    contexts.emplace_back(engine->createExecutionContext());
    auto context = contexts.back();
    context->setOptimizationProfile(i);
    std::cout<<"allInputDimensionsSpecified: "<<context->allInputDimensionsSpecified()<<"\n";
    int index;
    if (i==0)
        index = engine->getBindingIndex((input_name).c_str());
    else
        index = engine->getBindingIndex((input_name+" [profile "+std::to_string(i)+"]").c_str());
    context->setBindingDimensions(index, dims2);
    std::cout<<"allInputDimensionsSpecified must equal 1: "<<context->allInputDimensionsSpecified()<<"\n";

    streams.emplace_back();
    CHECK(cudaStreamCreateWithPriority(&streams.back(), cudaStreamDefault, 0));

    explicit_shapes.clear();
    make_explicit_shapes(context, tensorNames, explicit_shapes);
  }
    
    for (int i=0;i<num_context;++i)
        run(contexts[i], i, explicit_shapes);
    int n2 = 0;
    auto begin_time = std::chrono::high_resolution_clock::now();
    std::list<std::thread> v_thread;
    for (int i=0;i<num_context;++i)
        v_thread.emplace_back(run, contexts[i], i, explicit_shapes);
    for (;;){
        for (int i=0;i<num_context;++i){
            v_thread.front().join();
            n2 += dims2.d[0];
            if (n2>=5000){
                auto total_time = (std::chrono::high_resolution_clock::now() - begin_time);
                std::cout<< "process "<<n2<<" images in " << double(std::chrono::duration_cast<std::chrono::milliseconds>(total_time).count())/1000. << "s\n";
                n2 = 0;
                begin_time = std::chrono::high_resolution_clock::now();
            }
            v_thread.pop_front();
            v_thread.emplace_back(run, contexts[i], i, explicit_shapes);
        }
    }

}

TRT5 processes 5000 images in about 1.1 s; TRT7 processes 5000 images in about 2.2 s. I used a batch size of 5 and 3 parallel contexts.
Even with 1 context, TRT5 is still faster than TRT7.

Can you share the source model file as well from which UFF and ONNX models are generated?

Thanks

frozen_model_convert_to_onnx
frozen_model_convert_to_uff
The two files have slightly different sizes, which may be due to different TensorFlow versions (maybe tf13.1 and tf14), because I converted the UFF file a few months ago. But I am 100% sure it is the same Keras model with the same weights.

Hi @anhtu812,
The ONNX model exported from TF seems to be using ops like Add, Sqrt, Neg, Div and Mul to implement the BatchNormalization op.
TRT does not fuse these ops to BatchNorm right now and this might be the cause of the slowdown.
Could you please replace these ops with the ONNX BatchNormalization op and try that model again with TRT7?

Thanks