TensorRT: input_1: dynamic input is missing dimensions in profile 0
I trained a neural network in Python, converted it to ONNX, and am now trying to run it with TensorRT in C++. My C++ code is below (it is based on the code in "How To Run Inference Using TensorRT C++ API | LearnOpenCV"):
#include <iostream>
#include <fstream>
#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <memory>
#include <NvOnnxParser.h>
#include <vector>
#include <cuda_runtime_api.h>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/core.hpp>
#include <algorithm>
#include <numeric>
/// Logger implementation passed to the TensorRT factory functions in this file.
/// Only error-level messages are printed; everything else is dropped.
class Logger : public nvinfer1::ILogger
{
public:
    /// Writes `msg` plus a newline to stdout when `severity` is kERROR or
    /// kINTERNAL_ERROR; does nothing for any other severity.
    void log(Severity severity, const char* msg) override
    {
        // Relax this filter to see warnings and informational messages too.
        const bool is_error =
            (severity == Severity::kERROR) || (severity == Severity::kINTERNAL_ERROR);
        if (!is_error)
        {
            return;
        }
        std::cout << msg << "\n";
    }
} gLogger;
/// Deleter functor for objects that are released through a `destroy()` member
/// instead of `delete`. Null pointers are ignored.
struct TRTDestroy
{
    template <class T>
    void operator()(T* obj) const
    {
        if (obj == nullptr)
        {
            return;
        }
        obj->destroy();
    }
};

/// `std::unique_ptr` whose pointee is released via `TRTDestroy`.
template <class T>
using TRTUniquePtr = std::unique_ptr<T, TRTDestroy>;
/// Returns the number of elements described by `dims`, i.e. the product of its
/// first `nbDims` extents (1 when `nbDims` is 0).
///
/// Returns 0 when the size cannot be determined: `nbDims` is negative, or any
/// extent is negative (TensorRT reports a dynamic, not-yet-specified extent as
/// -1). Previously a negative extent wrapped the product to a huge value and a
/// negative `nbDims` made the unsigned loop index run far past the array.
size_t getSizeByDim(const nvinfer1::Dims& dims)
{
    if (dims.nbDims < 0)
    {
        return 0;
    }
    size_t size = 1;
    // Signed index: nbDims is a signed int, so compare like with like.
    for (int i = 0; i < dims.nbDims; ++i)
    {
        if (dims.d[i] < 0)
        {
            return 0;
        }
        size *= static_cast<size_t>(dims.d[i]);
    }
    return size;
}
/// Reads the text file at `imagenet_classes` and returns its lines, in file
/// order, one entry per line.
///
/// If the file cannot be opened, an error is written to stderr and an empty
/// vector is returned.
std::vector<std::string> getClassNames(const std::string& imagenet_classes)
{
    std::vector<std::string> names;
    std::ifstream input(imagenet_classes);
    if (input.good())
    {
        for (std::string line; std::getline(input, line);)
        {
            names.push_back(line);
        }
    }
    else
    {
        std::cerr << "ERROR: can't read file with classes names.\n";
    }
    return names;
}
/// Loads the image at `image_path`, uploads it to the GPU, scales it by 1/255
/// into a float image and splits it into the buffer at `gpu_input`.
///
/// `dims.d[0]` is used as the plane height and `dims.d[1]` as the plane width.
/// On a failed image load an error is written to stderr and the function
/// returns without touching `gpu_input`.
void preprocessImage(const std::string& image_path, float* gpu_input, const nvinfer1::Dims& dims)
{
    const cv::Mat host_image = cv::imread(image_path);
    if (host_image.empty())
    {
        std::cerr << "Input image " << image_path << " load failed\n";
        return;
    }

    cv::cuda::GpuMat device_image;
    device_image.upload(host_image);

    const auto height = dims.d[0];
    const auto width = dims.d[1];
    const auto plane_size = cv::Size(width, height);

    // NOTE(review): the uploaded image is converted as-is; nothing here resizes
    // it to plane_size, so the image presumably has to match already - confirm.
    cv::cuda::GpuMat scaled;
    device_image.convertTo(scaled, CV_32FC1, 1.f / 255.f);

    // One destination plane, wrapping the caller's device buffer in place.
    // NOTE(review): imread is called without flags while only one plane is
    // provided - verify the channel count against the model input.
    const int plane_count = 1;
    std::vector<cv::cuda::GpuMat> planes;
    for (int plane = 0; plane < plane_count; ++plane)
    {
        float* const plane_start = gpu_input + plane * width * height;
        planes.push_back(cv::cuda::GpuMat(plane_size, CV_32FC1, plane_start));
    }
    cv::split(scaled, planes);
}
/// Copies the network output from the device buffer `gpu_output` to the host,
/// interprets it as a `dims.d[0]` x `dims.d[1]` single-channel float image,
/// scales it by 255 into 8-bit and writes it to "/tensorRT.bmp".
///
/// @param gpu_output  device pointer holding getSizeByDim(dims) * batch_size floats
/// @param dims        output dimensions; d[0] is used as rows, d[1] as columns
/// @param batch_size  number of batches stored in `gpu_output`
///
/// Failures (null buffer, unusable dims, CUDA copy error, image write error)
/// are reported on stderr and the function returns without writing an image.
void postprocessResults(float *gpu_output, const nvinfer1::Dims &dims, int batch_size)
{
    if (gpu_output == nullptr || batch_size <= 0)
    {
        std::cerr << "ERROR: no output buffer to postprocess.\n";
        return;
    }
    // NOTE(review): rows/cols are taken from the first two extents; for an
    // output with a leading batch axis this is probably not height/width - confirm.
    const int rows = dims.d[0];
    const int cols = dims.d[1];
    if (dims.nbDims < 2 || rows <= 0 || cols <= 0)
    {
        std::cerr << "ERROR: output dimensions are not usable as an image.\n";
        return;
    }

    std::vector<float> cpu_output(getSizeByDim(dims) * batch_size);
    const size_t available_bytes = cpu_output.size() * sizeof(float);
    const cudaError_t status =
        cudaMemcpy(cpu_output.data(), gpu_output, available_bytes, cudaMemcpyDeviceToHost);
    if (status != cudaSuccess)
    {
        std::cerr << "ERROR: copying the output to the host failed: "
                  << cudaGetErrorString(status) << "\n";
        return;
    }

    // Zero-filled so pixels not covered by the output are defined.
    cv::Mat Finalmat = cv::Mat::zeros(rows, cols, CV_32FC1);
    // The output may hold more (or fewer) floats than rows * cols; copying the
    // whole buffer unconditionally would write past the end of the image.
    const size_t image_bytes =
        static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(float);
    memcpy(Finalmat.data, cpu_output.data(), std::min(image_bytes, available_bytes));

    // convertTo only uses the depth of the requested type; the channel count
    // stays that of the source, so the result is a single-channel 8-bit image.
    cv::Mat image8;
    Finalmat.convertTo(image8, CV_8UC1, 255.0);
    if (!cv::imwrite("/tensorRT.bmp", image8))
    {
        std::cerr << "ERROR: could not write /tensorRT.bmp\n";
    }
}
void parseOnnxModel(const std::string& model_path, TRTUniquePtr<nvinfer1::ICudaEngine>& engine,
TRTUniquePtr<nvinfer1::IExecutionContext>& context)
{
nvinfer1::IBuilder *builder = nvinfer1::createInferBuilder(gLogger);
const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
nvinfer1::INetworkDefinition *network = builder->createNetworkV2(explicitBatch);
TRTUniquePtr<nvonnxparser::IParser> parser{nvonnxparser::createParser(*network, gLogger)};
// parse ONNX
if (!parser->parseFromFile(model_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kINFO)))
{
std::cerr << "ERROR: could not parse the model.\n";
return;
}
//create Config to configure engine parameters such as max memory or set FP16 mode
TRTUniquePtr<nvinfer1::IBuilderConfig> config{builder->createBuilderConfig()};
nvinfer1::IOptimizationProfile* profile = builder->createOptimizationProfile();
profile->setDimensions("foo", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims3(3,100,200));
profile->setDimensions("foo", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims3(3,1024,1024));
profile->setDimensions("foo", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims3(3,4096,4096));
config->addOptimizationProfile(profile);
// allow TensorRT to use up to 1GB of GPU memory for tactic selection.
config->setMaxWorkspaceSize(1ULL << 30);
// use FP16 mode if possible
if (builder->platformHasFastFp16())
{
config->setFlag(nvinfer1::BuilderFlag::kFP16);
}
// we have only one image in batch
builder->setMaxBatchSize(1);
// generate TensorRT engine optimized for the target platform
engine.reset(builder->buildEngineWithConfig(*network, *config));
context.reset(engine->createExecutionContext());
}
// main pipeline ------------------------------------------------------------------------------------------------------
int main(int argc, char* argv[])
if (argc < 3)
{
std::cerr << "usage: " << argv[0] << " [model_name].onnx [image_name].jpg\n";
return -1;
}
std::string model_path(argv[1]);
std::string image_path(argv[2]);
int batch_size = 1;
TRTUniquePtr<nvinfer1::ICudaEngine> engine{nullptr};
TRTUniquePtr<nvinfer1::IExecutionContext> context{nullptr};
parseOnnxModel(model_path, engine, context);
std::vector<nvinfer1::Dims> input_dims; // we expect only one input
std::vector<nvinfer1::Dims> output_dims; // and one output
std::vector<void*> buffers(engine->getNbBindings()); // buffers for input and output data
for (size_t i = 0; i < engine->getNbBindings(); ++i)
{
auto binding_size = getSizeByDim(engine->getBindingDimensions(i)) * batch_size * sizeof(float);
cudaMalloc(&buffers[i], binding_size);
if (engine->bindingIsInput(i))
{
input_dims.emplace_back(engine->getBindingDimensions(i));
}
else
{
output_dims.emplace_back(engine->getBindingDimensions(i));
}
}
if (input_dims.empty() || output_dims.empty())
{
std::cerr << "Expect at least one input and one output for network\n";
return -1;
}
// preprocess input data
preprocessImage(image_path, (float *) buffers[0], input_dims[0]);
// inference - "enqueue" asynchronously executes inference on a batch.
context->enqueue(batch_size, buffers.data(), 0, nullptr);
// postprocess results
postprocessResults((float *) buffers[1], output_dims[0], batch_size);
for (void* buf : buffers)
{
cudaFree(buf);
}
return 0;
}
Operating System + Version: Ubuntu 18
TensorRT Version:
For training the NN and creating the .keras and ONNX files, I used the Docker container 19.10, which ships CUDA 10.1, because according to this source, using the GPU with TF requires CUDA 10.1: 從原始碼開始建構 (Build from source) | TensorFlow
When I ran `./trt_sample unet.onnx testImage.bmp`, I got this error:
----------------------------------------------------------------
Input filename: unet.onnx
ONNX IR version: 0.0.7
Opset version: 12
Producer name: keras2onnx
Producer version: 1.7.0
Domain: onnxmltools
Model version: 0
Doc string:
----------------------------------------------------------------
WARNING: ONNX model has a newer ir_version (0.0.7) than this parser was built against (0.0.3).
While parsing node number 1 [Conv]:
ERROR: ModelImporter.cpp:296 In function importModel:
[5] Assertion failed: tensors.count(input_name)
ERROR: could not parse the model.
Segmentation fault (core dumped)
If I instead run the same ONNX file with TensorRT in the Docker container 20.03, which has CUDA 10.2 and TensorRT 7.0.0, I get this error:
----------------------------------------------------------------
Input filename: unet.onnx
ONNX IR version: 0.0.7
Opset version: 12
Producer name: keras2onnx
Producer version: 1.7.0
Domain: onnxmltools
Model version: 0
Doc string:
----------------------------------------------------------------
input_1: dynamic input is missing dimensions in profile 0.
Network validation failed.
Segmentation fault (core dumped)
Can anyone help?