TensorRT: input_1: dynamic input is missing dimensions in profile 0

TensorRT: input_1: dynamic input is missing dimensions in profile 0

I trained a neural network in Python, converted it to ONNX, and am now trying to run it with TensorRT in C++. The C++ code I have is below (it is based on the code in https://www.learnopencv.com/how-to-run-inference-using-tensorrt-c-api/)

#include <algorithm>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <numeric>
#include <string>
#include <vector>

#include <cuda_runtime_api.h>
#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <NvOnnxParser.h>
#include <opencv2/core.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/imgcodecs.hpp>

// Logger handed to TensorRT: prints only error-level messages to stdout and
// drops everything else.
class Logger : public nvinfer1::ILogger
{
public:
    void log(Severity severity, const char* msg) override {
        // Relax this filter to see warnings / info messages as well.
        const bool is_error = (severity == Severity::kERROR) || (severity == Severity::kINTERNAL_ERROR);
        if (!is_error) {
            return;
        }
        std::cout << msg << "\n";
    }
} gLogger;

// Deleter functor for objects that are released through a destroy() member
// instead of operator delete. Null pointers are ignored.
struct TRTDestroy
{
    template <class T>
    void operator()(T* obj) const
    {
        if (obj == nullptr)
        {
            return;
        }
        obj->destroy();
    }
};

// std::unique_ptr flavour that invokes destroy() on the owned object.
template <class T>
using TRTUniquePtr = std::unique_ptr<T, TRTDestroy>;

// Returns the number of elements described by `dims` (product of all extents).
//
// Returns 0 when the shape is not fully resolved, i.e. when nbDims is negative
// or any extent is negative (a dynamic dimension is reported as -1). The
// previous implementation compared an unsigned index against the signed nbDims
// and multiplied -1 into a size_t, which produced out-of-bounds reads and
// byte counts close to 2^64. A shape with nbDims == 0 still yields 1.
size_t getSizeByDim(const nvinfer1::Dims& dims)
{
    if (dims.nbDims < 0)
    {
        return 0;
    }
    size_t size = 1;
    for (int i = 0; i < dims.nbDims; ++i)
    {
        if (dims.d[i] < 0)
        {
            // unresolved (dynamic) extent: there is no meaningful element count yet
            return 0;
        }
        size *= static_cast<size_t>(dims.d[i]);
    }
    return size;
}

// Reads a text file and returns its lines, one class name per line.
// If the file cannot be opened, an error is written to stderr and an empty
// vector is returned.
std::vector<std::string> getClassNames(const std::string& imagenet_classes)
{
    std::vector<std::string> labels;
    std::ifstream input(imagenet_classes);
    if (input.good())
    {
        for (std::string line; std::getline(input, line);)
        {
            labels.push_back(line);
        }
    }
    else
    {
        std::cerr << "ERROR: can't read file with classes names.\n";
    }
    return labels;
}

// Loads an image from disk, uploads it to the GPU, scales it to [0, 1] floats
// and writes the result, plane by plane, into the device buffer `gpu_input`.
//
// image_path: file read with cv::imread.
// gpu_input:  device pointer that receives the float planes.
// dims:       d[0] is used as the height and d[1] as the width of one plane.
//             NOTE(review): for a 4-D binding shape d[0] would be the batch
//             extent - confirm the layout the caller passes in.
//
// On any failure a message is printed to stderr and the function returns
// without touching `gpu_input`.
void preprocessImage(const std::string& image_path, float* gpu_input, const nvinfer1::Dims& dims)
{
    if (gpu_input == nullptr || dims.nbDims < 2 || dims.d[0] <= 0 || dims.d[1] <= 0)
    {
        std::cerr << "ERROR: invalid input buffer or dimensions for preprocessing.\n";
        return;
    }
    cv::Mat frame = cv::imread(image_path);
    if (frame.empty())
    {
        std::cerr << "Input image " << image_path << " load failed\n";
        return;
    }
    cv::cuda::GpuMat gpu_frame;
    gpu_frame.upload(frame);
    const int channels = 1;
    const auto input_width = dims.d[1];
    const auto input_height = dims.d[0];
    const auto input_size = cv::Size(input_width, input_height);
    // No resize is performed here. If the image does not already have the
    // expected size, the split below would allocate fresh planes instead of
    // writing into gpu_input, so fail loudly rather than run on stale memory.
    if (gpu_frame.size() != input_size)
    {
        std::cerr << "ERROR: image is " << gpu_frame.cols << "x" << gpu_frame.rows
                  << " but the network input expects " << input_width << "x" << input_height << "\n";
        return;
    }
    cv::cuda::GpuMat flt_image;
    gpu_frame.convertTo(flt_image, CV_32FC1, 1.f / 255.f);
    // Wrap the destination buffer in GpuMat headers (no copy), one per plane.
    std::vector<cv::cuda::GpuMat> chw;
    for (int i = 0; i < channels; ++i)
    {
        chw.emplace_back(cv::cuda::GpuMat(input_size, CV_32FC1, gpu_input + i * input_width * input_height));
    }
    // GpuMat data must be split with the CUDA implementation; cv::split is the
    // host-side version.
    // NOTE(review): cv::imread defaults to a 3-channel image, so only the first
    // plane ends up in gpu_input while `channels` is 1 - confirm that is intended.
    cv::cuda::split(flt_image, chw);
}

// Copies the network output from the GPU to the host and writes it as an
// 8-bit image to "/tensorRT.bmp".
//
// gpu_output: device pointer holding getSizeByDim(dims) * batch_size floats.
// dims:       output shape; d[0] is used as the row count and d[1] as the
//             column count of the written image.
//             NOTE(review): for a 4-D shape d[0] would be the batch extent -
//             confirm the layout the caller passes in.
// batch_size: multiplier applied to the element count of `dims`.
//
// Errors (invalid shape, failed device copy, failed write) are reported on
// stderr and the function returns early.
void postprocessResults(float *gpu_output, const nvinfer1::Dims &dims, int batch_size)
{
    if (gpu_output == nullptr || batch_size <= 0 || dims.nbDims < 2)
    {
        std::cerr << "ERROR: invalid output buffer or dimensions for postprocessing.\n";
        return;
    }
    for (int i = 0; i < dims.nbDims; ++i)
    {
        if (dims.d[i] <= 0)
        {
            std::cerr << "ERROR: output dimension " << i << " is not resolved.\n";
            return;
        }
    }

    std::vector<float> cpu_output(getSizeByDim(dims) * batch_size);
    const cudaError_t status =
        cudaMemcpy(cpu_output.data(), gpu_output, cpu_output.size() * sizeof(float), cudaMemcpyDeviceToHost);
    if (status != cudaSuccess)
    {
        std::cerr << "ERROR: cudaMemcpy of the output failed: " << cudaGetErrorString(status) << "\n";
        return;
    }

    const auto cols = dims.d[1];
    const auto rows = dims.d[0];
    const size_t image_elements = static_cast<size_t>(rows) * static_cast<size_t>(cols);
    if (cpu_output.size() < image_elements)
    {
        std::cerr << "ERROR: output holds fewer values than a " << rows << "x" << cols << " image.\n";
        return;
    }
    cv::Mat Finalmat = cv::Mat(rows, cols, CV_32FC1);
    // Copy only what fits into the rows x cols matrix; copying the whole output
    // would write past the end of the Mat whenever the output is larger.
    std::memcpy(Finalmat.data, cpu_output.data(), image_elements * sizeof(float));
    // convertTo only changes the depth; the matrix stays single-channel.
    Finalmat.convertTo(Finalmat, CV_8UC1, 255.0);
    if (!cv::imwrite("/tensorRT.bmp", Finalmat))
    {
        std::cerr << "ERROR: could not write /tensorRT.bmp\n";
    }
}

void parseOnnxModel(const std::string& model_path, TRTUniquePtr<nvinfer1::ICudaEngine>& engine,
                    TRTUniquePtr<nvinfer1::IExecutionContext>& context)
{
    nvinfer1::IBuilder *builder = nvinfer1::createInferBuilder(gLogger);
    const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    nvinfer1::INetworkDefinition *network = builder->createNetworkV2(explicitBatch);
    TRTUniquePtr<nvonnxparser::IParser> parser{nvonnxparser::createParser(*network, gLogger)};
    
    // parse ONNX
    if (!parser->parseFromFile(model_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kINFO)))
    {
        std::cerr << "ERROR: could not parse the model.\n";
        return;
    }

    //create Config to configure engine parameters such as max memory or set FP16 mode
    TRTUniquePtr<nvinfer1::IBuilderConfig> config{builder->createBuilderConfig()};

    nvinfer1::IOptimizationProfile* profile = builder->createOptimizationProfile();
    profile->setDimensions("foo", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims3(3,100,200));
    profile->setDimensions("foo", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims3(3,1024,1024));
    profile->setDimensions("foo", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims3(3,4096,4096));

    config->addOptimizationProfile(profile);
    // allow TensorRT to use up to 1GB of GPU memory for tactic selection.
    config->setMaxWorkspaceSize(1ULL << 30);
    // use FP16 mode if possible
    if (builder->platformHasFastFp16())
    {
        config->setFlag(nvinfer1::BuilderFlag::kFP16);
    }
    // we have only one image in batch
    builder->setMaxBatchSize(1);
    // generate TensorRT engine optimized for the target platform
    engine.reset(builder->buildEngineWithConfig(*network, *config));
    context.reset(engine->createExecutionContext());
}

// main pipeline ------------------------------------------------------------------------------------------------------
int main(int argc, char* argv[])
    if (argc < 3)
    {
        std::cerr << "usage: " << argv[0] << " [model_name].onnx [image_name].jpg\n";
        return -1;
    }
    std::string model_path(argv[1]);
    std::string image_path(argv[2]);
    int batch_size = 1;

    TRTUniquePtr<nvinfer1::ICudaEngine> engine{nullptr};
    TRTUniquePtr<nvinfer1::IExecutionContext> context{nullptr};
    parseOnnxModel(model_path, engine, context);

    std::vector<nvinfer1::Dims> input_dims; // we expect only one input
    std::vector<nvinfer1::Dims> output_dims; // and one output

    std::vector<void*> buffers(engine->getNbBindings()); // buffers for input and output data
    for (size_t i = 0; i < engine->getNbBindings(); ++i)
    {
        auto binding_size = getSizeByDim(engine->getBindingDimensions(i)) * batch_size * sizeof(float);
        cudaMalloc(&buffers[i], binding_size);
        if (engine->bindingIsInput(i))
        {
            input_dims.emplace_back(engine->getBindingDimensions(i));
        }
        else
        {
            output_dims.emplace_back(engine->getBindingDimensions(i));
        }
    }
    if (input_dims.empty() || output_dims.empty())
    {
        std::cerr << "Expect at least one input and one output for network\n";
        return -1;
    }

    // preprocess input data
    preprocessImage(image_path, (float *) buffers[0], input_dims[0]);
    // inference - "enqueue" asynchronously executes inference on a batch. 
    context->enqueue(batch_size, buffers.data(), 0, nullptr);
    // postprocess results
    postprocessResults((float *) buffers[1], output_dims[0], batch_size);


    for (void* buf : buffers)
    {
        cudaFree(buf);
    }
    return 0;
}

Operating System + Version: Ubuntu 18
TensorRT Version:
For training the NN, creating the .keras and ONNX files, I used Docker container 19.10, which has CUDA 10.1 because according to this source, if I wanted to use the GPU with TF then I needed CUDA 10.1: https://www.tensorflow.org/install/source#gpu

When I ran ./trt_sample unet.onnx testImage.bmp, I got this error:

----------------------------------------------------------------
Input filename:   unet.onnx
ONNX IR version:  0.0.7
Opset version:    12
Producer name:    keras2onnx
Producer version: 1.7.0
Domain:           onnxmltools
Model version:    0
Doc string:       
----------------------------------------------------------------

	WARNING: ONNX model has a newer ir_version (0.0.7) than this parser was built against (0.0.3).
While parsing node number 1 [Conv]:
ERROR: ModelImporter.cpp:296 In function importModel:
[5] Assertion failed: tensors.count(input_name)
ERROR: could not parse the model.
Segmentation fault (core dumped)

If I then tried to run the ONNX file and TensorRT with Docker container 20.03, which has CUDA 10.2 and TensorRT 7.0.0, then I get this error:

	----------------------------------------------------------------
Input filename:   unet.onnx
ONNX IR version:  0.0.7
Opset version:    12
Producer name:    keras2onnx
Producer version: 1.7.0
Domain:           onnxmltools
Model version:    0
Doc string:       
----------------------------------------------------------------
input_1: dynamic input is missing dimensions in profile 0.
Network validation failed.
Segmentation fault (core dumped)

can anyone help?

Hi @mke489,
Could you please share the onnx model file so we can help better.
Meanwhile, could you please try trtexec command in verbose mode?

Thanks

Hi @mke489,
I can see that your model passed the conversion, so there doesn't appear to be any issue with the model itself.
There might be an issue with your script instead.
Will check and get back to you.
Thanks!

Hi @mke489,
Can you try using trtexec command to run your model and see if this works?

Thanks!
Aakanksha

@AakankshaS, @SunilJB run trtexec on my ONNX file? I already did that and showed the output I got in trtexec_output.txt

@AakankshaS can you please clarify your last question?

Hi @mke489,
trtexec is an alternative way of generating your serialized engine quickly without having to develop your own application
And from the trtexec logs, you can see that the model has passed.
So you can generate engine file using trtexec command, alternatively.

Thanks!

you want me to generate the engine file with trtexec? How do I do that?

I did make some changes to the code that fixed the previous error:

  1. I replaced the foo and nvinfer1::Dims3(x,y,z) in the following lines:
profile->setDimensions("foo", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims3(3,100,200));
    profile->setDimensions("foo", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims3(3,1024,1024));
    profile->setDimensions("foo", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims3(3,4096,4096));

with input_1 and nvinfer1::Dims4(1,1024,1024,1), respectively

  1. Replaced cv::split(flt_image, chw); with cv::cuda::split(flt_image, chw); (declared in opencv2/cudaarithm.hpp)

  2. Replaced context->enqueue(batch_size, buffers.data(), 0, nullptr); with context->executeV2(buffers.data());
    I also added context->setBindingDimensions(0, nvinfer1::Dims4(1,1024,1024,1)) in the line above the executeV2

  3. Replaced

for (size_t i = 0; i < engine->getNbBindings(); ++i)
{
    auto binding_size = getSizeByDim(engine->getBindingDimensions(i)) * batch_size * sizeof(float);
    cudaMalloc(&buffers[i], binding_size);
    if (engine->bindingIsInput(i))
    {
        input_dims.emplace_back(engine->getBindingDimensions(i));
    }
    else
    {
        output_dims.emplace_back(engine->getBindingDimensions(i));
    }
}

with

for (size_t i = 0; i < engine->getNbBindings(); ++i)
{
    if (engine->bindingIsInput(i))
    {
        input_dims.emplace_back(engine->getBindingDimensions(i));
    }
    else
    {
        output_dims.emplace_back(engine->getBindingDimensions(i));
    }

    auto binding_size = getSizeByDim(context->getBindingDimensions(i)) * batch_size * sizeof(float);
    auto status = cudaMalloc(&buffers[i], binding_size);
}

The problem now is binding_size shows 18446744073705357312. That doesn’t make sense as I expected it would be around 1x1024x1024x1 = 1048576, since that’s the size of the input image I used. I get

parameter check failed at engine.cpp::resolveSlots::1092, condition: allInputDimensionsSpecified(routine

it seems like the problem is that input_dims[0].d[0] is -1 for i=0 in the loop

 for (size_t i = 0; i < engine->getNbBindings(); ++i)

But input_dims[0].d[1], input_dims[0].d[2], input_dims[0].d[3] are 1024,1024,1, respectively, which makes more sense

Also, at the line

auto binding_size = getSizeByDim(context->getBindingDimensions(i)) * batch_size * sizeof(float);

the code shows that context->getBindingDimensions(i).nbDims is -1 when i=0

Hi @mke489,
did you try running your model with the latest script?

You can use the below command with your model name and shape to generate the engine/trt file.
trtexec --onnx=your_model.onnx --verbose --explicitBatch --shapes=input_name:64x3x288x288 --saveEngine=engineName.engine

Thanks!

output of

trtexec --onnx=owl.onnx --verbose --explicitBatch --shapes=input_name:64x3x288x288 --saveEngine=engineName.engine    

is attached

trtexec_output.txt (256.6 KB)

Hi @mke489,

In the running dir, after running this command, an engine file will be generated which you can use for inferencing.
Also, you will need to change the input shape of your model.
Please refer the below link for details


Thanks!

change the input shape to what? that link doesn’t mention the input shape or engine file anywhere. you mean change --shapes=input_name:64x3x288x288 to --shapes=input_name:1x1024x1024x1?

the engine file is at https://drive.google.com/file/d/1BIYYoo_GnlrBeRdhePGB-wwmVHvIJ7GR/view?usp=sharing

@AakankshaS can you answer my last question?