V2DynamicExt when there are runtime input dimensions

I have made a plugin with a fixed-size input, but I get the following errors:

[08/27/2020-15:00:52] [W] [TRT] onnx2trt_utils.cpp:220: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[08/27/2020-15:00:52] [I] [TRT] ModelImporter.cpp:135: No importer registered for op: CTCGreedyDecoder. Attempting to import as plugin.
[08/27/2020-15:00:52] [I] [TRT] builtin_op_importers.cpp:3659: Searching for plugin: CTCGreedyDecoder, plugin_version: 1, plugin_namespace: 
[08/27/2020-15:00:52] [I] [TRT] builtin_op_importers.cpp:3676: Successfully created plugin: CTCGreedyDecoder
[08/27/2020-15:00:52] [E] [TRT] CTCGreedyDecoder: PluginV2Layer must be V2DynamicExt when there are runtime input dimensions.
[08/27/2020-15:00:52] [E] [TRT] CTCGreedyDecoder: PluginV2Layer must be V2DynamicExt when there are runtime input dimensions.
[08/27/2020-15:00:52] [E] [TRT] CTCGreedyDecoder: PluginV2Layer must be V2DynamicExt when there are runtime input dimensions.
[08/27/2020-15:00:52] [E] [TRT] Layer CTCGreedyDecoder failed validation
[08/27/2020-15:00:52] [E] [TRT] Network validation failed.

I can’t find where runtime input dimensions are set in my plugin code.

My plugin is shown below. Where did I set runtime input dimensions?

class CTCGreedyDecoder : public IPluginV2IOExt
{
public:
    //The original CTCGreedyDecoder node has two inputs (import/transpose:0 (88, 1, 43), import/Fill:0 (1,))
    //and two outputs (import/CTCGreedyDecoder:0 (7, 2), import/ToInt32:0 (7,)).
    //This plugin combines CTCGreedyDecoder with SparseToDense and exposes a single output (see getNbOutputs).
    CTCGreedyDecoder(const PluginFieldCollection& fc)
    {        
        (void) fc;
    }
    //data holds the class member variables serialized by
    //void serialize(void* buffer) const override;
    //length is the length of the serialized data.
    //This constructor must deserialize the members in the same order.
    CTCGreedyDecoder(const void* data, size_t length)
    {
        const char* d = static_cast<const char*>(data);
        const char* const a = d; 
        mInputDims_1.nbDims = read<int>(d);
        for (int i = 0; i < mInputDims_1.nbDims; ++i)
        {
            mInputDims_1.d[i] = read<int>(d);
        }
        mInputDims_2.nbDims = read<int>(d);
        for (int i = 0; i < mInputDims_2.nbDims; ++i)
        {
            mInputDims_2.d[i] = read<int>(d);
        }
        mOutputDims.nbDims = read<int>(d);
        for (int i = 0; i < mOutputDims.nbDims; ++i)
        {
            mOutputDims.d[i] = read<int>(d);
        }       
        mDataType = static_cast<DataType>(read<int>(d));
        if (mDataType == DataType::kINT8)
        {
            mInHostScale = read<float>(d);
            mOutHostScale = read<float>(d);
        }       
        assert(d == a + length);
    }

    // It makes no sense to construct CTCGreedyDecoder without arguments.
    CTCGreedyDecoder() = delete;

    virtual ~CTCGreedyDecoder() {}

public:
    int getNbOutputs() const override
    {
        return 1;//it has one output
    }

    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
    {
        std::cout << "nbInputDims " << nbInputDims << std::endl;//for debugging
        assert(index == 0 && nbInputDims == 2 && inputs[0].nbDims == 3 && inputs[1].nbDims == 1);
        //CTCGreedyDecoder actually has a dynamic output shape that varies with the text length,
        //but here it is replaced by a fixed output shape.
        //20 is used because the license plate text length is less than 20.
        //This plugin combines CTCGreedyDecoder and SparseToDense,
        //so the output is taken from SparseToDense.
        //The output size is fixed and not related to the input size, so a fixed size is returned.
        return Dims2(1, 20);
    }

    int initialize() override
    {        
        return 0;
    }

    void terminate() override
    {
        //To release memory
    }

    size_t getWorkspaceSize(int maxBatchSize) const override
    {
        return 0;
    }
    
    int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override
    {   
        int rows = mInputDims_1.d[0];
        int batch = mInputDims_1.d[1];   
        int widths = mInputDims_1.d[2]; 
        float* output = reinterpret_cast<float*>(outputs[0]);   
        interface(stream, inputs[0], output, rows, batch, widths);
        return 0;
    }
    //All private class member variables are serialized
    //by the following two methods, one after another.
    //serializationSize is the total size of all member variables.
    size_t getSerializationSize() const override
    {
        size_t serializationSize = 0;
        serializationSize += sizeof(mInputDims_1.nbDims);
        serializationSize += sizeof(mInputDims_1.d[0]) * mInputDims_1.nbDims;
        serializationSize += sizeof(mInputDims_2.nbDims);
        serializationSize += sizeof(mInputDims_2.d[0]) * mInputDims_2.nbDims;
        serializationSize += sizeof(mOutputDims.nbDims);
        serializationSize += sizeof(mOutputDims.d[0]) * mOutputDims.nbDims;
        serializationSize += sizeof(static_cast<int>(mDataType));
        if (mDataType == DataType::kINT8)
        {
            serializationSize += sizeof(float) * 2;
        }      
        return serializationSize;
    }
    //serialize to char pointer
    void serialize(void* buffer) const override
    {
        char* d = static_cast<char*>(buffer);
        const char* const a = d;
        write(d, mInputDims_1.nbDims);
        assert(mInputDims_1.nbDims <= mInputDims_1.MAX_DIMS);
        for (int i = 0; i < mInputDims_1.nbDims; ++i)
        {
            write(d, mInputDims_1.d[i]);
        }
        write(d, mInputDims_2.nbDims);
        assert(mInputDims_2.nbDims <= mInputDims_2.MAX_DIMS);
        for (int i = 0; i < mInputDims_2.nbDims; ++i)
        {
            write(d, mInputDims_2.d[i]);
        }
        write(d, mOutputDims.nbDims);
        assert(mOutputDims.nbDims <= mOutputDims.MAX_DIMS);
        for (int i = 0; i < mOutputDims.nbDims; ++i)
        {
            write(d, mOutputDims.d[i]);
        }
        write(d, static_cast<int>(mDataType));
        if (mDataType == DataType::kINT8)
        {
            write(d, mInHostScale);
            write(d, mOutHostScale);
        }    
        assert(d == a + getSerializationSize());
    }
    //Plugin configuration for the input/output types, formats and sizes.
    //PluginTensorDesc holds the fields a plugin sees for an input or output;
    //it has 4 attributes (Dims, DataType, TensorFormat, float scale).
    //Here you can assert that everything matches the requirements and
    //check that all input/output types meet the expectations.
    //For this CTCGreedyDecoder:
    //the first input is a 3D tensor,
    //the second is a 1D tensor.
    void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override
    {
        assert(in && nbInput == 2);
        assert(out && nbOutput == 1);
        assert(in[0].type == out[0].type);
        assert(in[0].format == TensorFormat::kLINEAR && out[0].format == TensorFormat::kLINEAR);
        mDataType = in[0].type;  
        mInputDims_1 = in[0].dims;//Batch size is currently set to 1 (one image at a time)
        mInputDims_2 = in[1].dims;
        mOutputDims = out[0].dims;
        mInHostScale = in[0].scale >= 0.0f ? in[0].scale : -1.0f;
        mOutHostScale = out[0].scale >= 0.0f ? out[0].scale : -1.0f;      
        return ;
    }

    //! The combination of kLINEAR + kINT8/kHALF/kFLOAT is supported.
    bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override
    {        
        assert(nbInputs == 2 && nbOutputs == 1 && pos < nbInputs + nbOutputs);
        bool condition = inOut[pos].format == TensorFormat::kLINEAR;
        condition &= inOut[pos].type != DataType::kINT32;
        condition &= inOut[pos].type == inOut[0].type;
        return condition;
    }

    DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const override
    {
        assert(inputTypes && nbInputs == 2);
        (void) index;
        return inputTypes[0];
    }

    const char* getPluginType() const override
    {
        return "CTCGreedyDecoder";
    }

    const char* getPluginVersion() const override
    {
        return "1";
    }

    void destroy() override
    {
        delete this;
    }

    IPluginV2Ext* clone() const override
    {
        auto* plugin = new CTCGreedyDecoder(*this);
        return plugin;
    }

    void setPluginNamespace(const char* libNamespace) override
    {
        mNamespace = libNamespace;
    }

    const char* getPluginNamespace() const override
    {
        return mNamespace.data();
    }

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override
    {
        return false;
    }

    bool canBroadcastInputAcrossBatch(int inputIndex) const override
    {
        return false;
    }

private:
    template <typename T>
    void write(char*& buffer, const T& val) const
    {
        *reinterpret_cast<T*>(buffer) = val;
        buffer += sizeof(T);
    }

    template <typename T>
    T read(const char*& buffer) const
    {
        T val = *reinterpret_cast<const T*>(buffer);
        buffer += sizeof(T);
        return val;
    }
    //input is (88, 1, 43)
    /*void copyDeviceInputToFP32(const void* src, void*& dst)
    {
        assert(mDataType == DataType::kINT8);
        size_t inCount_1 = getC(mInputDims_1) * getH(mInputDims_1) * getW(mInputDims_1);
        std::cout << "getC(mInputDims_1) " << getC(mInputDims_1) << " getH(mInputDims_1) " << getH(mInputDims_1) << " getW(mInputDims_1) " << getW(mInputDims_1) << std::endl;//for debugging
        std::unique_ptr<char> inputTmp_1{new char[inCount_1 * elementSize(mDataType)]};
        CHECK(cudaMemcpy(inputTmp_1.get(), src[0], inCount_1 * elementSize(mDataType), cudaMemcpyDeviceToHost));
        std::unique_ptr<float> inputFP32_1{new float[inCount_1]};
        transform<DataType::kINT8, DataType::kFLOAT>(inputTmp_1.get(), inputFP32_1.get(), inCount_1);
        // int8 scale
        int hw = mInputDims_1.d[1] * mInputDims_1.d[2];
        for (int j = 0; j < mInputDims_1.d[0]; ++j)
        {
            std::transform(inputFP32_1.get() + hw * j, inputFP32_1.get() + hw * (j + 1), inputFP32_1.get() + hw * j,
                [&](float in) -> float { return in * mInHostScale; });
        }
        CHECK(cudaMalloc(&dst[0], inCount_1 * elementSize(DataType::kFLOAT)));
        CHECK(cudaMemcpy(dst[0], inputFP32_1.get(), inCount_1 * elementSize(DataType::kFLOAT), cudaMemcpyHostToDevice));
    }

    void copyDeviceToInt8Output(const void* src, void* dst)
    {
        size_t outCount = getH(mOutputDims) * getW(mOutputDims);
        std::cout << " getH(mOutputDims) " << getH(mOutputDims) << std::endl;//for debugging
        std::unique_ptr<float> outTmp{new float[outCount]};
        CHECK(cudaMemcpy(outTmp.get(), src, outCount * elementSize(DataType::kFLOAT), cudaMemcpyDeviceToHost));
        std::unique_ptr<char> outInt8{new char[outCount * elementSize(DataType::kINT8)]};
        // int8 + scale
        int hw = mOutputDims.d[1] * mOutputDims.d[2];
        //for (int j = 0; j < mInputDims_1.d[0]; ++j)//since only one channel
        //{
        std::transform(outTmp.get() + hw * j, outTmp.get() + hw * (j + 1), outTmp.get() + hw * j,
                [&](float in) -> float { return in / mOutHostScale; });
        //}
        transform<DataType::kFLOAT, DataType::kINT8>(outTmp.get(), outInt8.get(), outCount);
        CHECK(cudaMemcpy(dst, outInt8.get(), outCount, cudaMemcpyHostToDevice));
    }*/

private:    
    //This plugin stores no weights; all inputs/outputs arrive in the stream
    //from the previous layers' outputs, so only shape/type metadata and the
    //INT8 scales are kept as private members.
    DataType mDataType;
    Dims mInputDims_1;
    Dims mInputDims_2;
    Dims mOutputDims;
    float mInHostScale{-1.0f};
    float mOutHostScale{-1.0f};
    std::string mNamespace;
};

class CTCGreedyDecoderCreator : public IPluginCreator
{
public:
    const char* getPluginName() const override
    {
        return "CTCGreedyDecoder";
    }

    const char* getPluginVersion() const override
    {
        return "1";
    }

    const PluginFieldCollection* getFieldNames() override
    {
        return &mFieldCollection;
    }

    IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override
    {
        auto plugin = new CTCGreedyDecoder(*fc);
        mFieldCollection = *fc;
        mPluginName = name;
        return plugin;
    }
    
    IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override
    {
        //serialData is all the data serialized in void serialize(void* buffer) const override;
        //serialLength is the length of the data serialized in serialize(void* buffer)
        auto plugin = new CTCGreedyDecoder(serialData, serialLength);
        mPluginName = name;
        return plugin;
    }

    void setPluginNamespace(const char* libNamespace) override
    {
        mNamespace = libNamespace;
    }

    const char* getPluginNamespace() const override
    {
        return mNamespace.c_str();
    }

private:
    std::string mNamespace;
    std::string mPluginName;
    PluginFieldCollection mFieldCollection{0, nullptr};
};
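
For reference, the ONNX parser finds this creator through getPluginCreator, which requires the creator to be registered with TensorRT's plugin registry. One standard way to do that is the REGISTER_TENSORRT_PLUGIN macro from NvInferRuntime.h, sketched below (the registration call itself is not part of the code above and is only shown as a reference):

//Registers CTCGreedyDecoderCreator with the global plugin registry at library load time,
//so that getPluginCreator("CTCGreedyDecoder", "1") succeeds during ONNX import.
REGISTER_TENSORRT_PLUGIN(CTCGreedyDecoderCreator);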

Hi @edit_or,
Could you please share your model so that we can assist you better?

Thanks!

Here is the model.

Thank you.

Hi @edit_or,
I tried running your model and am getting a different error.


Can you validate this by running your model with the command below?
trtexec --onnx=recg_sparsetodenseremoved.onnx --verbose --explicitBatch

Thanks!

Yes, I get the following error:

[08/28/2020-16:56:14] [V] [TRT] ModelImporter.cpp:179: Fill [Expand] outputs: [Fill:0 -> (-1)], 
[08/28/2020-16:56:14] [V] [TRT] ModelImporter.cpp:103: Parsing node: CTCGreedyDecoder [CTCGreedyDecoder]
[08/28/2020-16:56:14] [V] [TRT] ModelImporter.cpp:119: Searching for input: transpose:0
[08/28/2020-16:56:14] [V] [TRT] ModelImporter.cpp:119: Searching for input: Fill:0
[08/28/2020-16:56:14] [V] [TRT] ModelImporter.cpp:125: CTCGreedyDecoder [CTCGreedyDecoder] inputs: [transpose:0 -> (-1, -1, -1)], [Fill:0 -> (-1)], 
[08/28/2020-16:56:14] [I] [TRT] ModelImporter.cpp:135: No importer registered for op: CTCGreedyDecoder. Attempting to import as plugin.
[08/28/2020-16:56:14] [I] [TRT] builtin_op_importers.cpp:3659: Searching for plugin: CTCGreedyDecoder, plugin_version: 1, plugin_namespace: 
[08/28/2020-16:56:14] [E] [TRT] INVALID_ARGUMENT: getPluginCreator could not find plugin CTCGreedyDecoder version 1
ERROR: builtin_op_importers.cpp:3661 In function importFallbackPluginImporter:
[8] Assertion failed: creator && "Plugin not found, are the plugin name, version, and namespace correct?"
[08/28/2020-16:56:14] [E] Failed to parse onnx file
[08/28/2020-16:56:14] [E] Parsing model failed
[08/28/2020-16:56:14] [E] Engine creation failed
[08/28/2020-16:56:14] [E] Engine set up failed
&&&& FAILED TensorRT.trtexec # ../../bin/trtexec --onnx=recg_sparsetodenseremoved.onnx --verbose --explicitBatch

That is the error without the plugin, so I created the plugin shown in the very first post.
I followed the samples sampleUffPluginV2Ext and sampleOnnxMNIST. With the plugin, the error is

PluginV2Layer must be V2DynamicExt when there are runtime input dimensions.

My query is: where are the runtime input dimensions in my plugin?
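
For context, the verbose log above reports the node's inputs as (-1, -1, -1) and (-1), and the V2DynamicExt mentioned in the error is the IPluginV2DynamicExt interface, whose shape-related methods work with expression-based dimensions instead of fixed Dims. Below is a rough sketch of its main overrides, assuming the TensorRT 7.x NvInfer.h / NvInferRuntime.h declarations (exact signatures can differ slightly between versions); this is only a reference outline, not a working implementation:

#include <NvInfer.h>
#include <cuda_runtime_api.h>

//Sketch only: declarations of the dynamic-shape counterparts of the IPluginV2IOExt methods above.
//The other IPluginV2Ext/IPluginV2 methods (clone, getOutputDataType, serialize, etc.) are still required.
class CTCGreedyDecoderDynamic : public nvinfer1::IPluginV2DynamicExt
{
public:
    //Output dimensions are built from symbolic expressions, so they may depend on runtime input dims
    nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs,
        int nbInputs, nvinfer1::IExprBuilder& exprBuilder) override;

    bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut,
        int nbInputs, int nbOutputs) override;

    //DynamicPluginTensorDesc carries min/max shapes instead of a single fixed shape
    void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
        const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) override;

    size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
        const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const override;

    //Actual shapes are read from inputDesc/outputDesc at enqueue time
    int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc,
        const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) override;
};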