I have made a plugin with a fixed-size input, but I get the following errors:
[08/27/2020-15:00:52] [W] [TRT] onnx2trt_utils.cpp:220: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[08/27/2020-15:00:52] [I] [TRT] ModelImporter.cpp:135: No importer registered for op: CTCGreedyDecoder. Attempting to import as plugin.
[08/27/2020-15:00:52] [I] [TRT] builtin_op_importers.cpp:3659: Searching for plugin: CTCGreedyDecoder, plugin_version: 1, plugin_namespace:
[08/27/2020-15:00:52] [I] [TRT] builtin_op_importers.cpp:3676: Successfully created plugin: CTCGreedyDecoder
[08/27/2020-15:00:52] [E] [TRT] CTCGreedyDecoder: PluginV2Layer must be V2DynamicExt when there are runtime input dimensions.
[08/27/2020-15:00:52] [E] [TRT] CTCGreedyDecoder: PluginV2Layer must be V2DynamicExt when there are runtime input dimensions.
[08/27/2020-15:00:52] [E] [TRT] CTCGreedyDecoder: PluginV2Layer must be V2DynamicExt when there are runtime input dimensions.
[08/27/2020-15:00:52] [E] [TRT] Layer CTCGreedyDecoder failed validation
[08/27/2020-15:00:52] [E] [TRT] Network validation failed.
I can't find where runtime input dimensions are set in my plugin code. My plugin is shown below. Where are the runtime input dimensions coming from?
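To narrow this down, I am trying to print the input dimensions of the parsed network before building; as far as I understand, any axis reported as -1 is a runtime (dynamic) dimension, which is what triggers the V2DynamicExt requirement. A rough helper along these lines (printNetworkInputDims is just a name I made up, not tested):

#include <iostream>
#include "NvInfer.h"

//Rough check: print every network input after parsing the ONNX model.
//Any axis printed as -1 is treated by TensorRT as a runtime dimension.
void printNetworkInputDims(nvinfer1::INetworkDefinition* network)
{
    for (int i = 0; i < network->getNbInputs(); ++i)
    {
        nvinfer1::ITensor* input = network->getInput(i);
        nvinfer1::Dims dims = input->getDimensions();
        std::cout << input->getName() << ":";
        for (int j = 0; j < dims.nbDims; ++j)
        {
            std::cout << " " << dims.d[j];//-1 means runtime dimension
        }
        std::cout << std::endl;
    }
}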
class CTCGreedyDecoder : public IPluginV2IOExt
{
public:
//CTCGreedyDecoder has two inputs (import/transpose:0 (88, 1, 43), import/Fill:0 (1,))
//and two outputs (import/CTCGreedyDecoder:0 (7, 2), import/ToInt32:0 (7,))
CTCGreedyDecoder(const PluginFieldCollection& fc)
{
(void) fc;
}
//data holds the member variables written out by
//void serialize(void* buffer) const override
//length is the size of that serialized buffer
//This constructor deserializes the data back into the members
CTCGreedyDecoder(const void* data, size_t length)
{
const char* d = static_cast<const char*>(data);
const char* const a = d;
mInputDims_1.nbDims = read<int>(d);
for (int i = 0; i < mInputDims_1.nbDims; ++i)
{
mInputDims_1.d[i] = read<int>(d);
}
mInputDims_2.nbDims = read<int>(d);
for (int i = 0; i < mInputDims_2.nbDims; ++i)
{
mInputDims_2.d[i] = read<int>(d);
}
mOutputDims.nbDims = read<int>(d);
for (int i = 0; i < mOutputDims.nbDims; ++i)
{
mOutputDims.d[i] = read<int>(d);
}
mDataType = static_cast<DataType>(read<int>(d));
if (mDataType == DataType::kINT8)
{
mInHostScale = read<float>(d);
mOutHostScale = read<float>(d);
}
assert(d == a + length);
}
// It makes no sense to construct CTCGreedyDecoder without arguments.
CTCGreedyDecoder() = delete;
virtual ~CTCGreedyDecoder() {}
public:
int getNbOutputs() const override
{
return 1;//it has one output
}
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
{
std::cout << "nbInputDims " << nbInputDims << std::endl;//for debugging
assert(index == 0 && nbInputDims == 2 && inputs[0].nbDims == 3 && inputs[1].nbDims == 1);
//CTCGreedyDecoder actually has a dynamic output shape that varies with the text length,
//but here a fixed output shape is used instead.
//20 is chosen because a license-plate text is shorter than 20 characters.
//This plugin combines CTCGreedyDecoder and SparseToDense,
//so the output is taken from SparseToDense.
//The output size is fixed and independent of the input size, so a fixed size is returned.
return Dims2(1, 20);
}
int initialize() override
{
return 0;
}
void terminate() override
{
//To release memory
}
size_t getWorkspaceSize(int maxBatchSize) const override
{
return 0;
}
int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override
{
int rows = mInputDims_1.d[0];
int batch = mInputDims_1.d[1];
int widths = mInputDims_1.d[2];
float* output = reinterpret_cast<float*>(outputs[0]);
interface(stream, inputs[0], output, rows, batch, widths);
return 0;
}
//All private member variables are serialized, one after another,
//by the following two methods.
//getSerializationSize() returns the total size of the serialized members.
size_t getSerializationSize() const override
{
size_t serializationSize = 0;
serializationSize += sizeof(mInputDims_1.nbDims);
serializationSize += sizeof(mInputDims_1.d[0]) * mInputDims_1.nbDims;
serializationSize += sizeof(mInputDims_2.nbDims);
serializationSize += sizeof(mInputDims_2.d[0]) * mInputDims_2.nbDims;
serializationSize += sizeof(mOutputDims.nbDims);
serializationSize += sizeof(mOutputDims.d[0]) * mOutputDims.nbDims;
serializationSize += sizeof(static_cast<int>(mDataType));
if (mDataType == DataType::kINT8)
{
serializationSize += sizeof(float) * 2;
}
return serializationSize;
}
//serialize to char pointer
void serialize(void* buffer) const override
{
char* d = static_cast<char*>(buffer);
const char* const a = d;
write(d, mInputDims_1.nbDims);
assert(mInputDims_1.nbDims <= mInputDims_1.MAX_DIMS);
for (int i = 0; i < mInputDims_1.nbDims; ++i)
{
write(d, mInputDims_1.d[i]);
}
write(d, mInputDims_2.nbDims);
assert(mInputDims_2.nbDims <= mInputDims_2.MAX_DIMS);
for (int i = 0; i < mInputDims_2.nbDims; ++i)
{
write(d, mInputDims_2.d[i]);
}
write(d, mOutputDims.nbDims);
assert(mOutputDims.nbDims <= mOutputDims.MAX_DIMS);
for (int i = 0; i < mOutputDims.nbDims; ++i)
{
write(d, mOutputDims.d[i]);
}
write(d, static_cast<int>(mDataType));
if (mDataType == DataType::kINT8)
{
write(d, mInHostScale);
write(d, mOutHostScale);
}
assert(d == a + getSerializationSize());
}
//Plugin configuration for the input/output types, formats and sizes.
//PluginTensorDesc describes the fields a plugin sees for an input or output;
//it has four attributes (Dims, DataType, TensorFormat, float scale),
//so all of them can be asserted against the requirements
//and every input/output type can be checked against expectations.
//For this CTCGreedyDecoder
//the first input is a 3D tensor
//and the second is a 1D tensor.
void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override
{
assert(in && nbInput == 2);
assert(out && nbOutput == 1);
assert(in[0].type == out[0].type);
assert(in[0].format == TensorFormat::kLINEAR && out[0].format == TensorFormat::kLINEAR);
mDataType = in[0].type;
mInputDims_1 = in[0].dims;//Now batch size is set to 1. one image by one image
mInputDims_2 = in[1].dims;//Now batch size is set to 1. one image by one image
mOutputDims = out[0].dims;
mInHostScale = in[0].scale >= 0.0f ? in[0].scale : -1.0f;
mOutHostScale = out[0].scale >= 0.0f ? out[0].scale : -1.0f;
return ;
}
//! The combination of kLINEAR + kINT8/kHALF/kFLOAT is supported.
bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override
{
assert(nbInputs == 2 && nbOutputs == 1 && pos < nbInputs + nbOutputs);//this plugin has two inputs and one output
bool condition = inOut[pos].format == TensorFormat::kLINEAR;
condition &= inOut[pos].type != DataType::kINT32;
condition &= inOut[pos].type == inOut[0].type;
return condition;
}
DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const override
{
assert(inputTypes && nbInputs == 2);
(void) index;
return inputTypes[0];
}
const char* getPluginType() const override
{
return "CTCGreedyDecoder";
}
const char* getPluginVersion() const override
{
return "1";
}
void destroy() override
{
delete this;
}
IPluginV2Ext* clone() const override
{
auto* plugin = new CTCGreedyDecoder(*this);
return plugin;
}
void setPluginNamespace(const char* libNamespace) override
{
mNamespace = libNamespace;
}
const char* getPluginNamespace() const override
{
return mNamespace.c_str();
}
bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override
{
return false;
}
bool canBroadcastInputAcrossBatch(int inputIndex) const override
{
return false;
}
private:
template <typename T>
void write(char*& buffer, const T& val) const
{
*reinterpret_cast<T*>(buffer) = val;
buffer += sizeof(T);
}
template <typename T>
T read(const char*& buffer) const
{
T val = *reinterpret_cast<const T*>(buffer);
buffer += sizeof(T);
return val;
}
//input is (88, 1, 43)
/*void copyDeviceInputToFP32(const void* src, void*& dst)
{
assert(mDataType == DataType::kINT8);
size_t inCount_1 = getC(mInputDims_1) * getH(mInputDims_1) * getW(mInputDims_1);
std::cout << "getC(mInputDims_1) " << getC(mInputDims_1) << " getH(mInputDims_1) " << getH(mInputDims_1) << " getW(mInputDims_1) " << getW(mInputDims_1) << std::endl;//for debugging
std::unique_ptr<char> inputTmp_1{new char[inCount_1 * elementSize(mDataType)]};
CHECK(cudaMemcpy(inputTmp_1.get(), src[0], inCount_1 * elementSize(mDataType), cudaMemcpyDeviceToHost));
std::unique_ptr<float> inputFP32_1{new float[inCount_1]};
transform<DataType::kINT8, DataType::kFLOAT>(inputTmp_1.get(), inputFP32_1.get(), inCount_1);
// int8 scale
int hw = mInputDims_1.d[1] * mInputDims_1.d[2];
for (int j = 0; j < mInputDims_1.d[0]; ++j)
{
std::transform(inputFP32_1.get() + hw * j, inputFP32_1.get() + hw * (j + 1), inputFP32_1.get() + hw * j,
[&](float in) -> float { return in * mInHostScale; });
}
CHECK(cudaMalloc(&dst[0], inCount_1 * elementSize(DataType::kFLOAT)));
CHECK(cudaMemcpy(dst[0], inputFP32_1.get(), inCount_1 * elementSize(DataType::kFLOAT), cudaMemcpyHostToDevice));
}
void copyDeviceToInt8Output(const void* src, void* dst)
{
size_t outCount = getH(mOutputDims) * getW(mOutputDims);
std::cout << " getH(mOutputDims) " << getH(mOutputDims) << std::endl;//for debugging
std::unique_ptr<float> outTmp{new float[outCount]};
CHECK(cudaMemcpy(outTmp.get(), src, outCount * elementSize(DataType::kFLOAT), cudaMemcpyDeviceToHost));
std::unique_ptr<char> outInt8{new char[outCount * elementSize(DataType::kINT8)]};
// int8 + scale
int hw = mOutputDims.d[1] * mOutputDims.d[2];
//for (int j = 0; j < mInputDims_1.d[0]; ++j)//since only one channel
//{
std::transform(outTmp.get() + hw * j, outTmp.get() + hw * (j + 1), outTmp.get() + hw * j,
[&](float in) -> float { return in / mOutHostScale; });
//}
transform<DataType::kFLOAT, DataType::kINT8>(outTmp.get(), outInt8.get(), outCount);
CHECK(cudaMemcpy(dst, outInt8.get(), outCount, cudaMemcpyHostToDevice));
}*/
private:
//This plugin does not need any extra device buffers as members;
//all inputs/outputs arrive through the enqueue() pointers.
//The members below only hold the dimensions, data type and INT8 scales
//that are set in configurePlugin() and serialized with the engine.
DataType mDataType;
Dims mInputDims_1;
Dims mInputDims_2;
Dims mOutputDims;
float mInHostScale{-1.0f};
float mOutHostScale{-1.0f};
std::string mNamespace;
};
class CTCGreedyDecoderCreator : public IPluginCreator
{
public:
const char* getPluginName() const override
{
return "CTCGreedyDecoder";
}
const char* getPluginVersion() const override
{
return "1";
}
const PluginFieldCollection* getFieldNames() override
{
return &mFieldCollection;
}
IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override
{
auto plugin = new CTCGreedyDecoder(*fc);
mFieldCollection = *fc;
mPluginName = name;
return plugin;
}
IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override
{
//serialData is all the data serialized in void serialize(void* buffer) const override
//serialLength is the length of the data serialized in serialize(void* buffer)
auto plugin = new CTCGreedyDecoder(serialData, serialLength);
mPluginName = name;
return plugin;
}
void setPluginNamespace(const char* libNamespace) override
{
mNamespace = libNamespace;
}
const char* getPluginNamespace() const override
{
return mNamespace.c_str();
}
private:
std::string mNamespace;
std::string mPluginName;
PluginFieldCollection mFieldCollection{0, nullptr};
};
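From the error message I guess that, if the network really has a runtime input dimension, the plugin has to derive from IPluginV2DynamicExt instead of IPluginV2IOExt. If that is the case, I think getOutputDimensions would turn into something roughly like the sketch below (not tested, just my reading of the TensorRT 7 IPluginV2DynamicExt API, keeping the fixed 1x20 output):

//Sketch only: the same fixed 1x20 output expressed through the IPluginV2DynamicExt API.
//DimsExprs and IExprBuilder come from the TensorRT 7 headers (NvInferRuntime.h).
DimsExprs getOutputDimensions(int outputIndex, const DimsExprs* inputs,
                              int nbInputs, IExprBuilder& exprBuilder) override
{
    assert(outputIndex == 0 && nbInputs == 2);
    DimsExprs output;
    output.nbDims = 2;
    output.d[0] = exprBuilder.constant(1); //fixed batch dimension
    output.d[1] = exprBuilder.constant(20);//fixed maximum text length
    return output;
}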