Yes, I have set the explicitBatch flag:
nvinfer1::ICudaEngine* getBRNNEngine(asrSample::LSTM::ptr brnn)
{
    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
    builder->setMaxBatchSize(gMaxBatchSize);
    config->setMaxWorkspaceSize(gMaxWorkspaceSize);
    if (gFp16) {
        config->setFlag(nvinfer1::BuilderFlag::kFP16);
        config->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES);
        builder->setFp16Mode(true);
    }
    // Explicit-batch network so the batch and sequence dimensions can be dynamic.
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
        1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
    // Data input is [batch, sequence, 2688]; batch and sequence are dynamic (-1).
    nvinfer1::Dims inputDims{3, {-1, -1, 2688},
        {nvinfer1::DimensionType::kINDEX, nvinfer1::DimensionType::kSEQUENCE, nvinfer1::DimensionType::kCHANNEL}};
    nvinfer1::Dims stateDims = brnn->getStateDims();
    nvinfer1::Dims sequenceLengthDims{0, {}, {}};
    auto inputTensor = network->addInput("encoder_brnn_input_data", nvinfer1::DataType::kFLOAT, inputDims);
    auto sequenceLengthTensor = network->addInput("encoder_sequence_length", nvinfer1::DataType::kINT32, sequenceLengthDims);
    auto hiddenStateTensor = network->addInput("encoder_hidden_state", nvinfer1::DataType::kFLOAT, stateDims);
    auto cellStateTensor = network->addInput("encoder_cell_state", nvinfer1::DataType::kFLOAT, stateDims);
    nvinfer1::ITensor *outputState, *lastHiddenState;
    brnn->addToModel(network, inputTensor, sequenceLengthTensor, hiddenStateTensor, cellStateTensor,
                     &outputState, &lastHiddenState);
    outputState->setName("brnn_output");
    network->markOutput(*outputState);
    // Optimization profile covering the dynamic batch/sequence range of the data input.
    auto profile = builder->createOptimizationProfile();
    profile->setDimensions(inputTensor->getName(), nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims3{1, 1, 2688});
    profile->setDimensions(inputTensor->getName(), nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims3{50, 150, 2688});
    profile->setDimensions(inputTensor->getName(), nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims3{100, 300, 2688});
    config->addOptimizationProfile(profile);
    samplesCommon::enableDLA(builder, config, gUseDLACore);
    auto res = builder->buildEngineWithConfig(*network, *config);
    network->destroy();
    config->destroy();
    builder->destroy();
    return res;
}
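At inference time I drive the engine roughly like this (a minimal sketch based on the input names above; buffer allocation, stream setup and error checking are omitted, and the batch/sequence values are just examples):

    // Sketch only: assumes the engine built above and already-allocated device buffers.
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();
    context->setOptimizationProfile(0);  // select the single profile added at build time

    // Choose the actual shapes for this call; they must lie inside the profile range.
    int batch = 32, seqLen = 200;  // example values
    int inputIndex = engine->getBindingIndex("encoder_brnn_input_data");
    context->setBindingDimensions(inputIndex, nvinfer1::Dims3{batch, seqLen, 2688});
    // (Any other input with dynamic dimensions would need setBindingDimensions as well.)

    // All dynamic input dimensions must be specified before enqueueing.
    assert(context->allInputDimensionsSpecified());
    context->enqueueV2(deviceBindings, stream, nullptr);  // deviceBindings: array of device pointers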
void LSTM::addToModel(
    nvinfer1::INetworkDefinition* network,
    nvinfer1::ITensor* inputData,
    nvinfer1::ITensor* sequenceLength,
    nvinfer1::ITensor* hiddenState,
    nvinfer1::ITensor* cellState,
    nvinfer1::ITensor** outputState,
    nvinfer1::ITensor** lastHiddenState)
{
    // The sequence dimension of the input is dynamic, so no static max length is available here.
    //int maxSeqLen = inputData->getDimensions().d[0];
    int maxSeqLen = -1;
    auto rnn = network->addRNNv2(
        *inputData,
        mNumLayers,
        mHiddenSize,
        maxSeqLen,
        nvinfer1::RNNOperation::kLSTM);
    assert(rnn != nullptr);
    rnn->setInputMode(nvinfer1::RNNInputMode::kLINEAR);
    rnn->setDirection(nvinfer1::RNNDirection::kBIDIRECTION);
    rnn->setSequenceLengths(*sequenceLength);
    // LSTM gate order used when setting the weights: input, forget, cell, output.
    std::vector<nvinfer1::RNNGateType> gateOrder({nvinfer1::RNNGateType::kINPUT,
                                                  nvinfer1::RNNGateType::kFORGET,
                                                  nvinfer1::RNNGateType::kCELL,
                                                  nvinfer1::RNNGateType::kOUTPUT});
    // Weights are laid out 8 per layer: 4 input (W) gates followed by 4 recurrent (R) gates.
    for (size_t i = 0; i < mGateKernelWeights.size(); i++)
    {
        bool isW = ((i % 8) < 4);
        rnn->setWeightsForGate(i / 8, gateOrder[i % 4], isW, mGateKernelWeights[i]);
        rnn->setBiasForGate(i / 8, gateOrder[i % 4], isW, mGateBiasWeights[i]);
    }
    rnn->setHiddenState(*hiddenState);
    rnn->setCellState(*cellState);
    *outputState = rnn->getOutput(0);      // full output sequence
    *lastHiddenState = rnn->getOutput(1);  // final hidden state
}
Also, I find that when using a dynamic batch dimension, memory usage grows very quickly and it is easy to run out of GPU memory.
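As far as I can tell, the memory the engine reserves is driven mainly by the kMAX dimensions of the optimization profile and by the workspace size, so the only way I have found to keep it bounded is to make both as tight as the application allows. The numbers below are only illustrative, not a fix:

    // Illustrative only: tighter worst-case shapes and a capped workspace at build time.
    config->setMaxWorkspaceSize(1ULL << 30);  // e.g. 1 GiB instead of a large gMaxWorkspaceSize
    profile->setDimensions("encoder_brnn_input_data", nvinfer1::OptProfileSelector::kMAX,
                           nvinfer1::Dims3{32, 200, 2688});  // smaller worst case than {100, 300, 2688}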