OOM error with Shuffle and TopK layers in TensorRT 7.0

I implemented a CRNN model (see https://arxiv.org/abs/1507.05717) in C++ in the TensorRT samples folder. When the batch size was set to 10, I got the errors below:

[02/21/2020-06:51:45] [E] [TRT] …/rtSafe/safeRuntime.cpp (25) - Cuda Error in allocate: 2 (out of memory)
[02/21/2020-06:51:45] [W] [TRT] GPU memory allocation error during getBestTactic: (Unnamed Layer* 119) [Shuffle]
[02/21/2020-06:51:45] [E] [TRT] …/rtSafe/safeRuntime.cpp (25) - Cuda Error in allocate: 2 (out of memory)
[02/21/2020-06:51:45] [W] [TRT] GPU memory allocation error during timeReformat.
[02/21/2020-06:51:45] [E] [TRT] …/builder/cudnnBuilderUtils.cpp (360) - Cuda Error in findFastestTactic: 2 (out of memory)
[02/21/2020-06:51:45] [W] [TRT] GPU memory allocation error during getBestTactic: (Unnamed Layer* 120) [TopK]
[02/21/2020-06:51:45] [E] [TRT] Internal error: could not find any implementation for node (Unnamed Layer* 120) [TopK], try increasing the workspace size with IBuilder::setMaxWorkspaceSize()
[02/21/2020-06:51:45] [E] [TRT] …/builder/tacticOptimizer.cpp (1523) - OutOfMemory Error in computeCosts: 0

Platform details are:
o Ubuntu 16.04.5 LTS
o GeForce RTX 2080ti
o NVIDIA driver 430.40
o CUDA 10.0
o cuDNN 7.6.5
o TensorRT 7.0.0.11

setMaxWorkspaceSize was set to 8 GB.
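For reference, I set that limit directly on the builder just before building the engine (see the code further down). As far as I understand, TensorRT 7 also exposes the same limit through an IBuilderConfig; a minimal sketch of that route is below (the helper name buildWithWorkspace is made up for illustration, not part of my code):

[code]#include <cstddef>
#include "NvInfer.h"

// Illustrative helper (made-up name): build an engine with an explicit workspace
// limit using the TensorRT 7 IBuilderConfig API instead of the legacy builder call.
nvinfer1::ICudaEngine* buildWithWorkspace(nvinfer1::IBuilder* builder,
                                          nvinfer1::INetworkDefinition* network,
                                          std::size_t workspaceBytes)
{
    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
    config->setMaxWorkspaceSize(workspaceBytes);            // e.g. 8ULL << 30 for 8 GiB
    nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    config->destroy();
    return engine;                                          // nullptr if the build fails
}
[/code]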

If I switch to TensorRT 5.0/6.0, a batch size of 80 works fine with the same code and model.

Are the optimization algorithms for the Shuffle and TopK ops different between the latest 7.0 and the older versions?

Thanks!

Hi,

Could you please share the repro script and model file so we can help better?

Thanks

Hello,

The core function is below:

[code]ICudaEngine* createMNISTEngine(int maxBatchSize, IBuilder* builder, DataType dt, size_t input_h, size_t input_w, size_t label_cnt, std::string model_weights)
{   
    INetworkDefinition* network = builder->createNetwork();
    
    std::map<std::string, Weights> weightMap = loadWeights(model_weights);
    std::cout << "load weights finished" << std::endl;
    
    // Create input tensor with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims4{1, 1, static_cast<int>(input_h), static_cast<int>(input_w)});
    assert(data);
    
    // Create scale layer with default power/shift and specified scale parameter.
    const float scaleParam = 1; // 0.00390625;
    const Weights power{DataType::kFLOAT, nullptr, 0}; 
    const Weights shift{DataType::kFLOAT, nullptr, 0}; 
    const Weights scale{DataType::kFLOAT, &scaleParam, 1};
    IScaleLayer* scale_1 = network->addScale(*data, ScaleMode::kUNIFORM, shift, scale, power);
    assert(scale_1);
    
    // CNN backbone
    // conv0_1
    scale_1->getOutput(0)->setName("conv0_1_input");
    IConvolutionLayer* conv0_1 = network->addConvolution(*scale_1->getOutput(0), 32, DimsHW{3, 3}, weightMap["FeatureExtraction.ConvNet.conv0_1.weight"], weightMap["FeatureExtraction.ConvNet.conv0_1.bias"]);
    assert(conv0_1);
    conv0_1->setStride(DimsHW{1, 1});
    conv0_1->setPadding(DimsHW{1, 1});
    
    // bn0_1
    IScaleLayer* bn0_1 = network->addScale(*conv0_1->getOutput(0), ScaleMode::kCHANNEL, weightMap["FeatureExtraction.ConvNet.bn0_1.shift"], weightMap["FeatureExtraction.ConvNet.bn0_1.scale"], power);
    assert(bn0_1);
    
    IActivationLayer* relu0_1 = network->addActivation(*bn0_1->getOutput(0), ActivationType::kRELU);
    assert(relu0_1);
    
    // conv0_2 (the declaration was missing from the posted snippet; reconstructed by
    // analogy with conv0_1 -- the output channel count of 64 is an assumption)
    IConvolutionLayer* conv0_2 = network->addConvolution(*relu0_1->getOutput(0), 64, DimsHW{3, 3}, weightMap["FeatureExtraction.ConvNet.conv0_2.weight"], weightMap["FeatureExtraction.ConvNet.conv0_2.bias"]);
    assert(conv0_2);
    conv0_2->setStride(DimsHW{1, 1});
    conv0_2->setPadding(DimsHW{1, 1});
    IScaleLayer* bn0_2 = network->addScale(*conv0_2->getOutput(0), ScaleMode::kCHANNEL, weightMap["FeatureExtraction.ConvNet.bn0_2.shift"], weightMap["FeatureExtraction.ConvNet.bn0_2.scale"], power);
    assert(bn0_2);
    
    // relu0_2 + pool1
    // (these declarations were missing from the posted snippet; reconstructed following
    // the conv0_1 pattern -- the ReLU after bn0_2 is an assumption)
    IActivationLayer* relu0_2 = network->addActivation(*bn0_2->getOutput(0), ActivationType::kRELU);
    assert(relu0_2);
    // Add max pooling layer with stride of 2x2 and kernel size of 2x2.
    IPoolingLayer* pool1 = network->addPooling(*relu0_2->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    assert(pool1);
    pool1->setStride(DimsHW{2, 2});
    pool1->setPadding(DimsHW{0, 0});

    // layer1
    ITensor* layer1 = getLayer(network, weightMap, *pool1->getOutput(0), 128, 1, 1, true);

    // conv1 + bn1 + relu
    IConvolutionLayer* conv1 = network->addConvolution(*layer1, 128, DimsHW{3, 3}, weightMap["FeatureExtraction.ConvNet.conv1.weight"], weightMap["FeatureExtraction.ConvNet.conv1.bias"]);
    assert(conv1);
    conv1->setStride(DimsHW{1, 1});
    conv1->setPadding(DimsHW{1, 1});
    IScaleLayer* bn1 = network->addScale(*conv1->getOutput(0), ScaleMode::kCHANNEL, weightMap["FeatureExtraction.ConvNet.bn1.shift"], weightMap["FeatureExtraction.ConvNet.bn1.scale"], power);
    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(relu1);
    
    // maxpool2
    IPoolingLayer* pool2 = network->addPooling(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    pool2->setStride(DimsHW{2, 2});
    ITensor* layer2 = getLayer(network, weightMap, *pool2->getOutput(0), 256, 2, 2, true);
    
    // conv2 + bn2 + relu2
    // (conv2 declaration missing from the posted snippet; reconstructed by analogy
    // with conv1 -- the 256 output channels are an assumption)
    IConvolutionLayer* conv2 = network->addConvolution(*layer2, 256, DimsHW{3, 3}, weightMap["FeatureExtraction.ConvNet.conv2.weight"], weightMap["FeatureExtraction.ConvNet.conv2.bias"]);
    assert(conv2);
    conv2->setStride(DimsHW{1, 1});
    conv2->setPadding(DimsHW{1, 1});
    IScaleLayer* bn2 = network->addScale(*conv2->getOutput(0), ScaleMode::kCHANNEL, weightMap["FeatureExtraction.ConvNet.bn2.shift"], weightMap["FeatureExtraction.ConvNet.bn2.scale"], power);
    assert(bn2); 
    IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU);
    assert(relu2);
    
    // maxpool3
    IPoolingLayer* pool3 = network->addPooling(*relu2->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    assert(pool3); 
    pool3->setStride(DimsHW{2, 1});
    pool3->setPadding(DimsHW{0, 1});
    
    // layer3
    ITensor* layer3 = getLayer(network, weightMap, *pool3->getOutput(0), 512, 5, 3, true);
    
    // conv3 + bn3 + relu
    IConvolutionLayer* conv3 = network->addConvolution(*layer3, 512, DimsHW{3, 3}, weightMap["FeatureExtraction.ConvNet.conv3.weight"], weightMap["FeatureExtraction.ConvNet.conv3.bias"]);
    assert(conv3);
    conv3->setStride(DimsHW{1, 1});
    conv3->setPadding(DimsHW{1, 1}); // assumed, matching conv1/conv2
    // (bn3 declaration missing from the posted snippet; reconstructed by analogy with bn1/bn2)
    IScaleLayer* bn3 = network->addScale(*conv3->getOutput(0), ScaleMode::kCHANNEL, weightMap["FeatureExtraction.ConvNet.bn3.shift"], weightMap["FeatureExtraction.ConvNet.bn3.scale"], power);
    assert(bn3); 
    IActivationLayer* relu3 = network->addActivation(*bn3->getOutput(0), ActivationType::kRELU);
    assert(relu3);
    // layer4
    // (declaration missing from the posted snippet; reconstructed by analogy with
    // layer1-layer3 -- the block count of 3 is an assumption)
    ITensor* layer4 = getLayer(network, weightMap, *relu3->getOutput(0), 512, 3, 4, true);
    
    // conv4_1 + bn4_1 + conv4_2 + bn4_2
    IConvolutionLayer* conv4_1 = network->addConvolution(*layer4, 512, DimsHW{2, 2}, weightMap["FeatureExtraction.ConvNet.conv4_1.weight"], weightMap["FeatureExtraction.ConvNet.conv4_1.bias"]);
    assert(conv4_1);
    conv4_1->setStride(DimsHW{2, 1});
    conv4_1->setPadding(DimsHW{0, 1});
    IScaleLayer* bn4_1 = network->addScale(*conv4_1->getOutput(0), ScaleMode::kCHANNEL, weightMap["FeatureExtraction.ConvNet.bn4_1.shift"], weightMap["FeatureExtraction.ConvNet.bn4_1.scale"], power);
    assert(bn4_1);
    IActivationLayer* relu4_1 = network->addActivation(*bn4_1->getOutput(0), ActivationType::kRELU);
    assert(relu4_1);
    IConvolutionLayer* conv4_2 = network->addConvolution(*relu4_1->getOutput(0), 512, DimsHW{2, 2}, weightMap["FeatureExtraction.ConvNet.conv4_2.weight"], weightMap["FeatureExtraction.ConvNet.conv4_2.bias"]);
    assert(conv4_2);
    conv4_2->setStride(DimsHW{1, 1});
    conv4_2->setPadding(DimsHW{0, 0});
    IScaleLayer* bn4_2 = network->addScale(*conv4_2->getOutput(0), ScaleMode::kCHANNEL, weightMap["FeatureExtraction.ConvNet.bn4_2.shift"], weightMap["FeatureExtraction.ConvNet.bn4_2.scale"], power);
    assert(bn4_2);
    IActivationLayer* relu4_2 = network->addActivation(*bn4_2->getOutput(0), ActivationType::kRELU);
    
    auto permuted_data = network->addShuffle(*relu4_2->getOutput(0));
    assert(permuted_data);
    permuted_data->setFirstTranspose(nvinfer1::Permutation{0, 3, 1, 2});
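    // In setReshapeDimensions, 0 means "copy this dimension from the input" and -1
    // means "infer it from the remaining elements", so after the transpose the width
    // axis becomes the time axis and the C and H axes are flattened together.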
    permuted_data->setReshapeDimensions(Dims3{0, 0, -1});
    permuted_data->getOutput(0)->setName("visual_features");
    
    // Sequence modeling stage
    size_t hidden_size = 512;
    // Number of time steps fed to the BiLSTMs. The declaration was not in the posted
    // snippet; the formula below is inferred from the pooling/conv strides above and
    // should be treated as an assumption.
    const int times = static_cast<int>(input_w) / 4 + 1;
    std::vector<Weights> weight_ih, weight_hh, bias_ih, bias_hh, weight_ih_reverse, weight_hh_reverse, bias_ih_reverse, bias_hh_reverse;
    
    // 1st BiLSTM
    IRNNv2Layer* bilstm1 = network->addRNNv2(*permuted_data->getOutput(0), 1, hidden_size, times, RNNOperation::kLSTM);
    assert(bilstm1);
    bilstm1->getOutput(0)->setName("BiLSTM1");
    weight_ih.push_back(weightMap["SequenceModeling.0.rnn.weight_ih_l0"]);
    weight_hh.push_back(weightMap["SequenceModeling.0.rnn.weight_hh_l0"]);
    bias_ih.push_back(weightMap["SequenceModeling.0.rnn.bias_ih_l0"]);
    bias_hh.push_back(weightMap["SequenceModeling.0.rnn.bias_hh_l0"]);
    weight_ih_reverse.push_back(weightMap["SequenceModeling.0.rnn.weight_ih_l0_reverse"]);
    weight_hh_reverse.push_back(weightMap["SequenceModeling.0.rnn.weight_hh_l0_reverse"]);
    bias_ih_reverse.push_back(weightMap["SequenceModeling.0.rnn.bias_ih_l0_reverse"]);
    bias_hh_reverse.push_back(weightMap["SequenceModeling.0.rnn.bias_hh_l0_reverse"]);
    addBiLSTM(bilstm1, hidden_size, hidden_size, 1, weight_ih, weight_hh, bias_ih, bias_hh, weight_ih_reverse, weight_hh_reverse, bias_ih_reverse, bias_hh_reverse);
    auto bilstm1_reshape = network->addShuffle(*bilstm1->getOutput(0));
    assert(bilstm1_reshape);
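    // 1024 = 2 * hidden_size (the forward and backward LSTM outputs are concatenated);
    // the (N, 1024, 1, 1) shape is the CHW-style input that addFullyConnected expects.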
    bilstm1_reshape->setReshapeDimensions(Dims4{-1, 1024, 1, 1});
    auto bilstm1_linear = network->addFullyConnected(*bilstm1_reshape->getOutput(0), 512, weightMap["SequenceModeling.0.linear.weight"], weightMap["SequenceModeling.0.linear.bias"]);
    auto bilstm1_linear_reshape = network->addShuffle(*bilstm1_linear->getOutput(0));
    assert(bilstm1_linear_reshape);
    bilstm1_linear_reshape->setReshapeDimensions(Dims3{-1, (int)times, 512});
    
    // 2nd BiLSTM
    IRNNv2Layer* bilstm2 = network->addRNNv2(*bilstm1_linear_reshape->getOutput(0), 1, hidden_size, times, RNNOperation::kLSTM);
    assert(bilstm2);
    weight_ih.clear();
    weight_ih.push_back(weightMap["SequenceModeling.1.rnn.weight_ih_l0"]);
    weight_hh.clear();
    weight_hh.push_back(weightMap["SequenceModeling.1.rnn.weight_hh_l0"]);
    bias_ih.clear();
    bias_ih.push_back(weightMap["SequenceModeling.1.rnn.bias_ih_l0"]);
    bias_hh.clear();
    bias_hh.push_back(weightMap["SequenceModeling.1.rnn.bias_hh_l0"]);
    weight_ih_reverse.clear();
    weight_ih_reverse.push_back(weightMap["SequenceModeling.1.rnn.weight_ih_l0_reverse"]);
    weight_hh_reverse.clear();
    weight_hh_reverse.push_back(weightMap["SequenceModeling.1.rnn.weight_hh_l0_reverse"]);
    bias_ih_reverse.push_back(weightMap["SequenceModeling.1.rnn.bias_ih_l0_reverse"]);
    bias_hh_reverse.push_back(weightMap["SequenceModeling.1.rnn.bias_hh_l0_reverse"]);
    addBiLSTM(bilstm2, hidden_size, hidden_size, 1, weight_ih, weight_hh, bias_ih, bias_hh, weight_ih_reverse, weight_hh_reverse, bias_ih_reverse, bias_hh_reverse);
    auto bilstm2_reshape = network->addShuffle(*bilstm2->getOutput(0));
    assert(bilstm2_reshape);
    bilstm2_reshape->setReshapeDimensions(Dims4{-1, 1024, 1, 1});
    auto bilstm2_linear = network->addFullyConnected(*bilstm2_reshape->getOutput(0), 512, weightMap["SequenceModeling.1.linear.weight"], weightMap["SequenceModeling.1.linear.bias"]);
    assert(bilstm2_linear);
    auto bilstm2_linear_reshape = network->addShuffle(*bilstm2_linear->getOutput(0));
    assert(bilstm2_linear_reshape);
    bilstm2_linear_reshape->setReshapeDimensions(Dims4{-1, 512, 1, 1});
    
    // Prediction Stage
    auto prd_linear = network->addFullyConnected(*bilstm2_linear_reshape->getOutput(0), label_cnt, weightMap["Prediction.weight"], weightMap["Prediction.bias"]);
    
    ISoftMaxLayer* prob = network->addSoftMax(*prd_linear->getOutput(0));
    assert(prob);
    
    auto permuted_output = network->addShuffle(*prob->getOutput(0));
    assert(permuted_output);
    permuted_output->setReshapeDimensions(Dims3{-1, (int)times, (int)label_cnt});
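    // addTopK with k = 1: reduceAxes = 1<<2 is a bitmask selecting axis 2 (the
    // label_cnt dimension), so the layer emits the best class probability (output 0)
    // and its index (output 1) for each time step.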
    auto prd = network->addTopK(*permuted_output->getOutput(0), nvinfer1::TopKOperation::kMAX, 1, 1<<2);
    auto output_layer = prd;
    
    // output prob
    output_layer->getOutput(0)->setName(OUTPUT_BLOB_PROB);
    network->markOutput(*output_layer->getOutput(0));
    // output index
    output_layer->getOutput(1)->setName(OUTPUT_BLOB_INDEX);
    network->markOutput(*output_layer->getOutput(1));
    output_layer->getOutput(1)->setType(DataType::kINT32);
    
    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    std::cout << "===== maxBatchSize: " << maxBatchSize << std::endl;
    // maximum workspace memory the builder may use (8 GB)
    builder->setMaxWorkspaceSize(8000000000ULL);
    ICudaEngine* engine = builder->buildCudaEngine(*network);
    
    // Don't need the network any more
    network->destroy();
    
    // Release host memory
    for (auto& mem: weightMap)
    {
        free((void*) (mem.second.values));
    }   
    return engine;
}

[/code]

According to the error messages, it seems that the “network->addShuffle” and “network->addTopK” calls in the sequence modeling and prediction stages above are the ones running out of memory.

Thanks.

Hi SunilJB, the network’s details are shown in the code above.

Any suggestion would be greatly appreciated.

Thanks

Hi,

Can you try using different workspace sizes (like 2 GB, 4 GB, 6 GB, 8 GB, 10 GB)?
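For example, something along these lines (a rough sketch only; it assumes the builder and network are already set up as in your createMNISTEngine function, and the sizes/loop are just for illustration):

[code]// Sweep a few workspace limits and report whether the engine builds.
for (size_t gb : {2, 4, 6, 8, 10})
{
    builder->setMaxWorkspaceSize(gb << 30);   // gb GiB
    nvinfer1::ICudaEngine* engine = builder->buildCudaEngine(*network);
    std::cout << gb << " GiB workspace -> " << (engine ? "built OK" : "failed/OOM") << std::endl;
    if (engine)
        engine->destroy();
}
[/code]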

Thanks

Hello SunilJB,

The results with TensorRT 7 are as below:

workspace -> max batch size (before OOM)
10 GB -> 2
8 GB -> 7
6 GB -> 13
4 GB -> 19
2 GB -> 24
1 GB -> 27
Thanks.

Hi,

Could you please provide a repro that we can run and debug into? With only the code segment it’s difficult to root-cause the issue.

Thanks