cudnn error when using pooling layer

hi,
I am using tensorrt c++ api on my px2, but an error occurred when adding the pooling layer(it’s ok when adding convolution layer).

My enviroment:
CUDA 9.2.78, CUDNN 7.1.2, TensorRT 4.1.1, Ubuntu 16.04

The error:

ERROR: cudnnPoolingLayer.cpp (130) - Cudnn Error in execute: 3
ERROR: cudnnPoolingLayer.cpp (130) - Cudnn Error in execute: 3
build newwork finsih

My test code:

ICudaEngine* createNetworkEngine(IBuilder* builder)
{
    INetworkDefinition* network = builder->createNetwork();
    ITensor* data_input = network->addInput("data", DataType::kFLOAT, Dims4{1, 3, 4, 8});
    assert(data_input);

    IPoolingLayer* pool = network->addPooling(*data_input, PoolingType::kMAX, DimsHW{3, 3});
    assert(pool);
    pool->setStride(DimsHW{2, 2});
    pool->setPadding(DimsHW{1, 1});
    pool->setName("pool");

    ILayer* output = pool;
    output->getOutput(0)->setName(config.output_name.c_str());
    network->markOutput(*output->getOutput(0));
    output->getOutput(0)->setType(DataType::kFLOAT);


    // Build engine
    std::cout << "****** building network ***********" << std::endl;
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(1 << 20);
    builder->setAverageFindIterations(1);
    builder->setMinFindIterations(1);
    builder->setDebugSync(true);

    ICudaEngine* engine = builder->buildCudaEngine(*network);
    std::cout << "build newwork finsih" << std::endl;
    network->destroy();
    return engine;
}

any suggestion?
thanks!!!

Dear jolly.ming2005,
I am not able to reproduce this is on my DRIVE PX 2. I had tested using addPooling with input data inside sampleMNISTAPI sample without any runtime issue.

hi SivaRamaKrishna,

Thanks for your reply.

I tested the sampleMNISTAPI sample, and it’s successful.
For my test code, it’s ok if the input shape is Dims3{3,4,8}, but it failed if the input shape is Dims4{1,3,4,8}

The pooling layer doesn’t support Dims4(DimsNCHW)?

Dear jolly.ming2005,
Could you please provide complete TRT sample to reproduce this issue.

hi SivaRamaKrishna,

My complete TRT sample:

#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include <cassert>
#include <string>
#include <fstream>
#include <iostream>
#include <sstream>

using namespace nvinfer1;

// Logger for TensorRT info/warning/errors
class Logger : public nvinfer1::ILogger
{
public:
    Logger(): Logger(Severity::kWARNING) {}
    Logger(Severity severity): reportableSeverity(severity) {}
    void log(Severity severity, const char* msg) override
    {
        // suppress messages with severity enum value greater than the reportable
        if (severity > reportableSeverity) return;
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
        case Severity::kERROR: std::cerr << "ERROR: "; break;
        case Severity::kWARNING: std::cerr << "WARNING: "; break;
        case Severity::kINFO: std::cerr << "INFO: "; break;
        default: std::cerr << "UNKNOWN: "; break;
        }
        std::cerr << msg << std::endl;
    }

    Severity reportableSeverity{Severity::kWARNING};
};

static Logger gLogger;

void displayITensorShape(ITensor* tensor, std::string name)
{
    std::cout << name<<": ";
    Dims dims = tensor->getDimensions();
    for(int k = 0; k < dims.nbDims; ++k){
        std::cout<<dims.d[k]<<"*";
    }
    std::cout<< std::endl;
}

ICudaEngine* createNetworkEngine(unsigned int maxBatchSize, IBuilder* builder, DataType data_type)
{
    INetworkDefinition* network = builder->createNetwork();
    // Dims4: ERROR: cudnnPoolingLayer.cpp (130) - Cudnn Error in execute: 3 
    ITensor* data_input = network->addInput("data", data_type, Dims4{1, 3, 4, 8});
    // Dims3: successful
    //ITensor* data_input = network->addInput("data", data_type, Dims3{3, 4, 8});
    assert(data_input);

    IPoolingLayer* pool = network->addPooling(*data_input, PoolingType::kMAX, DimsHW{2, 2});
    assert(pool);
    pool->setStride(DimsHW{2, 2});
    //pool->setPadding(DimsHW{1, 1});
    pool->setName("pool");

ILayer* output = pool;
    output->getOutput(0)->setName("test_pool");
    network->markOutput(*output->getOutput(0));
    output->getOutput(0)->setType(DataType::kFLOAT);

    displayITensorShape(data_input, "input shape");
    displayITensorShape(output->getOutput(0), "output shape");

    // Build engine
    std::cout << "****** building network ***********" << std::endl;
    builder->setMaxBatchSize(1);
    builder->setMaxWorkspaceSize(1 << 20);
    //builder->setAverageFindIterations(1);
    //builder->setMinFindIterations(1);
    //builder->setDebugSync(true);

std::cout <<"MaxWorkspaceSize: " << builder->getMaxWorkspaceSize() << std::endl;
    ICudaEngine* engine = builder->buildCudaEngine(*network);
    std::cout << "build newwork finsih" << std::endl;
    network->destroy();
    return engine;
}

void APIToModel(int maxBatchSize, const char* plan_file)
{
    cudaSetDevice(0);
    IHostMemory* modelStream{nullptr};
    IBuilder* builder = createInferBuilder(gLogger);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createNetworkEngine(maxBatchSize, builder, DataType::kFLOAT);
    assert(engine != nullptr);

    modelStream = engine->serialize();
    // Serialize the engine
    if(plan_file){
        std::stringstream gieModelStream;
	gieModelStream.seekg(0, gieModelStream.beg);
        gieModelStream.write((const char*)modelStream->data(), modelStream->size());

        std::ofstream outFile;
	outFile.open(plan_file);
	outFile << gieModelStream.rdbuf();
	outFile.close();
	gieModelStream.seekg(0, gieModelStream.beg);
    }
    modelStream->destroy();

    // Close everything down
    engine->destroy();
    builder->destroy();
}

int main(int argc, char** argv)
{
    int maxBatchSize = 1;
    std::string plan_file = "./model_trt_test_pool.plan";
    APIToModel(maxBatchSize, plan_file.c_str());
    std::cout << "FINISH: " << plan_file << std::endl;
    return 1;    

}

line 52: input shape is Dims4
line 54: input shape is Dims3

I can reproduce this issue as follows:

  1. run the following commond line to generate test_pool file:
sudo g++ build_trt_test_pool.cpp -std=c++11 -L/usr/local/cuda/lib64 -lcudart -L/usr/lib/aarch64-linux-gnu/ -lnvinfer -I/usr/local/cuda/include -o test_pool
  1. run the test_pool file:
sudo ./test_pool

when the input shape is Dime3(line54), success!!!, log:

input shape: 3*4*8*
output shape: 3*2*4*
****** building network ***********
MaxWorkspaceSize: 1048576
build newwork finsih
FINISH: ./model_trt_test_pool.plan

when the input shape is Dime4(line52), failed!!!, log:

input shape: 1*3*4*8*
output shape: 1*3*2*4*
****** building network ***********
MaxWorkspaceSize: 1048576
ERROR: cudnnPoolingLayer.cpp (130) - Cudnn Error in execute: 3
ERROR: cudnnPoolingLayer.cpp (130) - Cudnn Error in execute: 3
build newwork finsih
test_pool: build_trt_test_pool.cpp:97: void APIToModel(int, const char*): Assertion `engine != nullptr' failed.
Aborted (core dumped)

thanks!!!

Dear jolly.ming2005,
I am able to reproduce this issue on Drive PX2 with TensorRT 4.1.1. This issue is fixed in TRT 5.0. But unfortunately the last TRT release for Drive PX2 is TensorRT 4.1.1. All Drive releases are targeted for DRIVE AGX platform now. Please consider upgrading to DRIVE AGX platform to get latest DRIVE SW.