Add custom MaxPool layer using IPluginFactory

Hello
I am trying to implement my own maxPool layer in order to understand the flow of creating custom Plugins in tensorRT better. I have written an implementation of MaxPool by inheriting the IPlugin. You can find the code below.

class MaxPool: public IPlugin
{
public:
	MaxPool(int kernelSize, int strideLength)
	{
		
		kernel = kernelSize;
		stride = strideLength;
		
	}

	// create the plugin at runtime from a byte stream
	MaxPool()
	{
		kernel = 2;
		stride = 2;
	}

	~MaxPool()
	{
		//cudaFree(const_cast<void*>(mKernelWeights.values));
		//cudaFree(const_cast<void*>(mBiasWeights.values));
	}

	int getNbOutputs() const override
	{
		return 2;
	}

	Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
	{
		assert(index == 0 && nbInputDims == 1 && inputs[0].nbDims == 3);
        out_c = inputs[index].d[0];
        out_h = inputs[index].d[1] / 2;
        out_w = inputs[index].d[2] / 2;
		return DimsCHW(out_c, out_h, out_w);
	}

	void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override
	{
	}

	int initialize() override
	{
		//CHECK(cudnnCreate(&mCudnn));							// initialize cudnn and cublas
		//CHECK(cublasCreate(&mCublas));
		//CHECK(cudnnCreateTensorDescriptor(&mSrcDescriptor));	// create cudnn tensor descriptors we need for bias addition
		//CHECK(cudnnCreateTensorDescriptor(&mDstDescriptor));

		return 0;
	}

	virtual void terminate() override
	{
		//CHECK(cublasDestroy(mCublas));
		//CHECK(cudnnDestroy(mCudnn));
	}

	virtual size_t getWorkspaceSize(int maxBatchSize) const override
	{
		return 0;
	}

	virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override
	{
		
		size_t nn = out_c * out_w * out_h;
		forward_maxpool_layer_gpu(nn, nn, input_h, input_w, input_c, stride, kernel, 0, 
			                                                       (float*) inputs[0], (float*) outputs[0], (float*) outputs[1]);
		return 0;
	}

	virtual size_t getSerializationSize() override
	{
		// 3 integers (number of input channels, number of output channels, bias size), and then the weights:
		//return sizeof(int)*3 + mKernelWeights.count*sizeof(float) + mBiasWeights.count*sizeof(float);
		return 0;
	}

	virtual void serialize(void* buffer) override
	{
		// char* d = reinterpret_cast<char*>(buffer), *a = d;

		// write(d, mNbInputChannels);
		// write(d, mNbOutputChannels);
		// write(d, (int)mBiasWeights.count);
		// serializeFromDevice(d, mKernelWeights);
		// serializeFromDevice(d, mBiasWeights);

		// assert(d == a + getSerializationSize());
	}
private:
	
	
	
	int kernel, stride;
	int out_c, out_w, out_h;
	int input_c, input_w, input_h;
};

I have written two constructors that initializes the kernel size and stride. But i don’t know how to integrate this into IPluginFactory using createPlugin method for sampleMNISTAPI example.
I have following doubts:

  1. I don’t know what does arguments to createPlugin mean ? It will be best if it could be explained in the context of MaxPool layer.
  2. What really is happening in the following line of code:
    ICudaEngine* engine = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size(), &pluginFactory).

FYI, I have gone through the sampleCharRNN code.

Hi lovekesh, see here for the IPluginFactory::createPlugin() function documentation:
http://docs.nvidia.com/deeplearning/sdk/tensorrt-api/topics/classnvinfer1_1_1_i_plugin_factory.html#a00effd31e6370b36a96258e5e56f62db
The parameters received are similar to the serialize/deserialize parameters described below.

The IPlugin::getSerializationSize() function that your plugin implements returns the size in bytes of the serialized block of data that is used to store all the custom state of the layer (trained weights, layer hyperparameters, additional plugin-specific customization, ect). In your getSerializationSize() implementation, you return the size that you need to store the state of your layer. Then when TensorRT needs to serialize/deserialize, it will manage the memory allocation of that state storage block to you, and pass you the buffer to serialize() for you to fill with the layer’s state. The ICudaEngine::deserializeCudaEngine() function is the high-level wrapper function that oversee’s this process for all the layers and plugins in your network.

You may be interested to see this further documentation on the plugin samples here:
https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#plugin_sample

Hi
Thanks for the detailed reply. It really improved my understanding of tensorRT.
After going through code examples, i have modified my plugin implementation of MaxPool but for some reason, i am getting segmentation fault when the code tries to serialize the engine. Here’s my updated implementation of plugin for maxpool.

class MaxPool: public IPlugin
{
public:
	MaxPool(int kernelSize, int strideLength)
	{
		
		kernel = kernelSize;
		stride = strideLength;
		
	}

	MaxPool()
	{
		kernel = 2;
		stride = 2;
	}
	MaxPool(const void* buffer, size_t size)
	{
		assert(size == 2*sizeof(int));
		const int* d = reinterpret_cast<const int*>(buffer);
		kernel = d[0];
		stride = d[1];
		
	}

	~MaxPool()
	{
		
	}

	int getNbOutputs() const override
	{
		return 1;
	}

	Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
	{
		std::cout << "Index is " << index << std::endl;
		//assert(index == 0 && nbInputDims == 1 && inputs[0].nbDims == 3);
                out_c = inputs[index].d[0];
                out_h = inputs[index].d[1] / stride;
                out_w = inputs[index].d[2] / stride;
		return DimsCHW(out_c, out_h, out_w);
	}

	void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override
	{
	}

	int initialize() override
	{
		
		return 0;
	}

	virtual void terminate() override
	{
		
	}

	virtual size_t getWorkspaceSize(int maxBatchSize) const override
	{
		
		return 0;
	}

	virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override
	{
		
		CHECK(cudaThreadSynchronize());
		//CHECK(cudaMemcpyAsync(outputRois, buffers[outputIndex2], batchSize * nmsMaxOut * 4 * sizeof(float), cudaMemcpyDeviceToHost, stream));
		printf("%s\n", "Forward pass started"); 
		size_t nn = out_c * out_w * out_h;
		
		// forward_maxpool_layer_gpu(nn, (int) nn, input_h, input_w, input_c, stride, kernel, 0, 
		 	                                                       // (float*) inputs[0], (float*) outputs[0]);
		std::cout << "One pass successful" << std::endl;
		return 0;
	}

	virtual size_t getSerializationSize() override
	{
		// 3 integers (number of input channels, number of output channels, bias size), and then the weights:
		//return sizeof(int)*3 + mKernelWeights.count*sizeof(float) + mBiasWeights.count*sizeof(float);
		//std::cout << "Here" << std::endl;
		//return sizeof(mCopySize);
		return sizeof(int)*2;
	}

	virtual void serialize(void* buffer) override
	{
		int* d = reinterpret_cast<int*>(buffer);
		d[0] = kernel;
		d[1] = stride;
	}
private:
	
	
	size_t mCopySize;
	int kernel, stride;
	int out_c, out_w, out_h;
	int input_c, input_w, input_h;
};
class PluginFactory : public nvinfer1::IPluginFactory
{
public:
	// deserialization plugin implementation
	IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override
	{
        assert(!strncmp(layerName, "max_pool", 8));
        if (!mPlugin) mPlugin = std::unique_ptr<MaxPool>(new MaxPool(serialData, serialLength));
        return mPlugin.get();
    }
    void destroyPlugin()
    {
        if (mPlugin) mPlugin.release();
        mPlugin = nullptr;
    }
    bool isPlugin(const char* name)
    {
    return !strcmp(name, "max_pool");
    }
private:
    std::unique_ptr<MaxPool> mPlugin{ nullptr }; 
};

In the sampleMNISTAPI code, i have replaced second maxpool with the following code.

MaxPool maxpool2(2, 2);
ITensor *ptr2 = conv2->getOutput(0);
auto pool2 = network->addPlugin(&ptr2, 1, maxpool2);
pool2->setName("max_pool2");

But when the code comes at the following line, it returns segmentation fault.

(*modelStream) = engine->serialize();

Here’s the cuda-gdb dump

Program received signal SIGSEGV, Segmentation fault.
0x000000040001a06c in ?? ()
(cuda-gdb) run
The program being debugged has been started already.
Start it from the beginning? (y or n) n
Program not restarted.
(cuda-gdb) bt
#0  0x000000040001a06c in ?? ()
#1  0x00007fffee373a81 in nvinfer1::cudnn::PluginLayer::serializeParams(flatbuffers::FlatBufferBuilder&) const ()
   from /home/madan/tensorrt/TensorRT-3.0.0/lib/libnvinfer.so.4
#2  0x00007fffee363afe in nvinfer1::cudnn::Engine::serialize() const ()
   from /home/madan/tensorrt/TensorRT-3.0.0/lib/libnvinfer.so.4
#3  0x0000000000405e3f in APIToModel (maxBatchSize=1, modelStream=0x7fffffffcc68) at sampleMNISTAPI.cpp:341
#4  0x000000000040631e in main (argc=1, argv=0x7fffffffdd38) at sampleMNISTAPI.cpp:386

I have spent quite a bit of my time trying to fix this and not sure what else i need to do to make it work. I am running this code on 1080 Ti on ubuntu 14.04 tensorRT 3.0.0
please help.
Thanks

Hi,

Guess that there is a missing createPlugin for 1st creation.
It’s required to implement two kind of createPlugin functions: first creation and de-serializing.

Check this sample for details:
https://github.com/AastaNV/Face-Recognition/blob/master/pluginImplement.cpp#L14

Thanks.

Hi AastaLL
Thanks for the link. I have updated my implementation of PluginFactory to incorporate both createPlugin functions.

However, i am still getting the segmentation fault while it tries to serialize the engine.

(*modelStream) = engine->serialize();

Since i am trying to write my custom Plugin to be integrated into the tensorRT API model instead of caffe parser, i am following the source code for sampleCharRNN as it essentially does the same with reshape layer.
So, it will be great if you could help me with following doubts in the context of sampleCharRNN implementation:

  1. In sampleCharRNN, the call to serialize engine takes place without instantiation of Pluginfactory object, so does that mean that PluginFactory plays no role in serializing the engine when the rest of the model is written using layer APIs instead of using caffe parser ?

  2. Why is that there are no two kind of createPlugin implementations in sampleCharRNN source code ?

  3. I am still getting segmentation fault (as already mentioned above) . I have copied the main file (sampleMNISTAPI.cpp) below, so that it’s easier for you to trace if i have any bug someplace else.

#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "cuda_runtime_api.h"
#include <cassert>
#include <cmath>
#include <ctime>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <sys/stat.h>
#include <vector>
#include <algorithm>
#include "common.h"
//extern "C" {
#include "maxpool_layer.h"
#include "cuda.h"

// stuff we know about the network and the input/output blobs
static const int INPUT_H = 28;
static const int INPUT_W = 28;
static const int OUTPUT_SIZE = 10;

const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";


using namespace nvinfer1;
using namespace nvcaffeparser1;
static Logger gLogger;

// Our weight files are in a very simple space delimited format.
// [type]  <data x size in hex> 
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::map<std::string, Weights> weightMap;
	std::ifstream input(file);
	assert(input.is_open() && "Unable to load weight file.");
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");
    while(count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t type, size;
        std::string name;
        input >> name >> std::dec >> type >> size;
        wt.type = static_cast<DataType>(type);
        if (wt.type == DataType::kFLOAT)
        {
            uint32_t *val = reinterpret_cast<uint32_t*>(malloc(sizeof(val) * size));
            for (uint32_t x = 0, y = size; x < y; ++x)
            {
                input >> std::hex >> val[x];

            }
            wt.values = val;
        } else if (wt.type == DataType::kHALF)
        {
            uint16_t *val = reinterpret_cast<uint16_t*>(malloc(sizeof(val) * size));
            for (uint32_t x = 0, y = size; x < y; ++x)
            {
                input >> std::hex >> val[x];
            }
            wt.values = val;
        }
        wt.count = size;
        weightMap[name] = wt;
    }
    return weightMap;
}

// We have the data files located in a specific directory. This 
// searches for that directory format from the current directory.
std::string locateFile(const std::string& input)
{
    std::vector<std::string> dirs{"data/samples/mnist/", "data/mnist/"};
    return locateFile(input, dirs);
}

// simple PGM (portable greyscale map) reader
void readPGMFile(const std::string& filename,  uint8_t buffer[INPUT_H*INPUT_W])
{
    readPGMFile(locateFile(filename), buffer, INPUT_H, INPUT_W);
}


class MaxPool: public nvinfer1::IPlugin
{
public:
	MaxPool(int kernelSize, int strideLength)
	{
		kernel = kernelSize;
		stride = strideLength;
	}
    MaxPool()
    {
    	kernel = 2;
    	stride = 2;
    }
	// create the plugin at runtime from a byte stream
	MaxPool(const void* buffer, size_t size)
    {
		// assert(size == 2*sizeof(int));
		// const int* d = reinterpret_cast<const int*>(buffer);
		// kernel = d[0];
		// stride = d[1];
		kernel = 2;
		stride = 2;
		
	}
	~MaxPool()
	{
		//cudaFree(const_cast<void*>(mKernelWeights.values));
		//cudaFree(const_cast<void*>(mBiasWeights.values));
	};

	int getNbOutputs() const override { return 1; }
	Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) 
	{
		std::cout << "Index is " << index << std::endl;
		assert(index == 0 && nbInputDims == 1 && inputs[0].nbDims == 3);
	    int out_c = inputs[index].d[0];
	    int out_h = inputs[index].d[1] / stride;
	    int out_w = inputs[index].d[2] / stride;
	    size = out_c * out_h * out_w ;
	    std::cout << "Size is " << size << std::endl;
		return DimsNCHW(1, out_c, out_h, out_w);
	}
	
	inline void terminate() override {;};
	
	inline size_t getWorkspaceSize(int) const override { return 0; };
	
	size_t getSerializationSize() 
	{
		std::cout << "Serialization called" << std::endl;
		return 0; //sizeof(int)*2;
	}


	void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override
	{
	}

	int initialize() override
	{
		//CHECK(cudnnCreate(&mCudnn));							// initialize cudnn and cublas
		//CHECK(cublasCreate(&mCublas));
		//CHECK(cudnnCreateTensorDescriptor(&mSrcDescriptor));	// create cudnn tensor descriptors we need for bias addition
		//CHECK(cudnnCreateTensorDescriptor(&mDstDescriptor));

		return 0;
	}
	
	int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
    {
	
	    CHECK(cudaThreadSynchronize());
	    //CHECK(cudaMemcpyAsync(outputRois, buffers[outputIndex2], batchSize * nmsMaxOut * 4 * sizeof(float), cudaMemcpyDeviceToHost, stream));
	    printf("%s\n", "Forward pass started");
	    CHECK(cudaMemcpyAsync(static_cast<float*>(outputs[0]),
                       static_cast<const float*>(inputs[0]),
                       sizeof(float) * size * batchSize, cudaMemcpyDefault, stream)); 
	    //size_t nn = out_c * out_w * out_h;
	
	    // forward_maxpool_layer_gpu(nn, (int) nn, input_h, input_w, input_c, stride, kernel, 0, 
	 	                                                       // (float*) inputs[0], (float*) outputs[0]);
	    std::cout << "One pass successful" << std::endl;
	    return 0;
    }

	void serialize(void* buffer) 
	{
		std::cout << "Running" << std::endl;
		// int* d = static_cast<int*>(buffer);
		// d[0] = kernel;
		// d[1] = stride;
	}	
private:
	//size_t mCopySize;
	int kernel, stride, size;
	//int out_c, out_w, out_h;
	//int input_c, input_w, input_h;
};



class PluginFactory : public nvinfer1::IPluginFactory
{
public:
	// deserialization plugin implementation
	IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) 
	{
        assert(isPlugin(layerName));
        if(!strncmp(layerName, "maxpool", 7))
        {
            assert(mPlugin.get() == nullptr);
            if (!mPlugin) mPlugin = std::unique_ptr<MaxPool>(new MaxPool(serialData, serialLength));
            return mPlugin.get();
        }
        

    }
    IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights)
    {
        assert(isPlugin(layerName));
        if (!strncmp(layerName, "maxpool", 7))
        {
            assert(mPlugin.get() == nullptr);
            mPlugin = std::unique_ptr<MaxPool>(new MaxPool());
            return mPlugin.get();
        }
    }
    void destroyPlugin()
    {
        mPlugin.release();
        mPlugin = nullptr;
        
    }
    bool isPlugin(const char* name)
    {
    return (!strcmp(name, "maxpool"));
    }
private:
    std::unique_ptr<MaxPool> mPlugin{ nullptr };
    
}; // PluginFactory


// Creat the Engine using only the API and not any parser.
ICudaEngine *
createMNISTEngine(unsigned int maxBatchSize, IBuilder *builder, DataType dt)
{
	INetworkDefinition* network = builder->createNetwork();

	//  Create input of shape { 1, 1, 28, 28 } with name referenced by INPUT_BLOB_NAME
	auto data = network->addInput(INPUT_BLOB_NAME, dt, DimsCHW{ 1, INPUT_H, INPUT_W});
	assert(data != nullptr);

	// Create a scale layer with default power/shift and specified scale parameter.
	float scale_param = 0.0125f;
	Weights power{DataType::kFLOAT, nullptr, 0};
	Weights shift{DataType::kFLOAT, nullptr, 0};
	Weights scale{DataType::kFLOAT, &scale_param, 1};
	auto scale_1 = network->addScale(*data,	ScaleMode::kUNIFORM, shift, scale, power);
	assert(scale_1 != nullptr);

	// Add a convolution layer with 20 outputs and a 5x5 filter.
    std::map<std::string, Weights> weightMap = loadWeights(locateFile("mnistapi.wts"));
	auto conv1 = network->addConvolution(*scale_1->getOutput(0), 20, DimsHW{5, 5}, weightMap["conv1filter"], weightMap["conv1bias"]);
	assert(conv1 != nullptr);
	conv1->setStride(DimsHW{1, 1});


	// Add a max pooling layer with stride of 2x2 and kernel size of 2x2.
	auto pool1 = network->addPooling(*conv1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
	assert(pool1 != nullptr);
	pool1->setStride(DimsHW{2, 2});
	d::endl;
	// Add a second convolution layer with 50 outputs and a 5x5 filter.
	auto conv2 = network->addConvolution(*pool1->getOutput(0), 50, DimsHW{5, 5}, weightMap["conv2filter"], weightMap["conv2bias"]);
	assert(conv2 != nullptr);
	conv2->setStride(DimsHW{1, 1});
    
    // My maxpool plugin goes here
	MaxPool maxpool(2, 2);
    ITensor *ptr2 = conv2->getOutput(0);
	auto pool2 = network->addPlugin(&ptr2, 1, maxpool);
    pool2->setName("maxpool");

 
	auto ip1 = network->addFullyConnected(*pool2->getOutput(0), 500, weightMap["ip1filter"], weightMap["ip1bias"]);
	assert(ip1 != nullptr);
	// ITensor *fc1 = ip1->getOutput(0);
	// std::cout << "fully connected 1: " << fc1->getDimensions().d[0] << ", " << fc1->getDimensions().d[1] << ", " << fc1->getDimensions().d[2]<< std::endl;

	// Add an activation layer using the ReLU algorithm.
	auto relu1 = network->addActivation(*ip1->getOutput(0), ActivationType::kRELU);
	assert(relu1 != nullptr);

	// Add a second fully connected layer with 20 outputs.
	auto ip2 = network->addFullyConnected(*relu1->getOutput(0), OUTPUT_SIZE, weightMap["ip2filter"], weightMap["ip2bias"]);
	assert(ip2 != nullptr);
	// ITensor *fc2 = ip2->getOutput(0);
	// std::cout << "fully connected 2: " << "Total dims: " << fc2->getDimensions().nbDims << "[ " << fc2->getDimensions().d[0] << ", " << fc2->getDimensions().d[1] << ", " << fc2->getDimensions().d[2]<< std::endl;

	// Add a softmax layer to determine the probability.
	auto prob = network->addSoftMax(*ip2->getOutput(0));
	assert(prob != nullptr);
	prob->getOutput(0)->setName(OUTPUT_BLOB_NAME);
	network->markOutput(*prob->getOutput(0));

	// Build the engine
	builder->setMaxBatchSize(maxBatchSize);
	builder->setMaxWorkspaceSize(1 << 25);

	auto engine = builder->buildCudaEngine(*network);
	// we don't need the network any more
	network->destroy();

	// Once we have built the cuda engine, we can release all of our held memory.
	for (auto &mem : weightMap)
    {
        free((void*)(mem.second.values));
    }
	return engine;
}

void APIToModel(unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with)
		     IHostMemory **modelStream)
{
	// create the builder
	IBuilder* builder = createInferBuilder(gLogger);

	// create the model to populate the network, then set the outputs and create an engine
	ICudaEngine* engine = createMNISTEngine(maxBatchSize, builder, DataType::kFLOAT);
	std::cout << "Created engine successfully" << std::endl;

	assert(engine != nullptr);

	// serialize the engine, then close everything down
	(*modelStream) = engine->serialize();
	std::cout << "Serialized successfully " << std::endl;
	engine->destroy();
	builder->destroy();
	//return engine;
}

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
	const ICudaEngine& engine = context.getEngine();
	// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
	// of these, but in this case we know that there is exactly one input and one output.
	assert(engine.getNbBindings() == 2);
	void* buffers[2];

	// In order to bind the buffers, we need to know the names of the input and output tensors.
	// note that indices are guaranteed to be less than IEngine::getNbBindings()
	int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME), 
	    outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

	// create GPU buffers and a stream
	CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * sizeof(float)));
	CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
	cudaStream_t stream;
	CHECK(cudaStreamCreate(&stream));

	// DMA the input to the GPU,  execute the batch asynchronously, and DMA it back:
	CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
	std::cout << "About to run enqueue function " << std::endl;
	context.enqueue(batchSize, buffers, stream, nullptr);
	//context.execute(batchSize, buffers);
	std::cout << "enqueue function executed" << std::endl;
	CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream));
	cudaStreamSynchronize(stream);

	// release the stream and the buffers
	cudaStreamDestroy(stream);
	CHECK(cudaFree(buffers[inputIndex]));
	CHECK(cudaFree(buffers[outputIndex]));
}

int main(int argc, char** argv)
{
	// create a model using the API directly and serialize it to a stream
    IHostMemory *modelStream{nullptr};
    APIToModel(1, &modelStream);

    PluginFactory pluginFactory;
    std::cout << "Engine built successfully " << std::endl;
	// read a random digit file
	srand(unsigned(time(nullptr)));
	uint8_t fileData[INPUT_H*INPUT_W];
    int num = rand() % 10;
	readPGMFile(std::to_string(num) + ".pgm", fileData);

	// print an ascii representation
	std::cout << "\n\n\n---------------------------" << "\n\n\n" << std::endl;
	for (int i = 0; i < INPUT_H*INPUT_W; i++)
		std::cout << (" .:-=+*#%@"[fileData[i] / 26]) << (((i + 1) % INPUT_W) ? "" : "\n");

	// parse the mean file produced by caffe and subtract it from the image
	nvcaffeparser1::ICaffeParser* parser = createCaffeParser();
	IBinaryProtoBlob* meanBlob = parser->parseBinaryProto(locateFile("mnist_mean.binaryproto").c_str());
	parser->destroy();
	const float *meanData = reinterpret_cast<const float*>(meanBlob->getData());

	float data[INPUT_H*INPUT_W];
	for (int i = 0; i < INPUT_H*INPUT_W; i++)
		data[i] = float(fileData[i])-meanData[i];

	meanBlob->destroy();
    
	IRuntime* runtime = createInferRuntime(gLogger);
	ICudaEngine* engine = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size(), &pluginFactory);
    assert(engine != nullptr);
    if (modelStream) modelStream->destroy();

	IExecutionContext *context = engine->createExecutionContext();

	std::cout << "Running Inference " << std::endl;
	float prob[OUTPUT_SIZE];
	doInference(*context, data, prob, 1);

	// destroy the engine
	context->destroy();
	engine->destroy();
	runtime->destroy();

	// print a histogram of the output distribution
	std::cout << "\n\n";
    float val{0.0f};
    int idx{0};
	for (unsigned int i = 0; i < 10; i++)
    {
        val = std::max(val, prob[i]);
        if (val == prob[i]) idx = i;
		std::cout << i << ": " << std::string(int(std::floor(prob[i] * 10 + 0.5f)), '*') << "\n";
    }
	std::cout << std::endl;

	return (idx == num && val > 0.9f) ? EXIT_SUCCESS : EXIT_FAILURE;
}

Thanks again.

Hi,

Guess that you need to modify this line

268. MaxPool maxpool(2, 2);

to here:

212. mPlugin = std::unique_ptr<MaxPool>(new MaxPool());

Thanks.