Hi AastaLL
Thanks for the link. I have updated my implementation of PluginFactory to incorporate both createPlugin functions.
However, I am still getting a segmentation fault when it tries to serialize the engine:
(*modelStream) = engine->serialize();
Since I am trying to integrate my custom plugin into a model built with the TensorRT layer API rather than the caffe parser, I am following the source code of sampleCharRNN, which essentially does the same thing with its reshape layer.
So it would be great if you could help me with the following doubts, in the context of the sampleCharRNN implementation:
- In sampleCharRNN, the call to serialize the engine takes place without a PluginFactory object ever being instantiated. Does that mean PluginFactory plays no role in serializing the engine when the rest of the model is built with the layer APIs instead of the caffe parser? (See the first sketch after this list.)
- Why are there not two kinds of createPlugin implementations in the sampleCharRNN source code? (See the second sketch below.)
- I am still getting the segmentation fault (as mentioned above). I have copied my main file (sampleMNISTAPI.cpp) below, so that it is easier for you to trace whether I have a bug somewhere else.
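To make the first two questions concrete, here is roughly what I am comparing (paraphrased from the samples, not verbatim; the class name below is just for illustration):

// 1) Serialization never sees a factory, but deserialization takes one:
IHostMemory* modelStream = engine->serialize();
ICudaEngine* engine2 = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size(), &pluginFactory);

// 2) My (possibly wrong) understanding is that the weights-based createPlugin
// belongs to the caffe-parser factory interface, which sampleCharRNN never
// inherits from, i.e. something like:
class CaffeStylePluginFactory : public nvinfer1::IPluginFactory, public nvcaffeparser1::IPluginFactory
{
public:
    // build time, called by the caffe parser
    IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) override;
    // deserialization time, called by the runtime
    IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override;
    // required by the caffe-parser interface
    bool isPlugin(const char* layerName) override;
};

Please correct me if that reading is wrong. Here is the full file: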
#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "cuda_runtime_api.h"
#include <cassert>
#include <cmath>
#include <ctime>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <sys/stat.h>
#include <vector>
#include <algorithm>
#include "common.h"
#include "maxpool_layer.h"
#include "cuda.h"
// stuff we know about the network and the input/output blobs
static const int INPUT_H = 28;
static const int INPUT_W = 28;
static const int OUTPUT_SIZE = 10;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
using namespace nvinfer1;
using namespace nvcaffeparser1;
static Logger gLogger;
// Our weight files are in a very simple space delimited format.
// [name] [type] [count] <count values in hex>
std::map<std::string, Weights> loadWeights(const std::string file)
{
std::map<std::string, Weights> weightMap;
std::ifstream input(file);
assert(input.is_open() && "Unable to load weight file.");
int32_t count;
input >> count;
assert(count > 0 && "Invalid weight map file.");
while(count--)
{
Weights wt{DataType::kFLOAT, nullptr, 0};
uint32_t type, size;
std::string name;
input >> name >> std::dec >> type >> size;
wt.type = static_cast<DataType>(type);
if (wt.type == DataType::kFLOAT)
{
uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size)); // sizeof(val) would be the pointer size, not the element size
for (uint32_t x = 0, y = size; x < y; ++x)
{
input >> std::hex >> val[x];
}
wt.values = val;
} else if (wt.type == DataType::kHALF)
{
uint16_t* val = reinterpret_cast<uint16_t*>(malloc(sizeof(uint16_t) * size)); // same fix as above
for (uint32_t x = 0, y = size; x < y; ++x)
{
input >> std::hex >> val[x];
}
wt.values = val;
}
wt.count = size;
weightMap[name] = wt;
}
return weightMap;
}
// We have the data files located in a specific directory. This
// searches for that directory format from the current directory.
std::string locateFile(const std::string& input)
{
std::vector<std::string> dirs{"data/samples/mnist/", "data/mnist/"};
return locateFile(input, dirs);
}
// simple PGM (portable greyscale map) reader
void readPGMFile(const std::string& filename, uint8_t buffer[INPUT_H*INPUT_W])
{
readPGMFile(locateFile(filename), buffer, INPUT_H, INPUT_W);
}
class MaxPool: public nvinfer1::IPlugin
{
public:
MaxPool(int kernelSize, int strideLength)
{
kernel = kernelSize;
stride = strideLength;
}
MaxPool()
{
kernel = 2;
stride = 2;
}
// create the plugin at runtime from a byte stream
MaxPool(const void* buffer, size_t length)
{
assert(length == 3 * sizeof(int));
const int* d = reinterpret_cast<const int*>(buffer);
kernel = d[0];
stride = d[1];
size = d[2];
}
~MaxPool()
{
//cudaFree(const_cast<void*>(mKernelWeights.values));
//cudaFree(const_cast<void*>(mBiasWeights.values));
};
int getNbOutputs() const override { return 1; }
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
{
std::cout << "Index is " << index << std::endl;
assert(index == 0 && nbInputDims == 1 && inputs[0].nbDims == 3);
int out_c = inputs[index].d[0];
int out_h = inputs[index].d[1] / stride;
int out_w = inputs[index].d[2] / stride;
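// cache the flattened output volume; enqueue sizes its copy with it, and it is serialized with the plugin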
size = out_c * out_h * out_w;
std::cout << "Size is " << size << std::endl;
// plugin output dimensions exclude the batch dimension, so return CHW, not NCHW
return DimsCHW(out_c, out_h, out_w);
}
void terminate() override {}
size_t getWorkspaceSize(int) const override { return 0; }
size_t getSerializationSize() override
{
std::cout << "Serialization called" << std::endl;
return 3 * sizeof(int); // kernel, stride, size
}
void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override
{
}
int initialize() override
{
//CHECK(cudnnCreate(&mCudnn)); // initialize cudnn and cublas
//CHECK(cublasCreate(&mCublas));
//CHECK(cudnnCreateTensorDescriptor(&mSrcDescriptor)); // create cudnn tensor descriptors we need for bias addition
//CHECK(cudnnCreateTensorDescriptor(&mDstDescriptor));
return 0;
}
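// NOTE: enqueue is currently just an identity copy from input to output while
// debugging; the real forward_maxpool_layer_gpu call is still commented out below.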
int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override
{
CHECK(cudaDeviceSynchronize()); // cudaThreadSynchronize is deprecated
printf("%s\n", "Forward pass started");
CHECK(cudaMemcpyAsync(static_cast<float*>(outputs[0]),
static_cast<const float*>(inputs[0]),
sizeof(float) * size * batchSize, cudaMemcpyDefault, stream));
//size_t nn = out_c * out_w * out_h;
// forward_maxpool_layer_gpu(nn, (int) nn, input_h, input_w, input_c, stride, kernel, 0,
// (float*) inputs[0], (float*) outputs[0]);
std::cout << "One pass successful" << std::endl;
return 0;
}
void serialize(void* buffer) override
{
std::cout << "Running" << std::endl;
int* d = static_cast<int*>(buffer);
d[0] = kernel;
d[1] = stride;
d[2] = size;
}
private:
//size_t mCopySize;
int kernel, stride, size;
//int out_c, out_w, out_h;
//int input_c, input_w, input_h;
};
class PluginFactory : public nvinfer1::IPluginFactory
{
public:
// deserialization plugin implementation, called by the runtime
IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override
{
assert(isPlugin(layerName));
if (!strncmp(layerName, "maxpool", 7))
{
assert(mPlugin.get() == nullptr);
mPlugin = std::unique_ptr<MaxPool>(new MaxPool(serialData, serialLength));
return mPlugin.get();
}
return nullptr; // falling off the end without a return is undefined behaviour
}
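// build-time createPlugin (caffe-parser style, weights-based); not exercised
// when the network is built through the layer API, but kept for completeness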
IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights)
{
assert(isPlugin(layerName));
if (!strncmp(layerName, "maxpool", 7))
{
assert(mPlugin.get() == nullptr);
mPlugin = std::unique_ptr<MaxPool>(new MaxPool());
return mPlugin.get();
}
return nullptr;
}
void destroyPlugin()
{
mPlugin.reset(); // release() would leak the plugin; reset() actually destroys it
}
bool isPlugin(const char* name)
{
return (!strcmp(name, "maxpool"));
}
private:
std::unique_ptr<MaxPool> mPlugin{ nullptr };
}; // PluginFactory
// Create the engine using only the API and not any parser.
ICudaEngine *
createMNISTEngine(unsigned int maxBatchSize, IBuilder *builder, DataType dt)
{
INetworkDefinition* network = builder->createNetwork();
// Create input of shape { 1, 1, 28, 28 } with name referenced by INPUT_BLOB_NAME
auto data = network->addInput(INPUT_BLOB_NAME, dt, DimsCHW{ 1, INPUT_H, INPUT_W});
assert(data != nullptr);
// Create a scale layer with default power/shift and specified scale parameter.
float scale_param = 0.0125f;
Weights power{DataType::kFLOAT, nullptr, 0};
Weights shift{DataType::kFLOAT, nullptr, 0};
Weights scale{DataType::kFLOAT, &scale_param, 1};
auto scale_1 = network->addScale(*data, ScaleMode::kUNIFORM, shift, scale, power);
assert(scale_1 != nullptr);
// Add a convolution layer with 20 outputs and a 5x5 filter.
std::map<std::string, Weights> weightMap = loadWeights(locateFile("mnistapi.wts"));
auto conv1 = network->addConvolution(*scale_1->getOutput(0), 20, DimsHW{5, 5}, weightMap["conv1filter"], weightMap["conv1bias"]);
assert(conv1 != nullptr);
conv1->setStride(DimsHW{1, 1});
// Add a max pooling layer with stride of 2x2 and kernel size of 2x2.
auto pool1 = network->addPooling(*conv1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
assert(pool1 != nullptr);
pool1->setStride(DimsHW{2, 2});
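// (pool1 uses the built-in pooling layer; the custom MaxPool plugin below replaces the second pooling layer)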
// Add a second convolution layer with 50 outputs and a 5x5 filter.
auto conv2 = network->addConvolution(*pool1->getOutput(0), 50, DimsHW{5, 5}, weightMap["conv2filter"], weightMap["conv2bias"]);
assert(conv2 != nullptr);
conv2->setStride(DimsHW{1, 1});
// My maxpool plugin goes here
MaxPool maxpool(2, 2);
ITensor* ptr2 = conv2->getOutput(0);
auto pool2 = network->addPlugin(&ptr2, 1, maxpool);
assert(pool2 != nullptr);
pool2->setName("maxpool");
auto ip1 = network->addFullyConnected(*pool2->getOutput(0), 500, weightMap["ip1filter"], weightMap["ip1bias"]);
assert(ip1 != nullptr);
// ITensor *fc1 = ip1->getOutput(0);
// std::cout << "fully connected 1: " << fc1->getDimensions().d[0] << ", " << fc1->getDimensions().d[1] << ", " << fc1->getDimensions().d[2]<< std::endl;
// Add an activation layer using the ReLU algorithm.
auto relu1 = network->addActivation(*ip1->getOutput(0), ActivationType::kRELU);
assert(relu1 != nullptr);
// Add a second fully connected layer with 20 outputs.
auto ip2 = network->addFullyConnected(*relu1->getOutput(0), OUTPUT_SIZE, weightMap["ip2filter"], weightMap["ip2bias"]);
assert(ip2 != nullptr);
// ITensor *fc2 = ip2->getOutput(0);
// std::cout << "fully connected 2: " << "Total dims: " << fc2->getDimensions().nbDims << "[ " << fc2->getDimensions().d[0] << ", " << fc2->getDimensions().d[1] << ", " << fc2->getDimensions().d[2]<< std::endl;
// Add a softmax layer to determine the probability.
auto prob = network->addSoftMax(*ip2->getOutput(0));
assert(prob != nullptr);
prob->getOutput(0)->setName(OUTPUT_BLOB_NAME);
network->markOutput(*prob->getOutput(0));
// Build the engine
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(1 << 25);
auto engine = builder->buildCudaEngine(*network);
// we don't need the network any more
network->destroy();
// Once we have built the cuda engine, we can release all of our held memory.
for (auto &mem : weightMap)
{
free((void*)(mem.second.values));
}
return engine;
}
void APIToModel(unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
IHostMemory** modelStream)
{
// create the builder
IBuilder* builder = createInferBuilder(gLogger);
// create the model to populate the network, then set the outputs and create an engine
ICudaEngine* engine = createMNISTEngine(maxBatchSize, builder, DataType::kFLOAT);
std::cout << "Created engine successfully" << std::endl;
assert(engine != nullptr);
// serialize the engine, then close everything down
(*modelStream) = engine->serialize();
std::cout << "Serialized successfully " << std::endl;
engine->destroy();
builder->destroy();
//return engine;
}
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine.getNbBindings() == 2);
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
std::cout << "About to run enqueue function " << std::endl;
context.enqueue(batchSize, buffers, stream, nullptr);
//context.execute(batchSize, buffers);
std::cout << "enqueue function executed" << std::endl;
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
}
int main(int argc, char** argv)
{
// create a model using the API directly and serialize it to a stream
IHostMemory *modelStream{nullptr};
APIToModel(1, &modelStream);
PluginFactory pluginFactory;
std::cout << "Engine built successfully " << std::endl;
// read a random digit file
srand(unsigned(time(nullptr)));
uint8_t fileData[INPUT_H*INPUT_W];
int num = rand() % 10;
readPGMFile(std::to_string(num) + ".pgm", fileData);
// print an ascii representation
std::cout << "\n\n\n---------------------------" << "\n\n\n" << std::endl;
for (int i = 0; i < INPUT_H*INPUT_W; i++)
std::cout << (" .:-=+*#%@"[fileData[i] / 26]) << (((i + 1) % INPUT_W) ? "" : "\n");
// parse the mean file produced by caffe and subtract it from the image
nvcaffeparser1::ICaffeParser* parser = createCaffeParser();
IBinaryProtoBlob* meanBlob = parser->parseBinaryProto(locateFile("mnist_mean.binaryproto").c_str());
parser->destroy();
const float *meanData = reinterpret_cast<const float*>(meanBlob->getData());
float data[INPUT_H*INPUT_W];
for (int i = 0; i < INPUT_H*INPUT_W; i++)
data[i] = float(fileData[i])-meanData[i];
meanBlob->destroy();
IRuntime* runtime = createInferRuntime(gLogger);
ICudaEngine* engine = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size(), &pluginFactory);
assert(engine != nullptr);
if (modelStream) modelStream->destroy();
IExecutionContext *context = engine->createExecutionContext();
std::cout << "Running Inference " << std::endl;
float prob[OUTPUT_SIZE];
doInference(*context, data, prob, 1);
// destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
// print a histogram of the output distribution
std::cout << "\n\n";
float val{0.0f};
int idx{0};
for (unsigned int i = 0; i < 10; i++)
{
val = std::max(val, prob[i]);
if (val == prob[i]) idx = i;
std::cout << i << ": " << std::string(int(std::floor(prob[i] * 10 + 0.5f)), '*') << "\n";
}
std::cout << std::endl;
return (idx == num && val > 0.9f) ? EXIT_SUCCESS : EXIT_FAILURE;
}
Thanks again.