#include <algorithm>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <cuda_runtime_api.h>

#include "common.h"
#include "NvInfer.h"
#include "NvUffParser.h"
#include "NvUtils.h"

using namespace nvuffparser;
using namespace nvinfer1;

static const int INPUT_H = 256;
static const int INPUT_W = 512;
static const int INPUT_C = 3;
static const int OUTPUT_SIZE = INPUT_H * INPUT_W * INPUT_C;
static const int BATCH_SIZE = 6;

static Logger gLogger;

#define MAX_WORKSPACE (1 << 30)

inline int64_t volume(const Dims& d)
{
    int64_t v = 1;
    for (int64_t i = 0; i < d.nbDims; i++)
        v *= d.d[i];
    return v;
}

inline unsigned int elementSize(nvinfer1::DataType t)
{
    switch (t)
    {
    case nvinfer1::DataType::kINT32: return 4;
    case nvinfer1::DataType::kFLOAT: return 4;
    case nvinfer1::DataType::kHALF: return 2;
    case nvinfer1::DataType::kINT8: return 1;
    }
    assert(0);
    return 0;
}

/* Locate files in the listed directories */
std::string locateFile(const std::string& input)
{
    std::vector<std::string> dirs{"/home/code/TensorRT/data/", "/home/code/TensorRT/tf1.8/"};
    return locateFile(input, dirs);
}

void uffToTRTModel(const char* uffFile, int maxBatchSize, IUffParser* parser, IHostMemory*& trtModelStream)
{
    IBuilder* builder = createInferBuilder(gLogger);
    INetworkDefinition* network = builder->createNetwork();

#if 1
    if (!parser->parse(uffFile, *network, nvinfer1::DataType::kFLOAT))
    {
        std::string msg("failed to parse uff file fp32");
        gLogger.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
        exit(EXIT_FAILURE);
    }
#else
    if (!parser->parse(uffFile, *network, nvinfer1::DataType::kHALF))
    {
        std::string msg("failed to parse uff file fp16");
        gLogger.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
        exit(EXIT_FAILURE);
    }
    builder->setFp16Mode(true);
#endif

    /* Build the engine */
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(MAX_WORKSPACE);

    ICudaEngine* engine = builder->buildCudaEngine(*network);
    assert(engine);

    /* The network is no longer needed once the engine is built; the parser is destroyed in main() */
    network->destroy();

    /* Serialize the engine, then close everything down */
    trtModelStream = engine->serialize();
    engine->destroy();
    builder->destroy();
}

void doInference(IExecutionContext& context, float* input, float output[BATCH_SIZE][OUTPUT_SIZE], int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Input and output buffer pointers that we pass to the engine - the engine requires exactly
    // IEngine::getNbBindings() of these, but in this case we know that there is exactly one input
    // and one output.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
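    // ICudaEngine::getBindingIndex() could be used with the registered tensor names
    // ("input_bytes" and "Softmax"); here we instead walk the two bindings and use
    // bindingIsInput() to tell them apart.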
    // Note that indices are guaranteed to be less than IEngine::getNbBindings().
    int inputIndex = 0, outputIndex = 0;
    for (int b = 0; b < engine.getNbBindings(); ++b)
    {
        if (engine.bindingIsInput(b))
            inputIndex = b;
        else
            outputIndex = b;
    }

    // Create GPU buffers and a stream.
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA the input to the GPU, execute the batch asynchronously, and DMA the result back.
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(float),
                          cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float),
                          cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // Release the stream and the buffers.
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

int main(int argc, char** argv)
{
    // Create a TensorRT engine from the UFF model and serialize it to a stream.
    auto fileName = locateFile("20180906_network.uff");
    std::cout << "Located uff file: " << fileName << std::endl;

    auto parser = createUffParser();

    /* Register the TensorFlow input and output tensors */
    parser->registerInput("input_bytes", DimsCHW(3, 256, 512), UffInputOrder::kNCHW);
    parser->registerOutput("Softmax");

    IHostMemory* trtModelStream{nullptr};
    uffToTRTModel(fileName.c_str(), BATCH_SIZE, parser, trtModelStream);
    assert(trtModelStream != nullptr);

    // The whole batch is copied to the GPU in a single cudaMemcpyAsync, so it must be contiguous
    // in host memory: one flat buffer holding BATCH_SIZE samples, filled here with random values.
    float* data = new float[BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W];
    for (int i = 0; i < BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W; i++)
        data[i] = static_cast<float>(rand()) / RAND_MAX;

    // Deserialize the engine.
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
    assert(engine != nullptr);
    trtModelStream->destroy();

    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);

    // Run inference. The output buffer (BATCH_SIZE * OUTPUT_SIZE floats, roughly 9 MB) is
    // allocated on the heap rather than on the stack.
    float (*output)[OUTPUT_SIZE] = new float[BATCH_SIZE][OUTPUT_SIZE];
    auto t_start = std::chrono::high_resolution_clock::now();
    doInference(*context, data, output, BATCH_SIZE);
    auto t_end = std::chrono::high_resolution_clock::now();
    float ms = std::chrono::duration<float, std::milli>(t_end - t_start).count();
    std::cout << "run time for 1 inference: " << ms << " ms on batch size: " << BATCH_SIZE << std::endl;

    delete[] data;
    delete[] output;

    // Destroy the parser, the engine and the runtime.
    parser->destroy();
    context->destroy();
    engine->destroy();
    runtime->destroy();
    shutdownProtobufLibrary();

    return EXIT_SUCCESS;
}