#include "cuda_runtime_api.h" #include "parserOnnxConfig.h" #include "NvInfer.h" #include "NvInferPlugin.h" #include "parserOnnxConfig.h" #include "common.h" #include "logger.h" #include "Utils.h" std::vector mPredictionContext; std::vector mPredictionEngine; std::vector mTrtRunTime_prediction; std::vector mTrtCudaStream_onnx; std::vector> mTrtCudaBuffer_prediction; std::vector> mTrtBindBufferSize_prediction; int _size = 416; inline void* safeCudaMalloc(size_t memSize) { void* deviceMem; //CUDA_CHECK(cudaMalloc(&deviceMem, memSize)); cudaMalloc(&deviceMem, memSize); if (deviceMem == nullptr) { std::cerr << "Out of memory" << std::endl; exit(1); } return deviceMem; } inline int64_t volume(const nvinfer1::Dims& d) { return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); } inline unsigned int getElementSize(nvinfer1::DataType t) { switch (t) { case nvinfer1::DataType::kINT32: return 4; case nvinfer1::DataType::kFLOAT: return 4; case nvinfer1::DataType::kHALF: return 2; case nvinfer1::DataType::kINT8: return 1; } throw std::runtime_error("Invalid DataType."); return 0; } void IntializeEngines_predict(int streamIndex) { std::cout << "intializing prediction engine" << std::endl; mPredictionContext[streamIndex] = mPredictionEngine[streamIndex]->createExecutionContext(); int nbBindings_prediction = mPredictionEngine[streamIndex]->getNbBindings(); mTrtCudaBuffer_prediction[streamIndex].resize(nbBindings_prediction); mTrtBindBufferSize_prediction[streamIndex].resize(nbBindings_prediction); for (int i = 0; i < nbBindings_prediction; ++i) { Dims dims = mPredictionEngine[streamIndex]->getBindingDimensions(i); dims = mPredictionEngine[streamIndex]->getBindingDimensions(i); DataType dtype = mPredictionEngine[streamIndex]->getBindingDataType(i); int64_t totalSize = getElementSize(dtype); for (int DimIndex = 0; DimIndex < dims.nbDims; DimIndex++) totalSize = totalSize * dims.d[DimIndex]; mTrtBindBufferSize_prediction[streamIndex][i] = totalSize; if (totalSize > 0) mTrtCudaBuffer_prediction[streamIndex][i] = safeCudaMalloc(totalSize); } } void buildPredictionEngine(nvinfer1::IBuilder *builder, string modelName, string precision, int maxWorkSpace, int numberOfStreams) { std::cout << "build Prediction Engine" << std::endl; std::string onnxModel = modelName + ".onnx"; // Create a network using the parser. const auto explicitBatch = 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); INetworkDefinition *network = builder->createNetworkV2(explicitBatch); auto parser = nvonnxparser::createParser(*network, sample::gLogger.getTRTLogger()); parser->parseFromFile(onnxModel.c_str(), static_cast(nvinfer1::ILogger::Severity::kWARNING)); // Get information about the inputs/outputs directly from the model. nvinfer1::Dims mPredictionInputDims = network->getInput(0)->getDimensions(); // Create a builder config IBuilderConfig *config = builder->createBuilderConfig(); config->setMaxWorkspaceSize(16_MiB); if (precision == "fp16") { if (builder->platformHasFastFp16()) { std::cout << "Notice: the platform supports fp16. 
Building fp16 engine" << std::endl; builder->setFp16Mode(true); config->setFlag(BuilderFlag::kFP16); } else std::cout << "Notice: the platform do not support fp16" << std::endl; } std::unique_ptr calibrator; for (int i = 0; i < parser->getNbErrors(); ++i) std::cout << parser->getError(i)->desc() << std::endl; // build first prediction engine { mPredictionEngine[0] = builder->buildEngineWithConfig(*network, *config); std::string outputEngine = modelName + precision + "batch" + to_string(mPredictionInputDims.d[0]) + ".trt"; IHostMemory *serializedModel = mPredictionEngine[0]->serialize(); if (serializedModel) { std::ofstream engineOutput; engineOutput.open(outputEngine, std::ios::binary | std::ios::out); engineOutput.write(static_cast(serializedModel->data()), serializedModel->size()); engineOutput.close(); serializedModel->destroy(); std::cout << "Serialized model" << std::endl; } IntializeEngines_predict(0); } // Define the other prediction engines { for (int streamIndex = 1; streamIndex < numberOfStreams; streamIndex++) { mTrtRunTime_prediction[streamIndex] = createInferRuntime(sample::gLogger); //mPredictionEngine[streamIndex] = mTrtRunTime_prediction[streamIndex]->deserializeCudaEngine(data_predict.get(), length_predict); mPredictionEngine[streamIndex] = mPredictionEngine[0]; IntializeEngines_predict(streamIndex); } } } void loadExistingEngine_predict(std::string _engineFile_predict, int numberOfStreams) { fstream file_predict; file_predict.open(_engineFile_predict, ios::binary | ios::in); if (!file_predict.is_open()) { cout << "read engine file" << _engineFile_predict << " failed" << endl; return; } file_predict.seekg(0, ios::end); long int length_predict = file_predict.tellg(); file_predict.seekg(0, ios::beg); std::unique_ptr data_predict(new char[length_predict]); file_predict.read(data_predict.get(), length_predict); file_predict.close(); std::cout << "deserializing" << std::endl; for (int streamIndex = 0; streamIndex < numberOfStreams; streamIndex++) { mTrtRunTime_prediction[streamIndex] = createInferRuntime(sample::gLogger); assert(mTrtRunTime_prediction[streamIndex] != nullptr); mPredictionEngine[streamIndex] = mTrtRunTime_prediction[streamIndex]->deserializeCudaEngine(data_predict.get(), length_predict); assert(mPredictionEngine[streamIndex] != nullptr); IntializeEngines_predict(streamIndex); } } int main() { int numberOfStreams = 1; cout << "Please enter the number of streams to run in parallel: "; cin >> numberOfStreams; if (numberOfStreams > 5) { cout << "We only support up to 5 streams." 
        numberOfStreams = 5;
    }
    cout << "Running " + to_string(numberOfStreams) + " streams in parallel" << std::endl;

    mPredictionContext.resize(numberOfStreams);
    mPredictionEngine.resize(numberOfStreams);
    mTrtRunTime_prediction.resize(numberOfStreams);
    mTrtCudaStream_onnx.resize(numberOfStreams);
    mTrtCudaBuffer_prediction.resize(numberOfStreams);
    mTrtBindBufferSize_prediction.resize(numberOfStreams);

    string modelName = "yolov4size416";
    int ModelOptimalBatchSize = 16;
    string precision = "fp16";
    _size = 416;
    modelName = modelName + "_" + std::to_string(ModelOptimalBatchSize);
    std::string onnxModel = modelName + ".onnx";
    std::string outputEngine_prediction = modelName + precision + "batch" + to_string(ModelOptimalBatchSize) + ".trt";

    nvinfer1::IBuilder* builder = createInferBuilder(sample::gLogger);

    // Load a previously serialized engine if it exists, otherwise build one from the ONNX model.
    ifstream f_prediction(outputEngine_prediction.c_str());
    if (f_prediction.good())
    {
        std::cout << "serialised model (prediction) exists" << std::endl;
        loadExistingEngine_predict(outputEngine_prediction, numberOfStreams);
    }
    else
    {
        std::cout << "serialised model (prediction) doesn't exist" << std::endl;
        buildPredictionEngine(builder, modelName, precision, 1, numberOfStreams);
    }
    builder->destroy();

    int channels = 3;
    for (int streamIndex = 0; streamIndex < numberOfStreams; streamIndex++)
        cudaStreamCreateWithFlags(&mTrtCudaStream_onnx[streamIndex], cudaStreamNonBlocking);

    // fill inference buffers (binding 0 = network input) with dummy data
    {
        std::vector<float> sourcepixels(_size * _size * 3 * ModelOptimalBatchSize);
        void* img_mat_GPU_float; // scratch device buffer, not used further in this example
        CUDA_CHECK(cudaMalloc(&img_mat_GPU_float, _size * _size * sizeof(float) * channels * ModelOptimalBatchSize));
        for (int streamIndex = 0; streamIndex < numberOfStreams; streamIndex++)
            CUDA_CHECK(cudaMemcpy(mTrtCudaBuffer_prediction[streamIndex][0], sourcepixels.data(),
                sourcepixels.size() * sizeof(float), cudaMemcpyHostToDevice));
    }

    // continuously run inference (key is never updated, so this loops until the process is stopped)
    std::cout << "start inferencing" << std::endl;
    char key = 0;
    while (key != 'q')
    {
        auto t_start_inference = std::chrono::high_resolution_clock::now();
        // enqueue all streams first, then synchronize, so the streams overlap on the GPU
        for (int streamIndex = 0; streamIndex < numberOfStreams; streamIndex++)
            mPredictionContext[streamIndex]->enqueueV2(mTrtCudaBuffer_prediction[streamIndex].data(), mTrtCudaStream_onnx[streamIndex], nullptr);
        for (int streamIndex = 0; streamIndex < numberOfStreams; streamIndex++)
            cudaStreamSynchronize(mTrtCudaStream_onnx[streamIndex]);
        auto t_end_inference = std::chrono::high_resolution_clock::now();
        float total_inference = std::chrono::duration<float, std::milli>(t_end_inference - t_start_inference).count();
        //std::cout << "Time taken to copy to GPU (per frame): " << total_cudaMemcpyAsync << " ms." << "\n";
        printf("Time taken for inference %f ms.\n", total_inference);
    }
    return 0;
}