#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "NvInfer.h" #include "NvInferRuntime.h" #include "NvOnnxParser.h" #include "NvInferPlugin.h" #include #include #include #include #include #include "buffers.h" #include "common.h" #include "logger.h" static const int DETECTION_MAX_INSTANCES = 100; static const Dims2 MODEL_DETECTION_SHAPE{DETECTION_MAX_INSTANCES, 6}; static const Dims4 MODEL_MASK_SHAPE{DETECTION_MAX_INSTANCES, 2, 28, 28}; static const int NUM_CLASSES = 1 + 1; // COCO has 80 classes struct RawDetection { float y1, x1, y2, x2, class_id, score; }; struct Mask { float raw[14 * 2 * 14 * 2]; }; struct BBox { float x1, y1, x2, y2; void draw(cv::Mat rgb) { cv::rectangle(rgb, cv::Rect(this->x1, this->y1, this->x2 - this->x1, this->y2 - this->y1), cv::Scalar(255, 0, 255)); } }; struct BBoxInfo { BBox box; int label = -1; float prob = 0.0f; Mask* mask = nullptr; }; struct detection{ float cnt = 0; float confidence = 0.0; float ix = 0.0; float iy = 0.0; float ex = 0.0; float ey = 0.0; void draw(cv::Mat rgb) { cv::rectangle(rgb, cv::Rect(this->ix, this->iy, (this->ex - this->ix), (this->ey- this->iy) ), cv::Scalar(255, 0, 255)); } }; namespace fs = std::experimental::filesystem; using namespace nvinfer1; class Logger : public ILogger { void log(Severity severity, const char* msg) override { // suppress info-level messages if (severity != Severity::kINFO) std::cout << msg << std::endl; } } gLogger; // stuff we know about the network and the caffe input/output blobs static const int INPUT_H = 768; static const int INPUT_W = 1280; static const int IMAGE_CHANNEL = 3; static const int OUTPUT_SIZE = 2; static const int BATCH_SIZE = 1; const char* INPUT_BLOB_NAME = "Input"; const char* OUTPUT_BLOB_NAME = "generate_detections"; const std::string planFilePath{"int8b1y.engine"}; const std::string inferFilePath{"redim/"}; std::vector< std::string > outputs{ OUTPUT_BLOB_NAME }; nvinfer1::ICudaEngine* loadGIEEngine(const std::string planFilePath) { // reading the model in memory std::cout << "Loading TRT Engine..." << std::endl; std::stringstream gieModelStream; gieModelStream.seekg(0, gieModelStream.beg); std::ifstream cache(planFilePath); assert(cache.good()); gieModelStream << cache.rdbuf(); cache.close(); initLibNvInferPlugins(&gLogger,""); // calculating model size gieModelStream.seekg(0, std::ios::end); const int modelSize = gieModelStream.tellg(); gieModelStream.seekg(0, std::ios::beg); void* modelMem = malloc(modelSize); gieModelStream.read((char*) modelMem, modelSize); nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger); nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(modelMem, modelSize, nullptr); free(modelMem); runtime->destroy(); std::cout << "Loading Complete!" << std::endl; return engine; } void timeInference(IExecutionContext& context, float * input, float* output, float* output2, int batchSize) { const ICudaEngine& engine = context.getEngine(); // input and output buffer pointers that we pass to the engine - the engine requires exactly ICudaEngine::getNbBindings(), // of these, but in this case we know that there is exactly one input and one output. void* buffers[3]; // In order to bind the buffers, we need to know the names of the input and output tensors. // note that indices are guaranteed to be less than ICudaEngine::getNbBindings() int inputIndex = engine.getBindingIndex("Input"); int output1Index = engine.getBindingIndex("generate_detections"); int output2Index = engine.getBindingIndex("mask_head/mask_fcn_logits/BiasAdd"); // allocate GPU buffers DimsCHW inputDims = static_cast < DimsCHW && >(engine.getBindingDimensions(inputIndex)); DimsCHW outputDims = static_cast < DimsCHW && >(engine.getBindingDimensions(output1Index)); size_t inputSize = batchSize * inputDims.c() * inputDims.h() * inputDims.w() * sizeof(float); size_t outputSize = batchSize * outputDims.c() * outputDims.h() * outputDims.w() * sizeof(float); cudaMalloc(&buffers[inputIndex], 11796480); cudaMalloc(&buffers[output1Index], 2400); cudaMalloc(&buffers[output2Index], 627200); cudaStream_t stream; cudaStreamCreate(&stream); cudaMemcpyAsync(buffers[inputIndex], input, 1280*768*3 * sizeof(float), cudaMemcpyHostToDevice, stream); context.enqueue(batchSize, buffers, stream, nullptr); /*cudaMemcpyAsync(output, bindings[1 - inputId], outputTensor.size() * sizeof(float), cudaMemcpyDeviceToHost, stream);*/ cudaMemcpyAsync(output, buffers[output1Index], 2400, cudaMemcpyDeviceToHost, stream); cudaMemcpyAsync(output2, buffers[output2Index], 627200, cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); cudaStreamDestroy(stream); cudaFree(buffers[inputIndex]); cudaFree(buffers[output1Index]); cudaFree(buffers[output2Index]); } void print_mat(cv::Mat mat) { std::cout << "channels : " << mat.channels() << std::endl; std::cout << "depth : " << mat.depth() << std::endl; std::cout << "elemSize : " << mat.elemSize() << std::endl; std::cout << "elemSize1: " << mat.elemSize1() << std::endl; std::cout << "total : " << mat.total() << std::endl; std::cout << "type : " << mat.type() << std::endl; std::cout << "dims : " << mat.dims << std::endl; std::cout << "flags : " << mat.flags << std::endl; std::cout << "rows : " << mat.rows << std::endl; std::cout << "cols : " << mat.cols << std::endl; std::cout << "size : " << mat.size << std::endl; std::cout << "step1 : " << mat.step1() << std::endl; std::cout << "step : " << mat.step << std::endl; } int main(int argc, char** argv) { ICudaEngine* engine = loadGIEEngine(planFilePath); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); for (auto &p : fs::recursive_directory_iterator(inferFilePath)){ if (p.path().extension() == ".jpg"){ cv::Mat image = cv::imread(p.path().string(),cv::IMREAD_COLOR); std::cout << p.path().string() << std::endl; cv::cvtColor(image, image, cv::COLOR_BGR2RGB); //cv::copyMakeBorder(image,image,0,(768-720),0,0,cv::BORDER_CONSTANT,cv::Scalar(0,0,0)); print_mat(image); if(!image.data){ fprintf(stderr, "fail\n"); break; } std::vector inputData; int rows = image.rows; int H = rows; int cols = image.cols; int W = cols; int C = image.channels(); int N = 1; inputData.resize(C * H * W); float pixelMean[3]{ 123.675, 116.280, 103.53}; //float pixelMean[3]{ 103.9, 116.8, 123.7}; // Host memory for input buffer for (int i = 0, volImg = C * H * W; i < 1; ++i) { for (int c = 0; c < C; ++c) { // The color image to input should be in RGB order for (unsigned j = 0, volChl = H * W; j < volChl; ++j) { inputData[i * volImg + c * volChl + j] = float(image.data[j * C + c]) - pixelMean[c]; } } } int input_dim_h = 768, input_dim_w = 1280; int image_height = H; int image_width = W; // resize the DsImage with scale const int image_dim = std::max(image_height, image_width); int resizeH = (int) image_height * input_dim_h / (float) image_dim; int resizeW = (int) image_width * input_dim_w / (float) image_dim; // keep accurary from (float) to (int), then to float float window_x = (1.0f - (float) resizeW / input_dim_w) / 2.0f; float window_y = (1.0f - (float) resizeH / input_dim_h) / 2.0f; float window_width = (float) resizeW / input_dim_w; float window_height = (float) resizeH / input_dim_h; float final_ratio_x = (float) image_width / window_width; float final_ratio_y = (float) image_height / window_height; //cv::Mat inputBlob = cv::dnn::blobFromImage(image, 1.0f, cv::Size(INPUT_W, INPUT_H), cv::Scalar(), false, true); float prob[2400]; float tmasks[627200]; timeInference(*context, inputData.data(), prob, tmasks, 1); int chk = 0; detection d; //Mask* masks = reinterpret_cast((float*) masks); int detectionOffset = samplesCommon::volume(MODEL_DETECTION_SHAPE); // (100,6) int maskOffset = samplesCommon::volume(MODEL_MASK_SHAPE); // (100, 81, 28, 28) RawDetection* detections = reinterpret_cast((float*) prob); Mask* masks = reinterpret_cast((float*) tmasks); for (int det_id = 0; det_id < DETECTION_MAX_INSTANCES; det_id++) { RawDetection cur_det = detections[det_id]; int label = (int) cur_det.class_id; if (label <= 0) continue; BBoxInfo det; det.label = label; det.prob = cur_det.score; if (det.prob < 0.5){ continue; } det.box.x1 = cur_det.x1; det.box.y1 = cur_det.y1; det.box.x2 = cur_det.x2; det.box.y2 = cur_det.y2; if (det.box.x2 <= det.box.x1 || det.box.y2 <= det.box.y1) continue; det.mask = masks + det_id * NUM_CLASSES + label; det.box.draw(image); cv::imshow("Image",image); cv::waitKey(0); } std::cout << "INF" << std::endl; } } }