Thank you. Here is the detailed code. Please have a look.
/// Runs the detection pipeline on one image and prints per-stage timings.
///
/// Stages, in order: preprocess2gpu() into vBufferD[0], network execution via
/// the parameterless inference(), nms() over outputData, then conversion of
/// every remaining Detection into a DetectResult.
///
/// @param img  Input image; handed to preprocess2gpu() and to
///             get_rect_adapt_landmark() for each detection.
/// @return One DetectResult per detection produced by nms().
std::vector<DetectResult> YoloDetecter::inference(cv::Mat& img)
{
    // steady_clock is monotonic; system_clock can be adjusted while we are
    // measuring, which would corrupt (or even negate) the reported intervals.
    using Clock = std::chrono::steady_clock;
    const auto elapsedMs = [](const Clock::time_point& from, const Clock::time_point& to) {
        return std::chrono::duration_cast<std::chrono::milliseconds>(to - from).count();
    };

    const auto t1 = Clock::now();
    preprocess2gpu(img, vBufferD[0], kInputH, kInputW, stream); // infer preprocessing
    // NOTE(review): preprocess2gpu() only launches its kernels and nothing waits
    // on the stream before inference() synchronizes it, so the GPU time of the
    // preprocessing is likely reported under "inference time" — confirm.
    const auto t2 = Clock::now();
    inference(); // infer
    const auto t3 = Clock::now();

    std::vector<Detection> res;
    nms(res, outputData, kConfThresh, kNmsThresh); // infer postprocessing
    const auto t4 = Clock::now();

    std::vector<DetectResult> final_res;
    final_res.reserve(res.size()); // one allocation instead of repeated growth
    for (Detection& det : res)
    {
        // Called before the keypoints are copied, exactly as in the original
        // ordering, in case the helper adjusts the keypoints in place.
        const cv::Rect r = get_rect_adapt_landmark(img, det.bbox, det.keypoints);

        DetectResult single_res;
        single_res.tlwh = r;
        memcpy(single_res.keypoints, det.keypoints, sizeof(float) * kNumberOfPoints * 3);
        single_res.conf = det.conf;
        single_res.class_id = static_cast<int>(det.class_id);
        final_res.push_back(single_res);
    }
    const auto t5 = Clock::now();

    // Emit plain newlines and flush once at the end rather than after every line.
    std::cout << "#TRT2: preprocess time: " << elapsedMs(t1, t2) << "ms" << '\n';
    std::cout << "#TRT2: inference time: " << elapsedMs(t2, t3) << "ms" << '\n';
    std::cout << "#TRT2: nms time: " << elapsedMs(t3, t4) << "ms" << '\n';
    std::cout << "#TRT2: final_res time: " << elapsedMs(t4, t5) << "ms" << '\n';
    std::cout << "#TRT2: all time: " << elapsedMs(t1, t5) << "ms" << '\n';
    std::cout << std::endl;
    return final_res;
}
/// Uploads a host image to the GPU and runs the letterbox + process kernels that
/// fill the network input buffer.
///
/// The byte count below is rows * cols * 3 * sizeof(uchar) read straight from
/// srcImg.data, i.e. the code assumes a continuous 8-bit, 3-channel image —
/// TODO confirm at the call site. It also assumes srcDevData / midDevData are
/// already allocated and large enough for this image — TODO confirm.
///
/// @param srcImg        Source image on the host.
/// @param dstData       Device buffer written by the `process` kernel.
/// @param dstHeight     Target (network input) height.
/// @param dstWidth      Target (network input) width.
/// @param preprocess_s  CUDA stream on which both kernels are launched.
void preprocess2gpu(const cv::Mat& srcImg, float* dstData, const int dstHeight, const int dstWidth, const cudaStream_t& preprocess_s)
{
    const int srcHeight = srcImg.rows;
    const int srcWidth = srcImg.cols;

    // An empty image would make the scale factors below divide by zero and feed
    // inf/NaN into the int conversions; there is nothing to preprocess anyway.
    if (srcHeight <= 0 || srcWidth <= 0) {
        return;
    }

    // Compute the element count in size_t so large images cannot overflow int.
    const size_t srcElements = static_cast<size_t>(srcHeight) * static_cast<size_t>(srcWidth) * 3;
    // Blocking copy: it has completed before the kernels below are launched.
    // NOTE(review): the returned cudaError_t is not checked here.
    cudaMemcpy(srcDevData, srcImg.data, sizeof(uchar) * srcElements, cudaMemcpyHostToDevice);

    // calculate width and height after resize
    int w = 0;
    int h = 0;
    int x = 0;
    int y = 0;
    const float r_w = dstWidth / (srcWidth * 1.0);
    const float r_h = dstHeight / (srcHeight * 1.0);
    if (r_h > r_w) {
        // Width is the limiting side: fill the width, center vertically.
        w = dstWidth;
        h = r_w * srcHeight;
        x = 0;
        y = (dstHeight - h) / 2;
    }
    else {
        // Height is the limiting side: fill the height, center horizontally.
        w = r_h * srcWidth;
        h = dstHeight;
        x = (dstWidth - w) / 2;
        y = 0;
    }

    const dim3 blockSize(32, 32);
    const dim3 gridSize((dstWidth + blockSize.x - 1) / blockSize.x, (dstHeight + blockSize.y - 1) / blockSize.y);

    // letterbox and resize
    letterbox<<<gridSize, blockSize, 0, preprocess_s>>>(srcDevData, srcHeight, srcWidth, midDevData, dstHeight, dstWidth, h, w, y, x);
    // Launch on the same stream as letterbox: `process` reads midDevData, which
    // letterbox writes, and work submitted later on preprocess_s reads dstData.
    // On the default stream this ordering is not guaranteed when preprocess_s
    // is a non-blocking stream.
    process<<<gridSize, blockSize, 0, preprocess_s>>>(midDevData, dstData, dstHeight, dstWidth);
}
/// Executes the network on the bindings in vBufferD and copies the output
/// binding (vBufferD[1], vTensorSize[1] bytes) back to the host buffer
/// outputData, blocking until `stream` has finished.
void YoloDetecter::inference()
{
    // Batch size 1. The enqueue result used to be discarded; surface a failure
    // so a bad run is not mistaken for a valid one.
    const bool enqueued = context->enqueue(1, (void**)vBufferD.data(), stream, nullptr);
    if (!enqueued) {
        std::cerr << "#TRT2: context->enqueue failed; output data may be invalid" << std::endl;
    }

    // The device-to-host copy is queued on the same stream, so it runs after the
    // enqueued network; the synchronize makes outputData safe to read on return.
    CUDA_CHECK(cudaMemcpyAsync((void *)outputData, vBufferD[1], vTensorSize[1], cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
}