Hi,
Working from the jetson-inference GitHub repo, I put together a code snippet that takes a cv::Mat as input, runs super-resolution inference with the superResNet model from the jetson-inference model zoo, and returns the result as a cv::Mat.
I understand this may not be the best option, but I do several processing steps in OpenCV, so it would be hard to switch to GStreamer right now. I am also aware of the performance gains from batching the input to the inference.
I was wondering how I could increase the speed of the pipeline. Is it possible to feed a cv::cuda::GpuMat directly into the inference? (I've added a rough sketch of what I mean after the code below.)
With the following code I get around 258 ms per frame (3.8 fps) on a Jetson Nano:
#include <jetson-inference/superResNet.h>
#include <jetson-utils/loadImage.h>
#include <jetson-utils/commandLine.h>
#include <jetson-utils/cudaMappedMemory.h>
#include <jetson-utils/cudaRGB.h>
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>
#include <iostream>
#include <chrono>
// main entry point
int main()
{
    /*
     * load super resolution network
     */
    superResNet* net = superResNet::Create();

    if( !net )
    {
        printf("superres-console: failed to load superResNet\n");
        return 0;
    }

    //net->EnableLayerProfiler();
    // open the input video
    cv::VideoCapture cap("videoplayback.mp4");

    if (!cap.isOpened())
    {
        printf("Error loading video\n");
        return 0;
    }
    /*
     * query the input resolution and grab the first frame
     */
    cv::Mat frame;

    const int inputWidth = cap.get(cv::CAP_PROP_FRAME_WIDTH);
    const int inputHeight = cap.get(cv::CAP_PROP_FRAME_HEIGHT);

    cap >> frame;
    // Allocate device memory for the input:
    // imgBufferRGB holds the packed RGB8 copy of the frame,
    // imgBufferRGBAf holds the float4 RGBA version that the network consumes
    uchar3* imgBufferRGB = NULL;
    float4* imgBufferRGBAf = NULL;

    cudaMalloc((void**)&imgBufferRGB, inputWidth * inputHeight * sizeof(uchar3));
    cudaMalloc((void**)&imgBufferRGBAf, inputWidth * inputHeight * sizeof(float4));

    // Allocate the output as zero-copy mapped memory: outputCPU and outputCUDA
    // refer to the same physical buffer, seen from the CPU and GPU respectively
    float* outputCPU = NULL;
    float* outputCUDA = NULL;

    const int outputWidth = inputWidth * net->GetScaleFactor();
    const int outputHeight = inputHeight * net->GetScaleFactor();

    if( !cudaAllocMapped((void**)&outputCPU, (void**)&outputCUDA, outputWidth * outputHeight * sizeof(float4)) )
    {
        printf("superres-console: failed to allocate memory for %ix%i output image\n", outputWidth, outputHeight);
        return 0;
    }
printf("superres-console: input image size - %ix%i\n", inputWidth, inputHeight);
printf("superres-console: output image size - %ix%i\n", outputWidth, outputHeight);
// Execution timing
std::chrono::time_point<std::chrono::high_resolution_clock> timeCount;
std::chrono::time_point<std::chrono::high_resolution_clock> timeStopCount;
long long globalDuration;
    while (true)
    {
        // Start counting
        timeCount = std::chrono::high_resolution_clock::now();

        cap >> frame;

        if (frame.empty())    // end of the video
            break;

        // Convert from BGR to RGB (the network expects RGB ordering)
        cv::cvtColor(frame, frame, cv::COLOR_BGR2RGB);

        // Copy the frame to the device; frame.step is used as the source pitch
        // so any row padding in the cv::Mat is handled correctly
        cudaMemcpy2D((void*)imgBufferRGB, frame.cols * sizeof(uchar3), (void*)frame.data, frame.step,
                     frame.cols * sizeof(uchar3), frame.rows, cudaMemcpyHostToDevice);

        // Convert the packed RGB8 buffer to RGBA float32 for the network
        cudaRGB8ToRGBA32(imgBufferRGB, imgBufferRGBAf, frame.cols, frame.rows);
        /*
         * upscale image with network
         */
        if( !net->UpscaleRGBA((float*)imgBufferRGBAf, inputWidth, inputHeight,
                              outputCUDA, outputWidth, outputHeight) )
        {
            printf("superres-console: failed to process super resolution network\n");
            return 0;
        }

        // wait for the GPU to finish writing the zero-copy output buffer
        // before the CPU (OpenCV) reads it below
        CUDA(cudaDeviceSynchronize());
        /*
         * display the output image
         */
        // wrap the mapped output buffer in a cv::Mat header (no copy is made)
        cv::Mat result(outputHeight, outputWidth, CV_32FC4, outputCPU);

        result /= 255;    // imshow expects floating-point images in the [0,1] range

        // the network output is RGBA float, so convert RGBA -> BGR for display
        cv::cvtColor(result, result, cv::COLOR_RGBA2BGR);

        cv::imshow("results", result);
        cv::waitKey(1);

        // Count time
        timeStopCount = std::chrono::high_resolution_clock::now();
        globalDuration = std::chrono::duration_cast<std::chrono::milliseconds>(timeStopCount - timeCount).count();

        std::cout << "Latency: " << globalDuration << " [ms], " << 1000.0f / globalDuration << " [fps]\n";
    }
    // release the resources (reached when the video ends)
    cap.release();
    cv::destroyAllWindows();

    cudaFree(imgBufferRGB);
    cudaFree(imgBufferRGBAf);
    cudaFreeHost(outputCPU);

    delete net;
    return 0;
}
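For reference, this is roughly the GpuMat path I have in mind; it is only an untested sketch. It assumes OpenCV was built with the CUDA modules (opencv_cudaimgproc and the core CUDA support), which the stock JetPack build is not. The wrappers would be set up once outside the loop, and the upload plus GPU cvtColor would replace the cv::cvtColor + cudaMemcpy2D pair inside it:

#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaimgproc.hpp>

// Wrap the existing packed RGB device buffer in a GpuMat header (no copy);
// the explicit step tells OpenCV that the rows are densely packed
cv::cuda::GpuMat gpuRGB(inputHeight, inputWidth, CV_8UC3, imgBufferRGB, inputWidth * sizeof(uchar3));

// Upload the BGR frame and do the colour conversion on the GPU,
// writing straight into the wrapped buffer
cv::cuda::GpuMat gpuBGR;
gpuBGR.upload(frame);
cv::cuda::cvtColor(gpuBGR, gpuRGB, cv::COLOR_BGR2RGB);

// From here the pipeline is unchanged
cudaRGB8ToRGBA32(imgBufferRGB, imgBufferRGBAf, inputWidth, inputHeight);

// The zero-copy output could be wrapped the same way (using the device pointer)
// if further GPU-side OpenCV processing is wanted
cv::cuda::GpuMat gpuResult(outputHeight, outputWidth, CV_32FC4, outputCUDA);

As far as I understand, this mainly moves the BGR-to-RGB conversion off the CPU and keeps the data on the GPU for any later OpenCV processing; whether the network input could be fed straight from a GpuMat, without the cudaRGB8ToRGBA32 step, is exactly what I am not sure about.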
Thank you for your time