Decrease latency from Jetson-Inference model


From the github repo jetson-inference I created a code snippet that takes a cv::Mat as input, performs super-resolution inference using a model from the jetson-inference model zoo, and outputs the result as a cv::Mat.
I understand that this may not be the best option, but I do several processes in opencv, so it would be hard to switch to gstreamer right now. I am also aware of the performance gains of inputting a batch for inference.
I was wondering how could I increase the speed of the pipeline? Is it possible to load GpuMat directly to the inference?

With the following code I get around 258ms (3.8fps) in a Jetson Nano:

#include <jetson-inference/superResNet.h>

#include <jetson-utils/loadImage.h>
#include <jetson-utils/commandLine.h>
#include <jetson-utils/cudaMappedMemory.h>
#include <jetson-utils/cudaRGB.h>

#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>

#include <iostream>
#include <chrono>

// main entry point
//
// Reads frames from a video file, upscales each one with the jetson-inference
// superResNet model, and displays the result with OpenCV. Reconstructed with
// proper braces/comment syntax; fixes vs. the original snippet:
//   - cudaMemcpy2D was missing its source pointer (frame.data)
//   - COLOR_RGB2BGR on a 4-channel CV_32FC4 image asserts; use COLOR_RGBA2BGRA
//   - cv::imshow needs cv::waitKey to actually render
//   - end-of-stream / empty frames are detected instead of crashing in cvtColor
//   - CUDA allocations are checked and released; division by zero guarded
int main()
{
	// load super resolution network
	superResNet* net = superResNet::Create();

	if( !net )
	{
		printf("superres-console:  failed to load superResNet\n");
		return 0;
	}

	// open the input video
	cv::VideoCapture cap("videoplayback.mp4");
	if (!cap.isOpened())
	{
		printf("Error loading video");
		return 0;
	}

	// Query the frame resolution up front (codec/system dependent) so device
	// buffers can be sized without wasting the first decoded frame.
	const int inputWidth  = (int)cap.get(cv::CAP_PROP_FRAME_WIDTH);
	const int inputHeight = (int)cap.get(cv::CAP_PROP_FRAME_HEIGHT);

	// Allocate device memory: packed RGB8 upload buffer plus the float4 RGBA
	// buffer that the network expects as input.
	uchar3* imgBufferRGB   = NULL;
	float4* imgBufferRGBAf = NULL;

	if( cudaMalloc((void**)&imgBufferRGB,   inputWidth * inputHeight * sizeof(uchar3)) != cudaSuccess ||
	    cudaMalloc((void**)&imgBufferRGBAf, inputWidth * inputHeight * sizeof(float4)) != cudaSuccess )
	{
		printf("superres-console:  failed to allocate CUDA input buffers\n");
		return 0;
	}

	// Allocate zero-copy mapped memory for the output so the host (outputCPU)
	// and device (outputCUDA) views alias the same physical memory.
	float* outputCPU  = NULL;
	float* outputCUDA = NULL;

	const int outputWidth  = inputWidth  * net->GetScaleFactor();
	const int outputHeight = inputHeight * net->GetScaleFactor();

	if( !cudaAllocMapped((void**)&outputCPU, (void**)&outputCUDA, outputWidth * outputHeight * sizeof(float4)) )
	{
		printf("superres-console:  failed to allocate memory for %ix%i output image\n", outputWidth, outputHeight);
		return 0;
	}

	printf("superres-console:  input image size - %ix%i\n", inputWidth, inputHeight);
	printf("superres-console:  output image size - %ix%i\n", outputWidth, outputHeight);

	cv::Mat frame;

	while (true)
	{
		// start per-frame latency timer
		const auto timeCount = std::chrono::high_resolution_clock::now();

		cap >> frame;
		if (frame.empty())
			break;   // end of stream -- avoids crashing in cvtColor below

		// convert from OpenCV's BGR order to the RGB the network expects
		cv::cvtColor(frame, frame, cv::COLOR_BGR2RGB);

		// Upload to the device. frame.step is the source pitch (rows may be
		// padded); the source pointer was missing in the original call.
		cudaMemcpy2D((void*)imgBufferRGB, inputWidth * sizeof(uchar3),
		             (void*)frame.data, frame.step,
		             inputWidth * sizeof(uchar3), inputHeight,
		             cudaMemcpyHostToDevice);

		// convert packed RGB8 to the float4 RGBA (0-255) network input format
		cudaRGB8ToRGBA32(imgBufferRGB, imgBufferRGBAf, inputWidth, inputHeight);

		// upscale image with network
		if( !net->UpscaleRGBA((float*)imgBufferRGBAf, inputWidth, inputHeight,
		                      outputCUDA, outputWidth, outputHeight) )
		{
			printf("superres-console:  failed to process super resolution network\n");
			break;
		}

		// NOTE(review): assumes UpscaleRGBA() synchronizes before returning so
		// the mapped outputCPU pointer holds the finished frame -- if it runs
		// async on a stream, a cudaDeviceSynchronize() is needed here.

		// Wrap the mapped output with no copy. Pixels are float RGBA in
		// 0-255; imshow expects float images in [0,1], hence the divide.
		cv::Mat result(outputHeight, outputWidth, CV_32FC4, outputCPU);
		result /= 255;
		// 4-channel image: RGB2BGR would fail the channel-count assertion
		cv::cvtColor(result, result, cv::COLOR_RGBA2BGRA);

		cv::imshow("results", result);
		cv::waitKey(1);   // required for the highgui window to actually render

		// report per-frame latency / fps (guard against a 0 ms measurement)
		const auto timeStopCount = std::chrono::high_resolution_clock::now();
		const long long globalDuration =
			std::chrono::duration_cast<std::chrono::milliseconds>(timeStopCount - timeCount).count();
		if (globalDuration > 0)
			std::cout << "Latency value: " << globalDuration << " [ms].  "
			          << 1000.0f / globalDuration << "[fps]" << "\n";
	}

	// release device / mapped allocations (originally leaked)
	cudaFree(imgBufferRGB);
	cudaFree(imgBufferRGBAf);
	cudaFreeHost(outputCPU);   // cudaAllocMapped memory is host-allocated

	delete net;
	return 0;
}

Thank you for your time

Hi @FelipeVW, what performance do you get on Nano if you just use the original superres-console program? (you would want to run jetson_clocks script beforehand to get a more accurate performance number)

Since jetson-inference expects the image in float4 RGBA format, it would need to be converted as you have done. Although you could pretty easily make your own cudaBGR8ToRGBA32() function from the existing cudaRGB8ToRGBA32() code by just swapping the R & B color channels in the input image. That would at least allow you to eliminate that call to cv::cvtColor(frame, frame, cv::COLOR_BGR2RGB)

It would also be interesting to know how much of the performance is related to the cv::imshow() processing - could you comment out all of that code (including that other call to cv::cvtColor() and cv::Mat(...)) and see how the performance improves? That could provide you an indicator of where to focus further optimization efforts.

1 Like

I already found where the bottleneck is. The results were the following:

  • Full code: 258ms -> 3.9 fps
  • With sudo jetson_clocks: 255 ms -> 3.92 fps
  • No imshow: 240 ms -> 4.19 fps
  • No imshow and cv::Mat output conversion: 84ms -> 11.9 fps

Clearly I should improve the output path. What I am wondering is: would it be possible to pass a cv::cuda::GpuMat directly? I've seen that interoperability in the VisionWorks tutorials. Like:

cv::cuda::GpuMat result = cv::cuda::GpuMat(outputHeight, outputWidth, CV_32FC4, outputCUDA);

Thanks for the help, great model zoo!

Yes, I believe it should be possible, if the GpuMat is in float4 RGBA format with pixel intensities 0-255. You would want to pass the GpuMat’s CUDA device pointer to superResNet::UpscaleRGBA().

1 Like