Anomalous performance behavior of nvp-models when encoding H.264 video

I’m observing that the nvp-models lead to anomalous performance behavior when encoding H.264 video with the hardware encoder. When I enable, for example, the Max-N mode and only run CUDA kernels (without H.264 encoding), I get the best performance as expected, and with Max-Q the worst. But when I also encode H.264 at the same time, Max-Q gives much higher overall performance than Max-N. I tested all nvp-models with a test application, with and without H.264 encoding, and this is the result (fps = frames per second):

nvp-model	fps (w/o enc.)	fps (with enc.)
0		86.3		24.3
1		56.8		41.4
2		74.5		20.5
3		74.7		48.8
4		74.6		21.9

Do you have any explanation for this?

This is the complete source code of my test application. It renders a 10-second Mandelbrot animation at 4K resolution and encodes it into a file. You can disable the encoding by commenting out line 18.

#include <iostream>
#include <chrono>
#include <stdexcept>
#include <cuda_runtime.h>

#include <glib.h>
#include <gst/gst.h>
#include <gst/app/gstappsrc.h>

using FloatType = float;
constexpr int MB_WIDTH = 3840;
constexpr int MB_HEIGHT = 2160;
constexpr FloatType MB_ASPECT = (FloatType)MB_WIDTH/MB_HEIGHT;
constexpr int MB_BLOCK_WIDTH = 32;
constexpr int MB_BLOCK_HEIGHT = 32;
constexpr int MAX_ITER = 64;
constexpr int FRAME_RATE = 30;
#define ENCODE_TO_FILE

// Wraps a GStreamer pipeline (appsrc -> omxh264enc -> qtmux -> filesink) plus a
// small pool of pinned host buffers that the CUDA kernel renders into and the
// encoder consumes without an additional application-side copy.
class GstCapture {
public:
	GstCapture() {
		GError *err = NULL;
		pipeline = gst_parse_launch("appsrc name=mysource ! omxh264enc name=myenc iframeinterval=60 profile=2 bitrate=30000000 control-rate=2"
									" ! qtmux ! filesink location=/home/nvidia/Documents/mandelbrot.mp4", &err);
		if (err) throw std::runtime_error("parse-error");

		appsrc = (GstAppSrc*)gst_bin_get_by_name(GST_BIN(pipeline), "mysource");
		if (!appsrc) throw std::runtime_error("no appsrc");
		g_object_set (G_OBJECT (appsrc),
			"caps", gst_caps_new_simple ("video/x-raw",
										"format", G_TYPE_STRING, "I420",
										"width", G_TYPE_INT, MB_WIDTH,
										"height", G_TYPE_INT, MB_HEIGHT,
										"framerate", GST_TYPE_FRACTION, 0, 1,
										NULL),
			"stream-type", GST_APP_STREAM_TYPE_STREAM,
			"format", GST_FORMAT_TIME,
			NULL);

		gst_element_set_state (pipeline, GST_STATE_PLAYING);
	}
	~GstCapture() {
		// Signal end-of-stream and wait until the pipeline has drained so the
		// MP4 file gets finalized before the pipeline is torn down.
		gst_app_src_end_of_stream(appsrc);
		GstBus *bus = gst_pipeline_get_bus(GST_PIPELINE (pipeline));
		GstMessage *msg = gst_bus_timed_pop_filtered(bus, GST_CLOCK_TIME_NONE,
				(GstMessageType)(GST_MESSAGE_EOS | GST_MESSAGE_ERROR));
		if (msg) gst_message_unref(msg);
		gst_object_unref(bus);
		gst_object_unref(appsrc);
		gst_element_set_state(pipeline, GST_STATE_NULL);
		gst_object_unref(pipeline);
	}
	struct Buffer {
		Buffer() {
			// Pinned host memory the kernel writes into directly; the chroma
			// planes stay at 128 (gray) because the kernel only fills luma.
			cudaMallocHost(&data, BUF_SIZE);
			cudaMemset(data, 128, BUF_SIZE);
		}
		~Buffer() {cudaFreeHost(data);}	// allocated with cudaMallocHost, not cudaMalloc
		uint32_t *data;
		bool used = false;
	};
	Buffer *getFreeBuffer() {
		for (int i = 0; i < BUF_COUNT; ++i) if (!buffers[i].used) return &buffers[i];
		return NULL;
	}
	void commitBuffer(Buffer *buffer) {
#ifndef ENCODE_TO_FILE
		return;	// encoding disabled: the buffer can be reused immediately
#endif
		// Wrap the pinned buffer without copying; cbDestroyBuf marks it free
		// again once the pipeline has released the GstBuffer.
		buffer->used = true;
		GstBuffer *gstBuf = gst_buffer_new_wrapped_full((GstMemoryFlags)0, (gpointer)buffer->data, BUF_SIZE, 0, BUF_SIZE, (void*)buffer, cbDestroyBuf);
		GST_BUFFER_PTS (gstBuf) = timestamp;
		GST_BUFFER_DURATION (gstBuf) = gst_util_uint64_scale_int (1, GST_SECOND, FRAME_RATE);
		timestamp += GST_BUFFER_DURATION (gstBuf);
		GstFlowReturn ret = gst_app_src_push_buffer(appsrc, gstBuf);
		if (ret != GST_FLOW_OK) throw std::runtime_error("push-error");
	}
private:
	static constexpr int BUF_COUNT = 8;
	static constexpr int BUF_SIZE = MB_WIDTH*MB_HEIGHT*3/2;	// I420: 12 bits per pixel
	Buffer buffers[BUF_COUNT];
	GstElement *pipeline = NULL;
	GstAppSrc *appsrc = NULL;
	GstClockTime timestamp = 0;

	static void cbDestroyBuf(gpointer data) {
		Buffer *buf = (Buffer*)data;
		buf->used = false;
	}
};

// Each thread computes four horizontally adjacent luma (Y) samples and packs
// them into one uint32_t ("quad") so the output buffer is written with 32-bit
// stores; the chroma planes keep their pre-set value of 128 (gray).
__global__ void mandelbrotK(uint32_t *pixelsOut, FloatType tx, FloatType ty, FloatType s) {
	const int xQuad = blockIdx.x*blockDim.x + threadIdx.x;
	const int y = blockIdx.y*blockDim.y + threadIdx.y;
	const FloatType cIm = -s/MB_ASPECT*((FloatType)y/MB_HEIGHT-FloatType(0.5)) + ty;
	uint32_t quad;
	uint8_t *Y = (uint8_t*)&quad;
	for (int i = 0; i < 4; ++i) {
		const int x = 4*xQuad+i;
		const FloatType cRe = s*((FloatType)x/MB_WIDTH-FloatType(0.5)) + tx;
		FloatType zRe = 0;
		FloatType zIm = 0;
		int it;
		for (it = 0; it < MAX_ITER && zRe*zRe+zIm*zIm < 2*2; ++it) {
			const FloatType temp = zRe*zRe - zIm*zIm + cRe;
			zIm = 2*zRe*zIm + cIm;
			zRe = temp;
		}
		Y[i] = it==MAX_ITER?0:it*(256/MAX_ITER);
	}
	pixelsOut[xQuad + y*(MB_WIDTH/4)] = quad;
}

static GMainLoop *mainLoop;
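
// Idle callback: whenever the GLib main loop has nothing else to do, render one
// frame into a free buffer (if one is available), hand it to the encoder, and
// print throughput statistics once per second.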
static gboolean myIdleFunc(gpointer user_data) {
	using namespace std;
	using namespace chrono;
	using theclock = high_resolution_clock;

	static GstCapture *gstCapture = NULL;
	if (!gstCapture) gstCapture = new GstCapture;
	GstCapture::Buffer *buffer = gstCapture->getFreeBuffer();
	if (!buffer) return G_SOURCE_CONTINUE;

	static double scaleFactor = 2;
	scaleFactor *= 0.999;

	static theclock::time_point lastTime;
	static theclock::time_point startTime;
	static int lastFrameCount = 0;
	static int totalFrameCount = 0;

	// Note: the kernel launch below is asynchronous, so "kernel" in the log is
	// only the launch overhead; the GPU execution time shows up as "sync".
	const auto tStartRender = theclock::now();
	mandelbrotK<<<dim3(MB_WIDTH/MB_BLOCK_WIDTH/4, MB_HEIGHT/MB_BLOCK_HEIGHT), dim3(MB_BLOCK_WIDTH, MB_BLOCK_HEIGHT)>>>(
		buffer->data, -0.7436, 0.131, scaleFactor
	);

	const auto tStartSync = theclock::now();
	cudaDeviceSynchronize();

	const auto tCommit = theclock::now();
	gstCapture->commitBuffer(buffer);

	const auto tDone = theclock::now();

	const auto curTime = tDone;
	if (!lastTime.time_since_epoch().count()) {
		lastTime = curTime;
		startTime = curTime;
	}
	++lastFrameCount;
	++totalFrameCount;
	const double timeDiffLast = duration_cast<duration<double>>(curTime-lastTime).count();
	if (timeDiffLast > 1) {
		const double timeDiffStart = duration_cast<duration<double>>(curTime-startTime).count();

		const double kernelDuration = duration_cast<duration<double, ratio<1, 1000>>>(tStartSync-tStartRender).count();
		const double syncDuration = duration_cast<duration<double, ratio<1, 1000>>>(tCommit-tStartSync).count();
		const double commitDuration = duration_cast<duration<double, ratio<1, 1000>>>(tDone-tCommit).count();
		const double renderDuration = duration_cast<duration<double, ratio<1, 1000>>>(tDone-tStartRender).count();

		printf("%.0fs, %.1f fps (kernel: %.3fms + sync: %.3fms + commit: %.3fms = %.3fms)",
				timeDiffStart, lastFrameCount/timeDiffLast,
				kernelDuration, syncDuration, commitDuration, renderDuration);
		cout << endl;
		lastTime = curTime;
		lastFrameCount = 0;
	}

	constexpr int MAX_FRAME_COUNT = 60*5;
	if (totalFrameCount == MAX_FRAME_COUNT) {
		const double timeDiffStart = duration_cast<duration<double>>(curTime-startTime).count();
		printf("Result after %d frames: Rendering needed %.2f seconds with an avg. frame rate of %.2f fps",
				MAX_FRAME_COUNT, timeDiffStart, totalFrameCount/timeDiffStart);
		cout << endl;

		delete gstCapture;
		gstCapture = NULL;
		g_main_loop_quit(mainLoop);
		return G_SOURCE_REMOVE;
	}

	return G_SOURCE_CONTINUE;
}

int main(int argc, char *argv[]) {
	gst_init(&argc, &argv);
	mainLoop = g_main_loop_new(NULL, FALSE);
	g_idle_add(myIdleFunc, NULL);
	g_main_loop_run(mainLoop);
	g_main_loop_unref(mainLoop);
	return 0;
}

When encoding, are resources besides just the CUDA engine being used? Otherwise, this seems truly bizarre.

Hi, does ‘nvp-models’ mean TX2 + default carrier board?

With “nvp-model” I mean the performance/power-consumption tradeoff that can be configured with the nvpmodel command. I presume nvpmodel is an abbreviation for “NVIDIA performance/power model”. I’m using the default TX2 dev kit.
By the way, each time I call the nvpmodel command I immediately run jetson_clocks.sh to set the minimum clock frequencies equal to the maximum ones and therefore get consistent behavior.

Hi,
Which document describes the usage of ‘nvp-model’? Have you tried tegrastats in Appendix → Tegra Stats Utility?
https://developer.nvidia.com/embedded/dlc/l4t-documentation-27-1

And please share steps to reproduce the issue so that we can do further check.

You can read about nvpmodel here:

No, I didn’t try tegrastats.

The Nsight project of my test application is attached. You compile it by calling the makefile in the mandelbrot/Release directory (“make mandelbrot”). Prior to this you need to install libgstreamer-plugins-base1.0-dev (“sudo apt-get install libgstreamer-plugins-base1.0-dev”).

In order to run a performance test you have to call “sudo nvpmodel -m 0” (for setting mode 0 in this case), then “sudo ~/jetson_clocks.sh”, and then run the mandelbrot application. (The encoded video is written to the file /home/nvidia/Documents/mandelbrot.mp4.)
You can bypass the H.264 encoding by commenting out line #25 in main.cu and then recompiling.
mandelbrot.tar.gz (4.03 KB)

Hi there,
I can confirm the results for nvpmodel 0 and 1 (the ones I tried) on the TX2 dev kit.
I used the information in post #6 and the (nice) code ran straight away.
Could it be that for this particular task the Denver cores use up memory throughput?

I discovered this issue while developing a completely different application. The only commonality with this test app is the basic interplay of CUDA and GStreamer for rendering images and encoding them with the hardware H.264 encoder.

Yes, these Denver cores seem to reduce performance in this scenario.

In the meantime I also tested transcoding video clips using only this gst-launch command from the NVIDIA GStreamer guide:

gst-launch-1.0 filesrc location=1280x720_30p.mp4 ! qtdemux ! h264parse ! omxh264dec ! nvvidconv
! 'video/x-raw(memory:NVMM), width=(int)640, height=(int)480, format=(string)I420'
! omxh264enc ! qtmux ! filesink location=test.mp4 -e

The results also weren’t as expected: independent of the nvpmodel, the processing always took the same amount of time.
But when I removed “(memory:NVMM)” from this command, the performance drastically improved, and it then also depended on the nvpmodel in the way I would expect (Max-N gave the best performance). But as far as I know, NVIDIA has never disclosed any details about this NVMM memory type.

Hi mrjazz2,
We tried running mandelbrot alone and got the following result:

nvidia@tegra-ubuntu:/home/ubuntu/mandelbrot/Release$ ./mandelbrot
Inside NvxLiteH264DecoderLowLatencyInitNvxLiteH264DecoderLowLatencyInit set DPB and MjstreamingInside NvxLiteH265DecoderLowLatencyInitNvxLiteH265DecoderLowLatencyInit set DPB and MjstreamingFramerate set to : 0 at NvxVideoEncoderSetParameterNvMMLiteOpen : Block : BlockType = 4
===== MSENC =====
NvMMLiteBlockCreate : Block : BlockType = 4
NvH264MSEncInit: Frame rate overridden to 30 (frame_rate 0.000000)
===== MSENC blits (mode: 1) into tiled surfaces =====
1s, 26.9 fps (kernel: 0.097ms + sync: 11.155ms + commit: 0.019ms = 11.271ms)
2s, 23.9 fps (kernel: 0.096ms + sync: 11.327ms + commit: 0.020ms = 11.442ms)
3s, 24.0 fps (kernel: 0.099ms + sync: 11.440ms + commit: 0.019ms = 11.558ms)
4s, 24.0 fps (kernel: 0.130ms + sync: 11.776ms + commit: 0.023ms = 11.929ms)
5s, 24.1 fps (kernel: 0.094ms + sync: 11.696ms + commit: 0.021ms = 11.811ms)
6s, 24.0 fps (kernel: 0.094ms + sync: 11.814ms + commit: 0.021ms = 11.928ms)
7s, 24.0 fps (kernel: 0.095ms + sync: 11.943ms + commit: 0.021ms = 12.059ms)
8s, 24.0 fps (kernel: 0.097ms + sync: 12.093ms + commit: 0.019ms = 12.209ms)
9s, 24.0 fps (kernel: 0.095ms + sync: 12.220ms + commit: 0.020ms = 12.335ms)
10s, 24.1 fps (kernel: 0.116ms + sync: 12.383ms + commit: 0.020ms = 12.519ms)
11s, 24.1 fps (kernel: 0.114ms + sync: 12.546ms + commit: 0.019ms = 12.679ms)
12s, 24.0 fps (kernel: 0.094ms + sync: 12.648ms + commit: 0.018ms = 12.760ms)
Result after 300 frames: Rendering needed 12.37 seconds with an avg. frame rate of 24.25 fps

It looks like the nvpmodel setting and the encoding performance are independent.

The reason it cannot achieve 4K 60 fps is that there is a buffer copy in appsrc → omxh264enc. appsrc fills CPU buffers, but they have to be copied into DMA buffers (video/x-raw(memory:NVMM)) before being sent to the encoder.

We have verified 4kp60 encoding in
https://devtalk.nvidia.com/default/topic/1008984/jetson-tx2/video-encode-speed/post/5149942/#5149942
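
One way to experiment with where that copy happens is to let nvvidconv perform the system-memory → NVMM conversion explicitly in front of the encoder. The following is only a sketch (assuming nvvidconv on this L4T release accepts the system-memory I420 buffers coming from appsrc); the copy into DMA buffers still takes place, and whether it changes the timing would have to be measured:

		// Sketch: alternative gst_parse_launch() call for the GstCapture
		// constructor above, with an explicit nvvidconv stage that converts the
		// system-memory I420 buffers from appsrc into NVMM buffers before they
		// reach omxh264enc.
		pipeline = gst_parse_launch(
			"appsrc name=mysource"
			" ! nvvidconv"
			" ! video/x-raw(memory:NVMM), format=(string)I420"
			" ! omxh264enc name=myenc iframeinterval=60 profile=2 bitrate=30000000 control-rate=2"
			" ! qtmux ! filesink location=/home/nvidia/Documents/mandelbrot.mp4", &err);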

The output looks good. How exactly did you conclude that the nvpmodel setting doesn’t have an influence on encoding? And wouldn’t that be strange/unexpected, as the clock speeds of the CPUs as well as the GPU are changed? Did you run jetson_clocks.sh after each call to the nvpmodel command?

How can a single copy operation (less than 1 GB/s for 4K@60fps 8-bit I420, with the TX2’s RAM capable of 50+ GB/s) have such an impact on performance? And is there a way to get data generated by CUDA kernels into a GStreamer pipeline while avoiding this copy operation?
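
As a quick sanity check of the bandwidth figure quoted above (using the frame size from the test application):

	// One 4K I420 frame carries 12 bits per pixel, i.e. width * height * 3/2 bytes.
	constexpr double bytesPerFrame  = 3840.0 * 2160.0 * 3.0 / 2.0;	// ~12.44 MB per frame
	constexpr double bytesPerSecond = bytesPerFrame * 60.0;		// ~746 MB/s at 60 fps
	// Roughly 0.75 GB/s, i.e. below 1 GB/s and a small fraction of the TX2's
	// theoretical memory bandwidth.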

Hi mrjazz2,

We ran ./mandelbrot alone and got ~24 fps, which is close to the result of nvpmodel 0 + ./mandelbrot.

Please refer to
https://devtalk.nvidia.com/default/topic/1001636/jetson-tx1/tearing-in-gstreamer-captured-screen-video-with-opengl-full-screen-mode/post/5137203/#5137203
https://devtalk.nvidia.com/default/topic/1010929/jetson-tx1/nveglstreamsrc-does-not-receive-egldisplay-and-eglstreamkhr-parameters/post/5160967/#5160967

I don’t understand. Didn’t you repeat that test also with nvp-model 1, 2, 3 and 4? Is there something special about nvp-model 0?

You are proposing to use EGL, right? But how do I get my CUDA-generated I420 data into an EGLStream without any additional copy operations? Or rather, how can I gain a performance advantage by using EGL?

Hi,
We have clarified that appsrc → omxh264enc is not optimal. Please consider using the Multimedia APIs:
https://devtalk.nvidia.com/default/topic/1008984/jetson-tx2/video-encode-speed/post/5149942/#5149942
or cudaEglStreamProducer:
https://devtalk.nvidia.com/default/topic/1014452/