Latency issue: nvv4l2h265enc accumulates four images before releasing the first

I am using the nvv4l2h265enc element in the following GStreamer launch string:

appsrc name=mysource format=3 is-live=1 \
! video/x-raw(memory:NVMM),width=7200,height=6000,framerate=15/1,format=NV12  \
! nvv4l2h265enc \
    bitrate=8000000 \
    control-rate=0 \
    iframeinterval=2 \
    maxperf-enable=true \
    preset-level=UltraFastPreset \
    insert-vui=true \
    insert-sps-pps=1 \
    num-B-Frames=0 \
    idrinterval=1 \
    qp-range="-1,20:-1,20:-1,-1" \
! appsink name=mysink

My program attaches callbacks for need-data (appsrc) and new-sample (appsink). These callbacks print the time when a buffer is passed to GStreamer and when a new sample is received at the appsink.

I discovered that I need to push four images into the nvv4l2h265enc element before it releases the first encoded frame. In other words, my program has to service four need-data signals before the encoder produces its first new-sample signal.

At 15 fps, four frames of buffering amounts to roughly 267 ms of latency, which is a serious problem for the live-streaming application we are supporting. Is there any way to cut this buffering down? Does the encoder really need to accumulate so many images? I'm seeing this at many different resolutions and frame rates.
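
For reference, the timing instrumentation is wired roughly as in the sketch below (a minimal illustration; the function names are not my actual code):

// Minimal sketch of the timing callbacks described above; names are
// illustrative, not the actual application code.
#include <gst/gst.h>
#include <gst/app/gstappsink.h>

static void on_need_data (GstElement *appsrc, guint length, gpointer user_data)
{
    g_print ("need-data  at %" G_GINT64_FORMAT " us\n", g_get_monotonic_time ());
    /* ... build the next frame and push it with gst_app_src_push_buffer() ... */
}

static GstFlowReturn on_new_sample (GstElement *appsink, gpointer user_data)
{
    g_print ("new-sample at %" G_GINT64_FORMAT " us\n", g_get_monotonic_time ());
    GstSample *sample = gst_app_sink_pull_sample (GST_APP_SINK (appsink));
    if (sample)
        gst_sample_unref (sample);
    return GST_FLOW_OK;
}

/* Wiring (the appsink needs emit-signals=true for the signal variant):
 *   g_signal_connect (mysource, "need-data",  G_CALLBACK (on_need_data),  NULL);
 *   g_signal_connect (mysink,   "new-sample", G_CALLBACK (on_new_sample), NULL);
 */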

Hi,
We see the same behavior, and it is a constraint of the hardware encoder. 7200x6000 is a very large resolution (the height is even larger than that of standard 8K, 7680x4320). The large resolution increases latency.

We reproduced the behavior with this test sample. Generate one raw NV12 frame, then build and run a.cpp:

$ gst-launch-1.0 videotestsrc num-buffers=1 ! video/x-raw,format=NV12,width=7200,height=6000 ! filesink location=/home/nvidia/a.yuv
$ g++ -Wall -std=c++11 a.cpp -o test $(pkg-config --cflags --libs gstreamer-app-1.0)
$ ./test

a.cpp:
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <sstream>

#include <unistd.h>               // usleep()

#include <gst/gst.h>
#include <gst/app/gstappsrc.h>
#include <gst/app/gstappsink.h>


using namespace std;

#define USE(x) ((void)(x))

static GstPipeline *gst_pipeline = nullptr;
static string launch_string;
static GstElement *appsrc_;

static GstClockTime timestamp = 0;   // running PTS for pushed buffers
static int w = 7200;                 // input width
static int h = 6000;                 // input height
static void *ptr = nullptr;          // one raw NV12 frame loaded from disk

static void appsink_eos(GstAppSink * appsink, gpointer user_data)
{
    printf("app sink receive eos\n");
}

/* appsink "new-sample" callback: pull the encoded frame and print its size. */
static GstFlowReturn new_buffer(GstAppSink *appsink, gpointer user_data)
{
    GstSample *sample = NULL;

    g_signal_emit_by_name (appsink, "pull-sample", &sample, NULL);

    if (sample)
    {
        GstBuffer *buffer = NULL;
        GstMapInfo map    = {0};

        if (!gst_sample_get_caps (sample))
            printf("could not get snapshot format\n");

        buffer = gst_sample_get_buffer (sample);
        gst_buffer_map (buffer, &map, GST_MAP_READ);

        /* map.size is the size of the encoded bitstream for this frame */
        printf("map.size = %lu\n", map.size);

        gst_buffer_unmap(buffer, &map);
        gst_sample_unref (sample);
    }
    else
    {
        g_print ("could not make snapshot\n");
    }

    return GST_FLOW_OK;
}

/* Copy the pre-loaded NV12 frame into a fresh buffer and push it to appsrc.
 * The PTS advances by ~66.7 ms per buffer, i.e. the source is fed at about
 * 15 fps even though the caps advertise 30/1. */
static gboolean feed_function(gpointer user_data) {
    GstBuffer *buffer;
    guint size;
    GstFlowReturn ret;
    GstMapInfo map = {0};

    size = (w*h*3)/2;                          // NV12 is 1.5 bytes per pixel
    buffer = gst_buffer_new_allocate (NULL, size, NULL);
    GST_BUFFER_PTS (buffer) = timestamp;

    gst_buffer_map (buffer, &map, GST_MAP_WRITE);
    memcpy(map.data, ptr, size);
    gst_buffer_unmap(buffer, &map);

    g_signal_emit_by_name (appsrc_, "push-buffer", buffer, &ret);
    gst_buffer_unref(buffer);

    timestamp += 66666666;                     // ~66.7 ms in nanoseconds
    printf("fed one buffer \n");
    return G_SOURCE_CONTINUE;
}

int main(int argc, char** argv) {
    USE(argc);
    USE(argv);

    gst_init (&argc, &argv);

    GMainLoop *main_loop;
    main_loop = g_main_loop_new (NULL, FALSE);
    ostringstream launch_stream;
    GstAppSinkCallbacks callbacks = {appsink_eos, NULL, new_buffer};

    launch_stream
    << "appsrc name=mysource ! "
    << "video/x-raw,width="<< w <<",height="<< h <<",framerate=30/1,format=NV12 ! "
    << "nvvidconv ! video/x-raw(memory:NVMM),format=NV12 ! "
    << "nvv4l2h265enc maxperf-enable=1 ! appsink name=mysink ";

    launch_string = launch_stream.str();

    g_print("Using launch string: %s\n", launch_string.c_str());

    GError *error = nullptr;
    gst_pipeline  = (GstPipeline*) gst_parse_launch(launch_string.c_str(), &error);

    if (gst_pipeline == nullptr) {
        g_print( "Failed to parse launch: %s\n", error->message);
        return -1;
    }
    if(error) g_error_free(error);

    appsrc_ = gst_bin_get_by_name(GST_BIN(gst_pipeline), "mysource");
    gst_app_src_set_stream_type(GST_APP_SRC(appsrc_), GST_APP_STREAM_TYPE_STREAM);

    guint size = (w*h*3)/2;
    FILE *fp = fopen ("/home/nvidia/a.yuv", "rb");
    if (fp == NULL) {
        g_print("could not open /home/nvidia/a.yuv\n");
        return -1;
    }
    ptr = malloc(size);
    if (fread(ptr, size, 1, fp) != 1)          // load one raw NV12 frame
        g_print("short read on /home/nvidia/a.yuv\n");
    fclose(fp);
    
    GstElement *appsink_ = gst_bin_get_by_name(GST_BIN(gst_pipeline), "mysink");
    gst_app_sink_set_callbacks (GST_APP_SINK(appsink_), &callbacks, NULL, NULL);

    gst_element_set_state((GstElement*)gst_pipeline, GST_STATE_PLAYING); 

    /* Feed 15 frames at ~15 fps (one every 66.666 ms), then tear down. */
    for (int i=0; i<15; i++) {
        feed_function(nullptr);
        usleep(66666);
    }

    gst_element_set_state((GstElement*)gst_pipeline, GST_STATE_NULL);
    gst_object_unref(GST_OBJECT(gst_pipeline));
    g_main_loop_unref(main_loop);

    free(ptr);
    g_print("going to exit \n");
    return 0;
}

Thanks for the confirmation. I agree that the resolution we are trying is huge. We are willing to consider lower resolutions for our application.

Three questions:

  • The data we pass to the encoder is already in NV12 format, in a buffer created by NvBufferCreate (see the sketch after this list) - can this help?
  • I understand that reducing the number of pixels will reduce the per-frame processing time, but would it also decrease the number of accumulated frames? My observations say no, but if you know of a way, I am very interested in hearing it.
  • Is this constraint present in the nvv4l2vp9enc encoder? Would it support such resolutions?
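
For context on the first question, the allocation referred to is roughly the following sketch (assuming nvbuf_utils.h from the Jetson Multimedia API; names and error handling kept minimal):

// Sketch of an NvBufferCreate allocation of an NV12 surface in NVMM memory.
// Build against the Jetson Multimedia API headers; error handling elided.
#include <nvbuf_utils.h>

int make_nv12_dmabuf(int width, int height)
{
    int dmabuf_fd = -1;
    if (NvBufferCreate(&dmabuf_fd, width, height,
                       NvBufferLayout_Pitch,
                       NvBufferColorFormat_NV12) != 0)
        return -1;
    return dmabuf_fd;  // release later with NvBufferDestroy(dmabuf_fd)
}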

Hi,

We don’t see much improvement from this method; the resolution is the key factor.

We don’t see the accumulation with 1080p30 input, and we see around a 2-frame latency with 4Kp30 input. In our tests, resolution dominates the latency.

No. Resolutions above 4K only work with H.265 encoding, so nvv4l2vp9enc would not support them.

Do you have any information or math about how the resolution determines the number of buffers that must be accumulated? I tried digging into the drivers, but there are many levels of indirection, and if I am not mistaken it ultimately disappears into NVIDIA proprietary code. Is it related to the VIDIOC_REQBUFS ioctl?
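
(VIDIOC_REQBUFS is where a V4L2 driver can impose its minimum buffer count: the driver may round the requested count up and writes the granted number back into the same struct. A hypothetical probe is below; the device node, and whether the Jetson encoder accepts raw ioctls without NVIDIA's libv4l2 plugin, are my assumptions.)

// Hypothetical V4L2 probe: VIDIOC_REQBUFS lets the driver round the
// requested buffer count up to its internal minimum and report the
// granted number back in req.count.
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/videodev2.h>

int main(int argc, char **argv)
{
    // Device path is an assumption; the encoder node is normally driven
    // through NVIDIA's libv4l2 plugin rather than raw ioctls.
    const char *dev = (argc > 1) ? argv[1] : "/dev/nvhost-msenc";
    int fd = open(dev, O_RDWR);
    if (fd < 0) { perror("open"); return 1; }

    struct v4l2_requestbuffers req;
    memset(&req, 0, sizeof(req));
    req.count  = 1;                                   // ask for one buffer
    req.type   = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;  // encoded-bitstream queue
    req.memory = V4L2_MEMORY_MMAP;

    if (ioctl(fd, VIDIOC_REQBUFS, &req) == 0)
        printf("driver granted %u buffers\n", req.count);
    else
        perror("VIDIOC_REQBUFS");

    close(fd);
    return 0;
}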

By the way, the MeasureEncoderLatency property of nvv4l2h265enc does not account for the buffer accumulation. In my scenario it reports roughly 60 ms for 7200x6000; I am not sure what this value represents.
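
(For reference, I enable it like any other element property; this is a standalone sketch rather than my actual pipeline:)

$ gst-launch-1.0 videotestsrc is-live=1 ! 'video/x-raw,width=7200,height=6000,framerate=15/1,format=NV12' ! nvvidconv ! 'video/x-raw(memory:NVMM),format=NV12' ! nvv4l2h265enc MeasureEncoderLatency=1 ! fakesink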

Finally, I am seeing "NvENC: bBlitmode set to true" in my program output. Is this a source of latency?

Hi,
Running the test app, we see that the encoder accumulates buffers at the beginning and then catches up with the frame rate:

nvidia@nvidia-desktop:~$ ./test
Using launch string: appsrc name=mysource ! video/x-raw,width=7200,height=6000,framerate=30/1,format=NV12 ! nvvidconv ! video/x-raw(memory:NVMM),format=NV12 ! nvv4l2h265enc maxperf-enable=1 ! appsink name=mysink
Opening in BLOCKING MODE
fed one buffer
NvMMLiteOpen : Block : BlockType = 8
===== NVMEDIA: NVENC =====
NvMMLiteBlockCreate : Block : BlockType = 8
fed one buffer
fed one buffer
NVMEDIA: H265 : Profile : 1
fed one buffer
fed one buffer
map.size = 120946
map.size = 81944
fed one buffer
map.size = 52171
fed one buffer
map.size = 46475
fed one buffer
map.size = 20833
fed one buffer
map.size = 13637
fed one buffer
map.size = 11633
fed one buffer
map.size = 10926    // feed one buffer and
map.size = 1219     // receive two encoded frames 
fed one buffer
map.size = 1815
fed one buffer
map.size = 1210
fed one buffer
map.size = 6003
fed one buffer
map.size = 42782
going to exit

So the buffer accumulation happens at the beginning, due to the large resolution, and the encoder is then up and running, keeping up with the source frame rate. The verified cases are listed in the module data sheet:
https://developer.nvidia.com/jetson-xavier-nx-data-sheet

The verified use case is 4Kp60. 8K is not tested, but it should be able to achieve 30 fps. 60 ms looks to be the average encoding time for 7200x6000 (it takes about 60 ms to encode one 7200x6000 frame, i.e. a sustained rate of roughly 16-17 fps).

For a single input source this should not add latency. If you are concerned, please try running the hardware converter (VIC) fixed at its maximum clock:
Nvvideoconvert issue, nvvideoconvert in DS4 is better than Ds5? - #3 by DaneLLL
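
(The recipe in the linked post amounts to locking the VIC clock at its maximum rate via debugfs; the paths below are for the Xavier family and may differ on other modules:)

$ sudo su
# echo 1 > /sys/kernel/debug/bpmp/debug/clk/vic/mrq_rate_locked
# cat /sys/kernel/debug/bpmp/debug/clk/vic/max_rate > /sys/kernel/debug/bpmp/debug/clk/vic/rate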

That’s good to know. We’re trying to mitigate the issue by pushing the frame rate up to let the encoder catch up, but we’re capturing from four 4032x3040 sources and compositing them, so the system load is higher in our case. Right now we don’t see it catch up. It’s a lot more lightweight when you just grab buffers from Argus and push them straight to nvvidconv.

By the way, we were hoping the Orin would help us, but its encoding capabilities have been reduced compared to the Xavier, so that’s a significant step back for us.

Ultimately, we don’t need to use the encoder, but we do need to output at high resolutions (8K would be acceptable). We will have to investigate alternative approaches (DisplayPort? a high-speed transceiver? SDI output?).