Question about V4L2 API for encode of TX1

Hi, all

Hardware: Tegra X1
System version: Linux For Tegra R24.2.1

I found that R24.2 was released with a new L4T Multimedia API. It has V4L2 APIs for encode, decode, scaling, and other media functions.

I am very interested in it, so I did some testing with the example 03_video_cuda_enc. I found that output_plane.qBuffer takes a long time (about 10 ms), while output_plane.dqBuffer takes almost no time (about 50 us). I am confused by this. Shouldn't it be output_plane.dqBuffer that waits for the buffer to be returned from the encoder?

So I added some debug output to the code:

diff --git a/samples/03_video_cuda_enc/video_cuda_enc_main.cpp b/samples/03_video_cuda_enc/video_cuda_enc_main.cpp
index 537ec19..d89dd01 100644
--- a/samples/03_video_cuda_enc/video_cuda_enc_main.cpp
+++ b/samples/03_video_cuda_enc/video_cuda_enc_main.cpp
@@ -33,6 +33,10 @@
 #include <linux/videodev2.h>
 #include <malloc.h>
 #include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>

 #include "nvbuf_utils.h"
 #include "NvCudaProc.h"
@@ -45,6 +49,8 @@

 using namespace std;

+struct timespec ts1, ts2, ts3, ts4, ts5;
+
 static void
 abort(context_t *ctx)
 {
@@ -73,6 +79,9 @@ encoder_capture_plane_dq_callback(struct v4l2_buffer *v4l2_buf, NvBuffer * buffe
         return false;
     }

+    clock_gettime(CLOCK_MONOTONIC, &ts5);
+    printf("encoder_capture_plane_dq_callback index=%d ts5 time=%ld\n", v4l2_buf->index, ts5.tv_sec * 1000000000 + ts5.tv_nsec);
+
     write_encoder_output_frame(ctx->out_file, buffer);

     if (enc->capture_plane.qBuffer(*v4l2_buf, NULL) < 0)
@@ -108,6 +117,7 @@ main(int argc, char *argv[])
     int ret = 0;
     int error = 0;
     bool eos = false;
+    int frame_count = 10;

     set_defaults(&ctx);

@@ -157,7 +167,8 @@ main(int argc, char *argv[])

     if (ctx.encoder_pixfmt == V4L2_PIX_FMT_H264)
     {
-        ret = ctx.enc->setProfile(V4L2_MPEG_VIDEO_H264_PROFILE_HIGH);
+        //ret = ctx.enc->setProfile(V4L2_MPEG_VIDEO_H264_PROFILE_HIGH);
+        ret = ctx.enc->setProfile(V4L2_MPEG_VIDEO_H264_PROFILE_BASELINE);
     }
     else
     {
@@ -176,12 +187,12 @@ main(int argc, char *argv[])

     // Query, Export and Map the output plane buffers so that we can read
     // raw data into the buffers
-    ret = ctx.enc->output_plane.setupPlane(V4L2_MEMORY_MMAP, 10, true, false);
+    ret = ctx.enc->output_plane.setupPlane(V4L2_MEMORY_MMAP, 4, true, false);
     TEST_ERROR(ret < 0, "Could not setup output plane", cleanup);

     // Query, Export and Map the output plane buffers so that we can write
     // encoded data from the buffers
-    ret = ctx.enc->capture_plane.setupPlane(V4L2_MEMORY_MMAP, 10, true, false);
+    ret = ctx.enc->capture_plane.setupPlane(V4L2_MEMORY_MMAP, 4, true, false);
     TEST_ERROR(ret < 0, "Could not setup capture plane", cleanup);

     // output plane STREAMON
@@ -245,13 +256,18 @@ main(int argc, char *argv[])
         HandleEGLImage(&ctx.eglimg);
         NvDestroyEGLImage(ctx.eglDisplay, ctx.eglimg);

+       clock_gettime(CLOCK_MONOTONIC, &ts3);
+       printf("before output_plane.qBuffer index=%d ts3 time=%ld\n", v4l2_buf.index, ts3.tv_sec * 1000000000 + ts3.tv_nsec);
         ret = ctx.enc->output_plane.qBuffer(v4l2_buf, NULL);
+       clock_gettime(CLOCK_MONOTONIC, &ts4);
+       printf("after output_plane.qBuffer index=%d ts4 time=%ld\n", v4l2_buf.index, ts4.tv_sec * 1000000000 + ts4.tv_nsec);
         if (ret < 0)
         {
             cerr << "Error while queueing buffer at output plane" << endl;
             abort(&ctx);
             goto cleanup;
         }
+       printf("\n");

         if (v4l2_buf.m.planes[0].bytesused == 0)
         {
@@ -273,12 +289,16 @@ main(int argc, char *argv[])

         v4l2_buf.m.planes = planes;

+       clock_gettime(CLOCK_MONOTONIC, &ts1);
+       printf("before output_plane.dqBuffer index=%d ts1 time=%ld\n", v4l2_buf.index, ts1.tv_sec * 1000000000 + ts1.tv_nsec);
         if (ctx.enc->output_plane.dqBuffer(v4l2_buf, &buffer, NULL, 10) < 0)
         {
             cerr << "ERROR while DQing buffer at output plane" << endl;
             abort(&ctx);
             goto cleanup;
         }
+       clock_gettime(CLOCK_MONOTONIC, &ts2);
+       printf("after output_plane.dqBuffer index=%d ts2 time=%ld\n", v4l2_buf.index, ts2.tv_sec * 1000000000 + ts2.tv_nsec);

         if (read_video_frame(ctx.in_file, *buffer) < 0)
         {
@@ -290,13 +310,25 @@ main(int argc, char *argv[])
         HandleEGLImage(&ctx.eglimg);
         NvDestroyEGLImage(ctx.eglDisplay, ctx.eglimg);

+       clock_gettime(CLOCK_MONOTONIC, &ts3);
+       printf("before output_plane.qBuffer index=%d ts3 time=%ld\n", v4l2_buf.index, ts3.tv_sec * 1000000000 + ts3.tv_nsec);
         ret = ctx.enc->output_plane.qBuffer(v4l2_buf, NULL);
+       clock_gettime(CLOCK_MONOTONIC, &ts4);
+       printf("after output_plane.qBuffer index=%d ts4 time=%ld\n", v4l2_buf.index, ts4.tv_sec * 1000000000 + ts4.tv_nsec);
         if (ret < 0)
         {
             cerr << "Error while queueing buffer at output plane" << endl;
             abort(&ctx);
             goto cleanup;
         }
+       printf("\n");
+
+       frame_count--;
+       if (frame_count == 0) {
+           cout << "sleep and exit" << endl;
+           usleep(2000000);
+           break;
+       }

         if (v4l2_buf.m.planes[0].bytesused == 0)
         {
diff --git a/samples/common/classes/NvUtils.cpp b/samples/common/classes/NvUtils.cpp
index 9a994a2..c36111d 100644
--- a/samples/common/classes/NvUtils.cpp
+++ b/samples/common/classes/NvUtils.cpp
@@ -46,9 +46,11 @@ read_video_frame(std::ifstream * stream, NvBuffer & buffer)
         plane.bytesused = 0;
         for (j = 0; j < plane.fmt.height; j++)
         {
+/*
             stream->read(data, bytes_to_read);
             if (stream->gcount() < bytes_to_read)
                 return -1;
+*/
             data += plane.fmt.stride;
         }
         plane.bytesused = plane.fmt.stride * plane.fmt.height;

Then, the output:

Failed to query video capabilities: Bad address
NvMMLiteOpen : Block : BlockType = 4
===== MSENC =====
NvMMLiteBlockCreate : Block : BlockType = 4
875967048
842091865
before output_plane.qBuffer index=0 ts3 time=20736071424891		// output_plane.qBuffer frame 0 start
===== MSENC blits (mode: 1) into tiled surfaces =====
encoder_capture_plane_dq_callback index=0 ts5 time=20736081566684	// encoded frame 0(SPS+PPS+IDR_SLICE) output, encoder take 10141793(ts5 - ts3) ns
after output_plane.qBuffer index=0 ts4 time=20736081663761		// output_plane.qBuffer frame 0 end, qBuffer take 10238870(ts4 - ts3) ns

before output_plane.qBuffer index=1 ts3 time=20736082638276		// output_plane.qBuffer frame 1 start
after output_plane.qBuffer index=1 ts4 time=20736091488806		// output_plane.qBuffer frame 1 end, qBuffer take 8850530(ts4 - ts3) ns

encoder_capture_plane_dq_callback index=1 ts5 time=20736091697698	// encoded frame 1(SLICE) output, encoder take 9059422(ts5 - ts3) ns
before output_plane.qBuffer index=2 ts3 time=20736092426032		// output_plane.qBuffer frame 2 start
encoder_capture_plane_dq_callback index=2 ts5 time=20736101379003	// encoded frame 2(SLICE) output, encoder take 8952971(ts5 - ts3) ns
after output_plane.qBuffer index=2 ts4 time=20736101417854		// output_plane.qBuffer frame 2 end, qBuffer take 8991822(ts4 - ts3) ns

before output_plane.qBuffer index=3 ts3 time=20736102331905
encoder_capture_plane_dq_callback index=3 ts5 time=20736111248889
after output_plane.qBuffer index=3 ts4 time=20736111286178

before output_plane.qBuffer index=4 ts3 time=20736112202885
encoder_capture_plane_dq_callback index=4 ts5 time=20736120931808
after output_plane.qBuffer index=4 ts4 time=20736120968108

before output_plane.qBuffer index=5 ts3 time=20736121883513
encoder_capture_plane_dq_callback index=5 ts5 time=20736130510724
after output_plane.qBuffer index=5 ts4 time=20736130547545

before output_plane.qBuffer index=6 ts3 time=20736131477584
after output_plane.qBuffer index=6 ts4 time=20736140275826

encoder_capture_plane_dq_callback index=6 ts5 time=20736140400765
before output_plane.qBuffer index=7 ts3 time=20736141196960
encoder_capture_plane_dq_callback index=7 ts5 time=20736149853388
after output_plane.qBuffer index=7 ts4 time=20736149890104

before output_plane.qBuffer index=8 ts3 time=20736150908263
encoder_capture_plane_dq_callback index=8 ts5 time=20736159592189
after output_plane.qBuffer index=8 ts4 time=20736159629895

before output_plane.qBuffer index=9 ts3 time=20736160603004
encoder_capture_plane_dq_callback index=9 ts5 time=20736169510249
after output_plane.qBuffer index=9 ts4 time=20736169544621

before output_plane.dqBuffer index=0 ts1 time=20736169575505		// output_plane.dqBuffer frame 0 start
after output_plane.dqBuffer index=0 ts2 time=20736169611179		// output_plane.dqBuffer frame 0 end, dqBuffer take 35674(ts2 - ts1) ns
before output_plane.qBuffer index=0 ts3 time=20736170246552
encoder_capture_plane_dq_callback index=0 ts5 time=20736178912875
after output_plane.qBuffer index=0 ts4 time=20736178943186

before output_plane.dqBuffer index=0 ts1 time=20736178984693
after output_plane.dqBuffer index=1 ts2 time=20736179003806
before output_plane.qBuffer index=1 ts3 time=20736179639752
encoder_capture_plane_dq_callback index=1 ts5 time=20736188261599
after output_plane.qBuffer index=1 ts4 time=20736188291806

before output_plane.dqBuffer index=0 ts1 time=20736188332219
after output_plane.dqBuffer index=2 ts2 time=20736188351645
before output_plane.qBuffer index=2 ts3 time=20736188944885
encoder_capture_plane_dq_callback index=2 ts5 time=20736197578867
after output_plane.qBuffer index=2 ts4 time=20736197608136

before output_plane.dqBuffer index=0 ts1 time=20736197639853
after output_plane.dqBuffer index=3 ts2 time=20736197665267
before output_plane.qBuffer index=3 ts3 time=20736198262257
encoder_capture_plane_dq_callback index=3 ts5 time=20736206880615
after output_plane.qBuffer index=3 ts4 time=20736206909415

before output_plane.dqBuffer index=0 ts1 time=20736206949881
after output_plane.dqBuffer index=4 ts2 time=20736206975921
before output_plane.qBuffer index=4 ts3 time=20736207558381
encoder_capture_plane_dq_callback index=4 ts5 time=20736216185697
after output_plane.qBuffer index=4 ts4 time=20736216214965

before output_plane.dqBuffer index=0 ts1 time=20736216246265
after output_plane.dqBuffer index=5 ts2 time=20736216271576
before output_plane.qBuffer index=5 ts3 time=20736216948769
encoder_capture_plane_dq_callback index=5 ts5 time=20736225559419
after output_plane.qBuffer index=5 ts4 time=20736225589261

before output_plane.dqBuffer index=0 ts1 time=20736225620717
after output_plane.dqBuffer index=6 ts2 time=20736225646757
before output_plane.qBuffer index=6 ts3 time=20736226286452
after output_plane.qBuffer index=6 ts4 time=20736234854866

before output_plane.dqBuffer index=0 ts1 time=20736234900123
after output_plane.dqBuffer index=7 ts2 time=20736234923194
encoder_capture_plane_dq_callback index=6 ts5 time=20736235002980
before output_plane.qBuffer index=7 ts3 time=20736235495550
encoder_capture_plane_dq_callback index=7 ts5 time=20736244094483
after output_plane.qBuffer index=7 ts4 time=20736244172602

before output_plane.dqBuffer index=0 ts1 time=20736244214214
after output_plane.dqBuffer index=8 ts2 time=20736244233327
before output_plane.qBuffer index=8 ts3 time=20736244862762
encoder_capture_plane_dq_callback index=8 ts5 time=20736253500546
after output_plane.qBuffer index=8 ts4 time=20736253531690

before output_plane.dqBuffer index=0 ts1 time=20736253563980
after output_plane.dqBuffer index=9 ts2 time=20736253588874
before output_plane.qBuffer index=9 ts3 time=20736254303616
encoder_capture_plane_dq_callback index=9 ts5 time=20736262929682
after output_plane.qBuffer index=9 ts4 time=20736262959211

sleep and exit
App run was successful

Is this a bug?
Or
does the VideoEncoder’s qBuffer work in synchronous mode, waiting for the encoding of the frame to complete?
If so, I am afraid the buffer pool serves no purpose.

Another question:
Why was the post I made from another account hidden, so that others can’t see it?
Here is my post thread:
https://devtalk.nvidia.com/default/topic/987024/jetson-tx1/question-about-v4l2-api-for-encode-of-tx1/

Locking this thread, as the issue is already being discussed in the previous thread:
https://devtalk.nvidia.com/default/topic/987024/jetson-tx1/question-about-v4l2-api-for-encode-of-tx1/