High CPU usage while doing asynchronous readback

Hello,

I want to capture the screen pixels of a QML Qt Quick Controls application on the NVIDIA Xavier NX platform by using native OpenGL functions, since I will use the same functionality on Android as well. I have found that I can implement asynchronous readback by using the glReadPixels function. My requirement is to get 16-bit RGB color pixels. For this reason I have created 2 different algorithms, both of which cause high CPU usage. The following are the implementations and details for CPU usage, read time and process time:

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
Scenario-1:

// Lazily creates the two pixel-pack buffer objects (PBOs) used for
// ping-pong asynchronous readback. Safe to call every frame; only the
// first call does any work (guarded by buffCreated).
//
// Side effects: sets pbo_size (RGB565 => 2 bytes/pixel), allocates the
// CPU-side `pixels` staging buffer, forces 1-byte pack/unpack alignment,
// and records extension support in `pboSupported`.
//
// Fix vs. original: the GL_ARB_pixel_buffer_object check is now done
// BEFORE glGenBuffers/glBufferData, so unsupported hardware no longer
// allocates GPU buffer storage that the fallback path never touches.
void WaylandEgl::createPixelBO()
{
    if (buffCreated)
        return;

    // RGB565 -> 2 bytes per pixel.
    pbo_size = mWinHeight * mWinWidth * 2;
    pixels = new unsigned char[pbo_size];

    // Tightly packed rows: without alignment 1, odd-width 565 rows would
    // be padded and the readback size would not match pbo_size.
    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
    glPixelStorei(GL_PACK_ALIGNMENT, 1);

    buffCreated = true;

    glInfo glInfo;
    glInfo.getInfo();
    glInfo.printSelf();

    if (!glInfo.isExtensionSupported("GL_ARB_pixel_buffer_object"))
    {
        qDebug() << "Video card does NOT support GL_ARB_pixel_buffer_object.";
        pboSupported = false;
        return; // caller falls back to synchronous glReadPixels into `pixels`
    }

    qDebug() << "Video card supports GL_ARB_pixel_buffer_object.";
    pboSupported = true;

    // Two PBOs so one frame's readback can DMA while the previous
    // frame's buffer is mapped and copied on the CPU.
    glGenBuffers(PBO_COUNT, pboIds);
    glBindBuffer(GL_PIXEL_PACK_BUFFER, pboIds[0]);
    glBufferData(GL_PIXEL_PACK_BUFFER, pbo_size, 0, GL_STREAM_READ);
    glBindBuffer(GL_PIXEL_PACK_BUFFER, pboIds[1]);
    glBufferData(GL_PIXEL_PACK_BUFFER, pbo_size, 0, GL_STREAM_READ);
    glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
}

// Per-frame readback entry point. Ping-pongs between the two PBOs:
// issues an asynchronous glReadPixels into pboIds[index] for THIS frame
// and maps/copies the transfer started LAST frame from pboIds[nextIndex],
// overlapping GPU DMA with the CPU-side memcpy. Falls back to a blocking
// glReadPixels when PBOs are unsupported.
//
// Side effects: fills `pixels` with pbo_size bytes of RGB565 data and
// updates the readTime / processTime members.
//
// Fix vs. original: the unconditional per-frame memset(pixels, 0, pbo_size)
// was pure wasted CPU -- on every successful path `pixels` is completely
// overwritten (by memcpy or by the blocking glReadPixels). We now zero it
// only when mapping fails, so consumers never see a stale frame.
void WaylandEgl::runPixelBO()
{
    static int index = 0;
    index = (index + 1) % 2;               // PBO receiving this frame's readback
    const int nextIndex = (index + 1) % 2; // PBO filled on the previous frame

    createPixelBO();
    glReadBuffer(GL_FRONT);

    if (pboSupported)
    {
        t1.start();

        // NULL data pointer => the pixels go into the bound PBO and the
        // call returns without waiting for the transfer to finish.
        glBindBuffer(GL_PIXEL_PACK_BUFFER, pboIds[index]);
        glReadPixels(0, 0, mWinWidth, mWinHeight, GL_RGB, GL_UNSIGNED_SHORT_5_6_5, 0);

        t1.stop();
        readTime = t1.getElapsedTimeInMilliSec();

        t1.start();

        // Map the buffer written one frame ago; its DMA should be complete
        // by now, so this map should not stall the pipeline.
        glBindBuffer(GL_PIXEL_PACK_BUFFER, pboIds[nextIndex]);
        GLubyte *ptr = (GLubyte*)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, pbo_size, GL_MAP_READ_BIT);

        if (ptr)
        {
            memcpy(pixels, ptr, pbo_size);
            glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
        }
        else
        {
            qDebug() << "NULL ptr";
            // Mapping failed: clear the staging buffer so callers do not
            // consume the previous frame's data as if it were new.
            memset(pixels, 0, pbo_size);
        }
        t1.stop();
        processTime = t1.getElapsedTimeInMilliSec();
        glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
    }
    else
    {
        t1.start();

        // Synchronous fallback: blocks until the framebuffer is read back
        // directly into client memory.
        glReadPixels(0, 0, mWinWidth, mWinHeight, GL_RGB, GL_UNSIGNED_SHORT_5_6_5, pixels);

        t1.stop();
        readTime = t1.getElapsedTimeInMilliSec();

        // No separate CPU-side processing step on this path.
        processTime = 0;
    }

    qDebug() << "Read Time " << readTime;
    qDebug() << "Process Time " << processTime;
}

Results:

PBO ON:
CPU: 28-32%
Read Time 5.065 ms
Process Time 0.334 ms

PBO OFF:
CPU:24-27%
Read Time 5.26 ms
Process Time 0 ms
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
Scenario-2

// One-time setup for the NVIDIA "fast readback" scheme used by
// doReadbackFAST(): buffer #0 is the glReadPixels target, buffer #1 is a
// GL_COPY_WRITE_BUFFER intermediate, and both are made resident via the
// NV bindless extensions so the driver keeps them in fast memory.
// Resolves the required NV/ARB entry points through eglGetProcAddress
// and bails out (leaving buffCreated false) if any is missing.
//
// NOTE(review): Readback_buf is malloc'ed here and `pixels` is new'ed but
// neither is visibly freed in this snippet -- confirm cleanup elsewhere.
//
// Fix vs. original: corrected the misspelled debug strings
// ("fouynded" -> "found", "optimizatiosn" -> "optimizations").
void WaylandEgl::initFastBuffers()
{
    if (!buffCreated)
    {
        pbo_size = mWinHeight * mWinWidth * 2; // RGB565 -> 2 bytes/pixel
        pixels = new unsigned char[pbo_size];
        Readback_buf = (GLchar *) malloc( pbo_size );

        glGenBuffers( PBO_COUNT, pboIds );

        // Buffer #0: glReadPixels target
        GLenum target = GL_PIXEL_PACK_BUFFER;

        glBindBuffer( target, pboIds[0] );
        glBufferData( target, pbo_size, 0, GL_STATIC_COPY );

        // Resolve the NV bindless-buffer and buffer-query entry points;
        // they are extensions and not guaranteed to exist.
        glGetBufferParameterui64vNV = (PFNGLGETBUFFERPARAMETERUI64VNVPROC)eglGetProcAddress("glGetBufferParameterui64vNV");
        if (!glGetBufferParameterui64vNV)
        {
            qDebug() << "glGetBufferParameterui64vNV not found!";
            return;
        }

        glMakeBufferResidentNV = (PFNGLMAKEBUFFERRESIDENTNVPROC)eglGetProcAddress("glMakeBufferResidentNV");
        if (!glMakeBufferResidentNV)
        {
            qDebug() << "glMakeBufferResidentNV not found!";
            return;
        }

        glUnmapBufferARB = (PFNGLUNMAPBUFFERARBPROC)eglGetProcAddress("glUnmapBufferARB");
        if (!glUnmapBufferARB)
        {
            qDebug() << "glUnmapBufferARB not found!";
            return;
        }

        glGetBufferSubData = (PFNGLGETBUFFERSUBDATAPROC)eglGetProcAddress("glGetBufferSubData");
        if (!glGetBufferSubData)
        {
            qDebug() << "glGetBufferSubData not found!";
            return;
        }

        qDebug() << "Run the optimizations";

        // Query the GPU address (required by the bindless workflow) and
        // make buffer #0 resident for reading.
        GLuint64EXT addr;
        glGetBufferParameterui64vNV( target, GL_BUFFER_GPU_ADDRESS_NV, &addr );
        glMakeBufferResidentNV( target, GL_READ_ONLY );

        // Buffer #1: glCopyBufferSubData destination / CPU readback source
        target = GL_COPY_WRITE_BUFFER;
        glBindBuffer( target, pboIds[1] );
        glBufferData( target, pbo_size, 0, GL_STREAM_READ );

        // Touch the buffer once (map/unmap) so the driver commits its
        // storage before it is made resident.
        glMapBufferRange( target, 0, 1, GL_MAP_WRITE_BIT);
        glUnmapBufferARB( target );
        glGetBufferParameterui64vNV( target, GL_BUFFER_GPU_ADDRESS_NV, &addr );
        glMakeBufferResidentNV     ( target, GL_READ_ONLY );
        buffCreated = true;
        glPixelStorei( GL_PACK_ALIGNMENT, 1 ); // tight rows for RGB565 readback
    }
}

// Reads back the current framebuffer as RGB565 using the two-buffer
// "fast readback" scheme: asynchronous glReadPixels into resident
// buffer #0, a GPU-side glCopyBufferSubData into buffer #1, then a
// blocking glGetBufferSubData from buffer #1 into CPU memory. The
// intermediate copy works around driver-throttled readback on
// GeForce-class hardware.
//
// Side effects: lazily creates the buffers (initFastBuffers), updates
// the readTime / processTime members, and fills Readback_buf with
// pbo_size bytes. Assumes a current GL context and that the extension
// function pointers resolved in initFastBuffers are valid -- TODO
// confirm callers never reach here when initFastBuffers bailed out.
void WaylandEgl::doReadbackFAST()
{
// Work-around for NVidia driver readback crippling on GeForce.

initFastBuffers();

//glFinish();
Timer t1;
t1.start();
// Issue an asynchronous color readback into BUF OBJ #0.
// NOTE(review): the original comment said "depth readback", but the
// format is GL_RGB / GL_UNSIGNED_SHORT_5_6_5, i.e. 16-bit color.
glBindBuffer( GL_PIXEL_PACK_BUFFER, pboIds[0] );

glReadPixels( 0, 0, mWinWidth, mWinHeight,
              GL_RGB, GL_UNSIGNED_SHORT_5_6_5, 0 );
t1.stop();
readTime = t1.getElapsedTimeInMilliSec();

t1.start();
// Copy from BUF OBJ #0 to BUF OBJ #1 entirely on the GPU.
glBindBuffer( GL_COPY_WRITE_BUFFER, pboIds[1] );
glCopyBufferSubData( GL_PIXEL_PACK_BUFFER, GL_COPY_WRITE_BUFFER, 0, 0,
                     pbo_size );

// Do the readback from BUF OBJ #1 to app CPU memory; this call blocks
// until the copy above (and the readback feeding it) has completed.
glGetBufferSubData( GL_COPY_WRITE_BUFFER, 0, pbo_size,
                    Readback_buf );

//sendImage((unsigned char*)Readback_buf,pbo_size);
t1.stop();
processTime = t1.getElapsedTimeInMilliSec();
glBindBuffer( GL_PIXEL_PACK_BUFFER, 0 );
qDebug() << "Read Time " << readTime;
qDebug() << "Process Time " << processTime;

}

Results:

PBO ON:
CPU: 28-33%
Read Time 3.446 ms
Process Time 2.111 ms

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

Strange point is that when I call function glReadPixels as blocking it uses less cpu for RGB16 bit color.

For RGBA and RGB, both algorithms work fine, and Scenario-2's algorithm works with less CPU consumption. I need to get RGB 16-bit color pixels and want to make CPU usage as low as possible. Could you please check my algorithms and/or suggest a way to achieve low CPU consumption for getting 16-bit color pixels?

Regards

Hello,

Is there any feedback for the issue ?

Regards

Hello,

From the link :https://docs.nvidia.com/jetson/l4t/index.html#page/Tegra%20Linux%20Driver%20Package%20Development%20Guide/graphics_opengl.html#wwpID0E0FD0HA

According to Avoid reading back the framebuffer contents (M1) part:

I tried to add PACK_ROW_LENGTH support as the following on initialization functions :

nBytesPerLine = 1280; // width of qml
    int rowL;
    glGetIntegerv(GL_PACK_ROW_LENGTH, &rowL);
    qDebug() << "Rowl before" << rowL;

    glPixelStorei( GL_PACK_ALIGNMENT, 1 );
    glPixelStorei(GL_PACK_ROW_LENGTH,nBytesPerLine);
    qDebug() << "Pixel st" << glGetError();
    glGetIntegerv(GL_PACK_ROW_LENGTH, &rowL);
    qDebug() << "Rowl after" << rowL;

and It did not reduce cpu usage for both GL_RGBA and GL_RGB.

Any idea what I can do is well appreciated ?

Regards

Hi,
Please share your pipeline for reference. Seems like you copy data in CPU buffers to GPU buffers. It looks normal to take CPU usage if you copy data between CPU and GPU buffers. If your source is a v4l2 source, we have a sample to capture data into NvBuffer through v4l2:

/usr/src/jetson_multimedia_api/samples/12_camera_v4l2_cuda

But common format is YUV422. Not sure if it can be applied to your usecase.

Hello,

Could you please tell me what is pipeline ? Also My source is not v4l2 since it is not a camera image.I am trying to get 16 bit color pixels of a qt qml application by using type GL_UNSIGNED_SHORT_5_6_5 or if any applicable format exist.

Regards

Hi,
What I mean is block diagram. Since we support gstreamer and it is like to link pipeline. May not be proper in using the name in other usecases.

GL_UNSIGNED_SHORT_5_6_5 is not supported in NvBuffer so you may not be able to apply the usecase to jetson_multimedia_api. Could you check samples in

/usr/src/nvidia/graphics_demos/

These are samples for demonstrating GL/EGL usecases. May be helpful for your usecase.

Hello,

It seems to me that EGL_KHR_image_pixmap may be an option but It does not exist in nvidia xavier nx EGL_Extensions.The following EGL_extensions are present on my device:

EGL Version: “1.5”

EGL Vendor: “NVIDIA”

EGL Extensions: “EGL_ANDROID_native_fence_sync EGL_EXT_buffer_age EGL_EXT_client_sync EGL_EXT_create_context_robustness EGL_EXT_image_dma_buf_import EGL_EXT_image_dma_buf_import_modifiers EGL_EXT_output_base EGL_EXT_output_drm EGL_EXT_protected_content EGL_EXT_stream_consumer_egloutput EGL_EXT_stream_acquire_mode EGL_EXT_sync_reuse EGL_IMG_context_priority EGL_KHR_config_attribs EGL_KHR_create_context_no_error EGL_KHR_context_flush_control EGL_KHR_create_context EGL_KHR_display_reference EGL_KHR_fence_sync EGL_KHR_get_all_proc_addresses EGL_KHR_partial_update EGL_KHR_swap_buffers_with_damage EGL_KHR_no_config_context EGL_KHR_gl_colorspace EGL_KHR_gl_renderbuffer_image EGL_KHR_gl_texture_2D_image EGL_KHR_gl_texture_3D_image EGL_KHR_gl_texture_cubemap_image EGL_KHR_image EGL_KHR_image_base EGL_KHR_reusable_sync EGL_KHR_stream EGL_KHR_stream_attrib EGL_KHR_stream_consumer_gltexture EGL_KHR_stream_cross_process_fd EGL_KHR_stream_fifo EGL_KHR_stream_producer_eglsurface EGL_KHR_surfaceless_context EGL_KHR_wait_sync EGL_MESA_image_dma_buf_export EGL_NV_context_priority_realtime EGL_NV_cuda_event EGL_NV_nvrm_fence_sync EGL_NV_stream_cross_display EGL_NV_stream_cross_object EGL_NV_stream_cross_process EGL_NV_stream_flush EGL_NV_stream_metadata EGL_NV_stream_remote EGL_NV_stream_reset EGL_NV_stream_socket EGL_NV_stream_socket_unix EGL_NV_stream_sync EGL_NV_stream_fifo_next EGL_NV_stream_consumer_gltexture_yuv EGL_NV_stream_attrib EGL_NV_system_time EGL_NV_output_drm_flip_event EGL_WL_bind_wayland_display EGL_WL_wayland_eglstream”

By the way I modified the algorithms above to use a 16 bit color renderbuffer , read from that FBO and export __GL_YIELD=USLEEP which reduced the cpu usage but it seems to me if I can prevent some copy mechanism as in EGL_KHR_image_pixmap extension ,cpu load maybe reduced more.

One more thing that explains why I am trying to use glReadPixels is that I will transfer the color pixels from one device to another device.And It seems to me EGL_KHR_image_pixmap also may not be applicable since from that extension EGLImage will be created which can not be transferred to another device as far as I know.

For this reason Do you have any idea what might be the issue for the algorithm above or do you have more suggestions ?

Regards

Hi,
Do you use JP4.4 or JP4.4.1? We have fixed an issue in calling eglSwapBuffers() on JP4.5.

Here are relevant topics:
Multiple gstreamer pipelines slow down when in the same process
Multiple instances of NvEglRenderer - #12 by DaneLLL
My Jetson EGL code makes compiz use 95% of the CPU

Hello,

My Jetpack release is : 4.4.1 . Currently it is really hard to switch to JP4.5 since some my other setups based on 4.4.1. Would you mind sharing the patch so that I can apply and check if it is helping ?

Regards

Hello,

Any feedback for the request ?

Regards

Hi,
There is significant modification in the fix, so we would suggest upgrading to the newer version.