OpenCV CUDA processing from gstreamer pipeline [JP4, JP5]

I'm sharing here some findings from my experiments, along with example code showing how to use OpenCV CUDA processing in a GStreamer pipeline while leveraging the Jetson HW and SW.

  1. This is far from clean code… I’m just a stupid dog, be careful ;-) No warranty at all and no expected support !

  2. This example builds a gstreamer app that will launch the pipeline string passed as argument. In that pipeline, the app will try to find an element with name=conv. Then it will add a src pad probe to it that will be able to post process its output buffers from OpenCV CUDA.

  3. Try this and look at the code for your needs.

// Uncomment the following line if using JetPack4 (R32). Leave it commented out if using JP5 (R35)
//#define JP4

#include <gst/gst.h>
#include <nvbuf_utils.h>
#ifndef JP4
#include <nvbufsurface.h>
#endif /* not defined JP4 */
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudaEGL.h>
#include <opencv2/core.hpp>
#include <opencv2/cudaarithm.hpp>


#ifdef JP4    
static EGLDisplay egl_display;

// Rounds `width` up to the nearest multiple of `stride` and returns it.
// Both inputs are logged first. `stride` must be non-zero.
static inline unsigned int GetPitch(unsigned int width, const unsigned int stride) {
    g_print("width=%d, stride=%d\n", width, stride);
    // Count the whole strides contained in width, then one more if a partial stride is left over.
    const unsigned int wholeStrides = width / stride;
    const unsigned int leftover = width % stride;
    const unsigned int strideCount = (leftover == 0) ? wholeStrides : wholeStrides + 1;
    return strideCount * stride;
}
#endif /* JP4 */


// Demo regions of interest used by the CvCudaProcess* functions in this file.
// roi is a 200x200 rectangle whose top-left corner is at (100,100); it is filled green in the RGBA case.
// Its far corner is at (300,300), so the frame must be at least 300 pixels wide and high for roi to fit
// (the original note recommended at least 500 for height and width).
static cv::Rect roi(100,100,200,200); 
// roi2 is roi with every coordinate halved.
static cv::Rect roi2(50,50,100,100); // for half size planes such as U&V in NV12 format


static void CvCudaProcessRGBA(const unsigned int height, const unsigned int pitch, uchar * bufPtr) {
    g_print("RGBA processing using pitch %d\n", pitch);
    cv::cuda::GpuMat d_mat(height, pitch, CV_8UC4, bufPtr);

    // Process
    d_mat(roi).setTo(cv::Scalar(0,255,0,255));  // R,G,B,A

    // Check
    if (d_mat.data != bufPtr)
        g_printerr("Error RGBA buffer reallocated\n");
}


static void CvCudaProcessI420(const unsigned int height, const unsigned int pitch, uchar * YbufPtr, uchar * UbufPtr, uchar * VbufPtr) {
    g_print("I420 processing using pitch %d\n", pitch);
    // Y is full size
    cv::cuda::GpuMat d_Mat_Y(height, pitch, CV_8UC1, YbufPtr);
    // U and V are half size
    cv::cuda::GpuMat d_Mat_U(height/2, pitch/2, CV_8UC1, UbufPtr);
    cv::cuda::GpuMat d_Mat_V(height/2, pitch/2, CV_8UC1, VbufPtr);

    // Process
    d_Mat_Y(roi).setTo(255);
    d_Mat_U(roi2).setTo(0);
    d_Mat_V(roi2).setTo(0);
    
    // Check
    if (d_Mat_Y.data != YbufPtr)
        g_printerr("Error Y buffer reallocated\n");
    if (d_Mat_U.data != UbufPtr)
        g_printerr("Error U buffer reallocated\n"); 
    if (d_Mat_V.data != VbufPtr)
        g_printerr("Error V buffer reallocated\n"); 
}


static void CvCudaProcessNV12(const unsigned int height, const unsigned int pitch, uchar * YbufPtr, uchar * UVbufPtr) {
    static std::vector<cv::cuda::GpuMat> uv(2);
    g_print("NV12 using pitch %d\n", pitch);
    // Y is full size
    cv::cuda::GpuMat d_Mat_Y(height, pitch, CV_8UC1, YbufPtr);
    // U and V are half size and interleaved 
    cv::cuda::GpuMat d_Mat_UV(height/2, pitch/2, CV_8UC2, UVbufPtr);
    cv::cuda::split(d_Mat_UV, uv);
    cv::cuda::GpuMat& d_Mat_U = uv[0];
    cv::cuda::GpuMat& d_Mat_V = uv[1];

    // Process
    d_Mat_Y(roi).setTo(255);
    d_Mat_U(roi2).setTo(0);
    d_Mat_V(roi2).setTo(0);

    // reinterleave U&V
    cv::cuda::merge(uv, d_Mat_UV);
    // Check
    if (d_Mat_Y.data != YbufPtr)
        g_printerr("Error Y buffer reallocated\n");
    if (d_Mat_UV.data != UVbufPtr)
        g_printerr("Error UV buffer reallocated\n");
}


/*************************************************************************/
/* Maps an EGL frame and calls CUDA processing depending on color format */
/*                                                                       */
/* egl_image : EGLImage to register with CUDA; nothing is done if NULL.  */
/* pitch     : row pitch in bytes of the first plane as known by the     */
/*             caller, or 0 if unknown. With JP4 defined it is           */
/*             recomputed from the frame width for the YUV formats.      */
/*                                                                       */
/* The image is registered as a CUDA graphics resource, its mapped frame */
/* is fetched, pitch-linear frames are dispatched on their color format  */
/* and the resource is unregistered again before returning. Any failure  */
/* is reported with g_printerr and ends processing of this frame.        */
/*************************************************************************/
static void CvCudaProcessEGLImage(EGLImageKHR& egl_image, unsigned int pitch)
{
    //printf("EglImage at %p\n", egl_image);
    if (egl_image == NULL) {
        g_printerr("CvCudaProcessEGLImage: no EGLImage to process\n");
        return;
    }

    // No-op runtime call issued before any driver API call is made.
    // NOTE(review): presumably here so that the CUDA runtime sets up a current context
    // for the cu* calls below - confirm.
    if (cudaFree(0) != cudaSuccess) {
        g_printerr("cudaFree(0) failed, CUDA is not usable\n");
        return;
    }

    CUgraphicsResource pResource = NULL;
    CUresult status = cuGraphicsEGLRegisterImage(&pResource,
                                                 egl_image,
                                                 CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
    if (status != CUDA_SUCCESS) {
        // Without a registered resource there is nothing to map or to unregister.
        g_printerr("cuGraphicsEGLRegisterImage failed\n");
        return;
    }

    CUeglFrame eglFrame;
    status = cuGraphicsResourceGetMappedEglFrame(&eglFrame, pResource, 0, 0);
    if (status != CUDA_SUCCESS) {
        // eglFrame was not filled in, so it must not be read; just release the resource.
        g_printerr("cuGraphicsResourceGetMappedEglFrame failed\n");
        if (cuGraphicsUnregisterResource(pResource) != CUDA_SUCCESS)
            g_printerr("cuGraphicsUnregisterResource failed\n");
        return;
    }

    status = cuCtxSynchronize();
    if (status != CUDA_SUCCESS)
        g_printerr("cuCtxSynchronize failed\n");

    //printf("Mapped frame with width=%d and height=%d\n", eglFrame.width, eglFrame.height);
    if (eglFrame.frameType == CU_EGL_FRAME_TYPE_PITCH) {
        /********************/
        /* EGL Pitch frame  */
        /********************/
        g_print("EGL frame type pitch: ");
        // An OpenCV failure is caught here so that the resource is still unregistered
        // and no C++ exception propagates to the caller.
        try {
            switch(eglFrame.eglColorFormat) {
                case CU_EGL_COLOR_FORMAT_ABGR: {
                    // CvCudaProcessRGBA expects the row length in pixels. When the caller
                    // supplied a byte pitch, use it (4 bytes per pixel); otherwise fall
                    // back to the frame width as before.
                    const unsigned int rgbaCols = (pitch != 0) ? pitch / 4 : eglFrame.width;
                    CvCudaProcessRGBA(eglFrame.height, rgbaCols, (uchar*) eglFrame.frame.pPitch[0]);
                    break;
                }

                case CU_EGL_COLOR_FORMAT_YUV420_PLANAR:
#ifdef JP4
                    pitch = GetPitch(eglFrame.width, 256);
#endif /* JP4 */
                    if (pitch == 0)
                        g_printerr("Unknown pitch for YUV420 planar frame\n");
                    else
                        CvCudaProcessI420(eglFrame.height, pitch, (uchar*)eglFrame.frame.pPitch[0], (uchar*)eglFrame.frame.pPitch[1], (uchar*)eglFrame.frame.pPitch[2]);
                    break;

                case CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR:
#ifdef JP4
                    pitch = GetPitch(eglFrame.width, 256);
#endif /* JP4 */
                    if (pitch == 0)
                        g_printerr("Unknown pitch for YUV420 semi-planar frame\n");
                    else
                        CvCudaProcessNV12(eglFrame.height, pitch, (uchar*)eglFrame.frame.pPitch[0], (uchar*)eglFrame.frame.pPitch[1]);
                    break;

                default:
                    g_printerr ("Unsupported eglcolorformat %d for CU_EGL_FRAME_TYPE_PITCH\n", eglFrame.eglColorFormat);
                    break;
            }
        }
        catch (const cv::Exception& e) {
            g_printerr("OpenCV error while processing EGL frame: %s\n", e.what());
        }
    }
    else {
        /***********************/
        /* EGL CU Array frame  */
        /***********************/
        g_print("EGL frame type array. Block linear format is not public AFAIK. Currently unsupported\n");
    }

    // Wait for the processing to finish before releasing the resource
    status = cuCtxSynchronize();
    if (status != CUDA_SUCCESS)
        g_printerr("cuCtxSynchronize failed\n");

    status = cuGraphicsUnregisterResource(pResource);
    if (status != CUDA_SUCCESS)
        g_printerr("cuGraphicsUnregisterResource failed\n");

}


/********************************************************************/
/* This src (output) probe callback function would be called each   */
/* time that element with name "conv" has prepared an output buffer */
/* that we can process here before it goes further in pipeline      */
/********************************************************************/
static GstPadProbeReturn
conv_src_pad_buffer_probe (GstPad * pad, GstPadProbeInfo * info, gpointer u_data)
{
    GstBuffer *buffer = (GstBuffer *) info->data;
    GstMapInfo map    = {0};
    gst_buffer_map (buffer, &map, GST_MAP_WRITE);
    
    EGLImageKHR egl_image;
    
#ifdef JP4
    int dmabuf_fd = 0;
    if (-1 == ExtractFdFromNvBuffer((void *)map.data, &dmabuf_fd))
    {
        g_printerr("ExtractFdFromNvBuffer failed\n");
    }
    egl_image = NvEGLImageFromFd(egl_display, dmabuf_fd);
    CvCudaProcessEGLImage(egl_image, 0); // pitch will be later computed from eglFrame
    NvDestroyEGLImage(egl_display, egl_image);
    
#else /* JP5 */

    NvBufSurface* surf = (NvBufSurface*)map.data;
    g_print("Got NvBufSurface with %d surface%s  mem type %d\n", surf->numFilled, surf->numFilled > 1 ? "s":"", surf->memType);
    NvBufSurfaceParams params = surf->surfaceList[0];
    g_print("Surface has width=%u, height=%u, pitch=%u, memsize=%u at %p bufferDesc=%zu\n", 
            params.width, params.height, params.pitch, params.dataSize, params.dataPtr, params.bufferDesc);
            
    if (surf->memType == NVBUF_MEM_SURFACE_ARRAY) {
   	    /***************************************************/    
        /* NVBUF_MEM_SURFACE_ARRAY is accessed through EGL */
    	/***************************************************/            
        g_print("Using EGL CUDA mapping: ");
        //int dmabuf_fd = surf->surfaceList[0].bufferDesc;

        if(0 != NvBufSurfaceMapEglImage(surf, -1))
            g_printerr("Failed to map surface to EGLImage\n");
        egl_image = surf->surfaceList[0].mappedAddr.eglImage; 
        if (egl_image == NULL)
            g_printerr("No EGLImage mapped from surface\n");

        CvCudaProcessEGLImage(egl_image, params.pitch);
        NvBufSurfaceUnMapEglImage (surf, 0);
    }
    else {  
        g_print("Not using EGL: ");
        switch(params.colorFormat) {
            case NVBUF_COLOR_FORMAT_RGBA:
                CvCudaProcessRGBA(params.height, params.pitch/4, (uchar *)surf->surfaceList[0].dataPtr);
                break;

            case NVBUF_COLOR_FORMAT_YUV420:
                CvCudaProcessI420(params.height, params.pitch, (uchar*)surf->surfaceList[0].dataPtr, (uchar *)surf->surfaceList[0].dataPtr + params.height*params.pitch, (uchar *)surf->surfaceList[0].dataPtr + params.height*params.pitch + params.height*params.pitch/4);
                break;

            case NVBUF_COLOR_FORMAT_NV12:
                CvCudaProcessNV12(params.height, params.pitch, (uchar*)surf->surfaceList[0].dataPtr, (uchar *)surf->surfaceList[0].dataPtr + params.height*params.pitch);
                break;

            default:
                g_printerr("Unsupported color format\n");
        }
    }
     
#endif /* JP5 */
    
    gst_buffer_unmap(buffer, &map);

    return GST_PAD_PROBE_OK;
}


/* Writes the one-line command-line synopsis of this program to stderr. */
void print_usage(void) {
    static const char usageText[] =
        "Usage: test_surface \"your pipeline string with your processing element having name=conv\"\n";
    fputs(usageText, stderr);
}


/************************************************************************************************/
/* This example builds a gstreamer app that will launch the pipeline string passed as argument. */
/* In that pipeline, the app will try to find an element with name=conv. Then it will add a src */
/* pad probe to it that will be able to post process its output buffers from OpenCV CUDA        */
/************************************************************************************************/
gint
main (gint   argc,
      gchar *argv[])
{
    if (argc != 2) {
        print_usage();
        exit(-2);
    }

    GMainLoop *loop;
    GstElement *pipeline;

    /* init GStreamer */
    gst_init (&argc, &argv);
    loop = g_main_loop_new (NULL, FALSE);

    /* Try to create pipeline */
    pipeline = gst_parse_launch(argv[1], NULL);
    if (pipeline == NULL)
        g_error ("Failed to launch pipeline");

    /* Try to find element with name=conv */
    GstElement *conv = gst_bin_get_by_name(GST_BIN(pipeline), "conv");
    if (conv == NULL)
        g_error ("Failed to find conv in pipeline");
 
    /* Get its src pad and add the probe to it */
    GstPad *pad = gst_element_get_static_pad (conv, "src");
    /* gulong probe_id = */ gst_pad_add_probe (pad, GST_PAD_PROBE_TYPE_BUFFER,(GstPadProbeCallback) conv_src_pad_buffer_probe, NULL, NULL);
    gst_object_unref (pad);
    gst_object_unref (conv);

    /* Try running the pipeline and wait until it's up and running or failed */
    gst_element_set_state (pipeline, GST_STATE_PLAYING);
    if (gst_element_get_state (pipeline, NULL, NULL, -1) == GST_STATE_CHANGE_FAILURE) {
       g_error ("Failed to go into PLAYING state");
    }
    g_print ("Running ...\n");
    g_main_loop_run (loop);

    /* exit */
    gst_element_set_state (pipeline, GST_STATE_NULL);
    gst_object_unref (pipeline);

    return 0;
}


/* Built with:
gcc -Wall -O3 -o test_surface -I/usr/include/gstreamer-1.0 -I/usr/include/glib-2.0  -I/usr/lib/aarch64-linux-gnu/glib-2.0/include -I/usr/local/cuda/include -I/usr/local/opencv-4.6.0/include/opencv4 -I/usr/src/jetson_multimedia_api/include/ test_surface.cpp -lstdc++ -lgstreamer-1.0 -lgobject-2.0 -lglib-2.0 -L/usr/local/opencv-4.6.0/lib -lopencv_core -lopencv_cudaarithm -lEGL -lGLESv2 -L/usr/lib/aarch64-linux-gnu/tegra/ -lcuda -lnvbuf_utils -lnvbufsurface -L/usr/local/cuda/lib64/ -lcudart

export LD_LIBRARY_PATH=/usr/local/opencv-4.6.0/lib/:$LD_LIBRARY_PATH

# RGBA works 
./test_surface "videotestsrc ! video/x-raw,width=640,height=480,framerate=30/1 ! nvvidconv name=conv ! video/x-raw(memory:NVMM),width=1920,height=1080,format=RGBA ! nv3dsink"

# I420 works
./test_surface "videotestsrc ! video/x-raw,width=640,height=480,framerate=30/1 ! nvvidconv name=conv ! video/x-raw(memory:NVMM),width=1920,height=1080,format=I420 ! nv3dsink"

# NV12 with pitch linear works
./test_surface "videotestsrc ! video/x-raw,width=640,height=480,framerate=30/1 ! nvvidconv name=conv bl-output=0 ! video/x-raw(memory:NVMM),width=1920,height=1080,format=NV12 ! nv3dsink"

# NV12 with block linear is not supported:
./test_surface "videotestsrc ! video/x-raw,width=640,height=480,framerate=30/1 ! nvvidconv name=conv bl-output=1 ! video/x-raw(memory:NVMM),width=1920,height=1080,format=NV12 ! nv3dsink"

# If you have a decoder or another element outputting BL NV12, note that there is a bug in nvvidconv when converting into NV12 pitch linear. DS nvvideoconvert works for the following case:
./test_surface "videotestsrc ! video/x-raw,width=640,height=480,framerate=30/1 ! nvvideoconvert  bl-output=1 ! video/x-raw(memory:NVMM),width=1920,height=1080,format=NV12 ! nvvideoconvert name=conv bl-output=0 ! video/x-raw(memory:NVMM),width=1920,height=1080,format=NV12 ! nv3dsink"

# ...but it doesn't work with nvv4l2decoder (at least with the following case), although bl-output=0 is set, BL format is reported on src pad probe. 
# So a workaround would be converting to I420:
./test_surface "filesrc location=/home/nvidia/Videos/bbb_sunflower_1080p_60fps_normal.mp4 ! qtdemux ! h264parse ! nvv4l2decoder ! queue ! nvvideoconvert ! video/x-raw(memory:NVMM),format=I420 ! nvvideoconvert bl-output=0 name=conv ! video/x-raw(memory:NVMM),format=NV12 ! nv3dsink"

# RGBA works from other memory types such as:
./test_surface "videotestsrc ! video/x-raw,width=640,height=480,framerate=30/1 ! nvvidconv name=conv compute-hw=GPU nvbuf-memory-type=1 ! video/x-raw(memory:NVMM),width=1920,height=1080,format=RGBA ! nv3dsink"

# I420 may work from other memory types such as:
./test_surface "videotestsrc ! video/x-raw,width=640,height=480,framerate=30/1 ! nvvidconv name=conv compute-hw=GPU nvbuf-memory-type=1 ! video/x-raw(memory:NVMM),width=1920,height=1080,format=I420 ! nv3dsink"

# NV12 may work from other memory types such as:
./test_surface "videotestsrc ! video/x-raw,width=640,height=480,framerate=30/1 ! nvvidconv name=conv compute-hw=GPU nvbuf-memory-type=1 bl-output=0 ! video/x-raw(memory:NVMM),width=1920,height=1080,format=NV12 ! nv3dsink"

*/
  1. Build and try… look at the final comments in the code.
2 Likes

Hi Honey Patouceul, many thanks for the sharing.

Hi,

For the issue of NV12 + block linear not working, please apply the following patch to gstnvvconv.c and try again. The source code is available in Linux_for_Tegra/source/public/gst-nvvidconv_src.tbz2.

diff --git a/gst-nvvidconv-1.0/gstnvvconv.c b/gst-nvvidconv-1.0/gstnvvconv.c
index 03b211c..e6da8ab 100644
--- a/gst-nvvidconv-1.0/gstnvvconv.c
+++ b/gst-nvvidconv-1.0/gstnvvconv.c
@@ -3220,7 +3220,8 @@ gst_nvvconv_transform (GstBaseTransform * btrans, GstBuffer * inbuf,
       } else if (space->inbuf_memtype == BUF_MEM_HW && space->outbuf_memtype == BUF_MEM_HW) {
         NvBufSurface *surf = ((NvBufSurface *) inmap.data);
         /* TODO : Check for PayloadInfo.TimeStamp = gst_util_uint64_scale (GST_BUFFER_PTS (inbuf), GST_MSECOND * 10, GST_SECOND); */
-        if (space->need_intersurf || space->do_scaling || space->flip_method) {
+        if (space->need_intersurf || space->do_scaling || space->flip_method || 
+            surf->surfaceList->layout != omem->buf->surface->surfaceList->layout) {
           retn = NvBufSurfTransform (surf, omem->buf->surface, &space->transform_params);
           if (retn != 0) {
             g_print ("%s: NvBufSurfTransform Failed \n", __func__);

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.