How to ensure GPU zero-copy buffer access with GStreamer appsink on Jetson (using NvBufSurface and CUDA)?

• Hardware Platform (Jetson / GPU): Jetson Xavier NX
• DeepStream Version: 6.3
• JetPack Version (valid for Jetson only): 5.3
• TensorRT Version: 8.5.2
• CUDA Version: 11.4
• My goal is to process frames entirely on the GPU (zero-copy) using CUDA.
• Pipeline (instantiated as sketched below): nvv4l2camerasrc device=/dev/video0 name=mysource ! video/x-raw(memory:NVMM), width=1920, height=1020, framerate=30/1 ! nvvidconv ! video/x-raw(memory:NVMM), format=RGBA ! tee name=t t. ! queue leaky=2 max-size-buffers=10 ! nvvidconv name=myconv ! video/x-raw(memory:NVMM),format=RGBA ! appsink name=sink emit-signals=true max-buffers=1 drop=true
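
For context, I create the pipeline from that string roughly like this (a simplified sketch; it also shows where the `error` variable referenced in the snippets below comes from):

#include <gst/gst.h>

GstElement *build_pipeline()
{
    GError *error = NULL;
    GstElement *pipeline = gst_parse_launch(
        "nvv4l2camerasrc device=/dev/video0 name=mysource ! "
        "video/x-raw(memory:NVMM), width=1920, height=1020, framerate=30/1 ! "
        "nvvidconv ! video/x-raw(memory:NVMM), format=RGBA ! tee name=t "
        "t. ! queue leaky=2 max-size-buffers=10 ! nvvidconv name=myconv ! "
        "video/x-raw(memory:NVMM),format=RGBA ! "
        "appsink name=sink emit-signals=true max-buffers=1 drop=true",
        &error);
    if (!pipeline)
    {
        g_printerr("Failed to parse pipeline: %s\n",
                   error ? error->message : "unknown");
        if (error) g_error_free(error);
        return NULL;
    }
    return pipeline; // `error` may still hold a non-fatal warning; freed later
}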

In my C++ code, I’m pulling the sample like this:

• Approach 1

    GstElement *sink = gst_bin_get_by_name(GST_BIN(gst_pipeline), "sink");

    tee_element = gst_bin_get_by_name(GST_BIN(gst_pipeline), "t"); // "t" is the tee name
    if (!tee_element)
    {
        std::cerr << "Tee element not found in pipeline!" << std::endl;
        return -1;
    }

    if (error) // error left over from gst_parse_launch()
        g_error_free(error);

    // Start pipeline
    gst_element_set_state((GstElement *)gst_pipeline, GST_STATE_PLAYING);

    // Wait for pipeline to reach PLAYING
    GstStateChangeReturn ret = gst_element_get_state(GST_ELEMENT(gst_pipeline),
                                                     NULL, NULL, GST_CLOCK_TIME_NONE);
    if (ret == GST_STATE_CHANGE_FAILURE) {
        std::cerr << "Failed to start pipeline" << std::endl;
        return -1;
    }

    while (true)
    {
        GstSample *sample = gst_app_sink_pull_sample(GST_APP_SINK(sink));
        if (!sample) break;

        GstBuffer *buffer = gst_sample_get_buffer(sample);
        GstMapInfo map = {0};
        if (!gst_buffer_map(buffer, &map, GST_MAP_READ))
        {
            gst_sample_unref(sample);
            continue;
        }

        // For memory:NVMM buffers this does not copy pixels: map.data points
        // at the NvBufSurface descriptor, not at frame data.
        NvBufSurface *surface = (NvBufSurface *)map.data;

        if (NvBufSurfaceMapEglImage(surface, 0) != 0)
        {
            std::cerr << "NvBufSurfaceMapEglImage failed. Skipping frame." << std::endl;
            gst_buffer_unmap(buffer, &map);
            gst_sample_unref(sample);
            continue;
        }

        CUresult status1;
        CUeglFrame eglFrame;
        CUgraphicsResource pResource = NULL;
        cudaFree(0); // no-op free that forces CUDA context creation on this thread

        if (!surface->surfaceList[0].mappedAddr.eglImage)
        {
            std::cerr << "EGL image is null. Skipping frame." << std::endl;
            NvBufSurfaceUnMapEglImage(surface, 0);
            gst_buffer_unmap(buffer, &map);
            gst_sample_unref(sample);
            continue;
        }

        status1 = cuGraphicsEGLRegisterImage(&pResource,
                                             surface->surfaceList[0].mappedAddr.eglImage,
                                             CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
        if (status1 != CUDA_SUCCESS)
        {
            printf("cuGraphicsEGLRegisterImage failed: %d\n", status1);
        }
        status1 = cuGraphicsResourceGetMappedEglFrame(&eglFrame, pResource, 0, 0);
        status1 = cuCtxSynchronize();

        if (undistort_frames)
        {
            inf.undistort(reinterpret_cast<uchar4 *>(eglFrame.frame.pPitch[0]), rectifiedimg, WIDTH, HEIGHT);
        }

       ...... SOME CUDA OPERATIONS

       ......

        inf.runInference(imgfinal, resize_mem, nchw, tmp, frame_number, mode, pitch, roll);

        status1 = cuCtxSynchronize();
        status1 = cuGraphicsUnregisterResource(pResource);

        // Release in reverse order of acquisition: the surface pointer is only
        // valid while the GstBuffer is still mapped and the sample referenced.
        NvBufSurfaceUnMapEglImage(surface, 0);
        gst_buffer_unmap(buffer, &map);
        gst_sample_unref(sample);
    }

• Approach 2

static GstPadProbeReturn
conv_src_pad_buffer_probe(GstPad *pad, GstPadProbeInfo *info,
                          gpointer u_data)
{
    // Early exit if stopping - return REMOVE to stop being called
    if (stopSignal.load()) {
        std::cout << "Probe: Stop signal received, removing probe\n";
        return GST_PAD_PROBE_REMOVE;
    }

    GstBuffer *buffer = GST_PAD_PROBE_INFO_BUFFER(info);
    GstMapInfo map = {0};
    gst_buffer_map(buffer, &map, GST_MAP_READ);

    // For memory:NVMM buffers, map.data points at the NvBufSurface descriptor.
    NvBufSurface *surface = (NvBufSurface *)map.data;

    // CUDA postprocess
    {
        AppContext *ctx = (AppContext *)u_data;

        if (NvBufSurfaceMapEglImage(surface, 0) != 0)
        {
            std::cerr << "NvBufSurfaceMapEglImage failed. Dropping frame." << std::endl;
            gst_buffer_unmap(buffer, &map);
            return GST_PAD_PROBE_DROP;
        }

        CUresult status;
        CUeglFrame eglFrame;
        CUgraphicsResource pResource = NULL;
        cudaFree(0); // no-op free that forces CUDA context creation on this thread

        if (!surface->surfaceList[0].mappedAddr.eglImage)
        {
            std::cerr << "EGL image is null. Dropping frame." << std::endl;
            NvBufSurfaceUnMapEglImage(surface, 0);
            gst_buffer_unmap(buffer, &map);
            return GST_PAD_PROBE_DROP;
        }

        status = cuGraphicsEGLRegisterImage(&pResource,
                                            surface->surfaceList[0].mappedAddr.eglImage,
                                            CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
        if (status != CUDA_SUCCESS)
        {
            printf("cuGraphicsEGLRegisterImage failed: %d\n", status);
        }
        status = cuGraphicsResourceGetMappedEglFrame(&eglFrame, pResource, 0, 0);
        status = cuCtxSynchronize();

        if (ctx->undistort_frames)
        {
            ctx->inference->undistort(reinterpret_cast<uchar4 *>(eglFrame.frame.pPitch[0]), ctx->rectifiedimg, ctx->width, ctx->height);
        }
       ...... SOME CUDA OPERATIONS

       ......

        ctx->inference->runInference(ctx->imgfinal, ctx->resize_mem, ctx->nchw, ctx->tmp, ctx->frame_number, ctx->mode, ctx->pitch, ctx->roll);

    
        status = cuCtxSynchronize();
        status = cuGraphicsUnregisterResource(pResource);

        NvBufSurfaceUnMapEglImage(surface, 0);
    }
    gst_buffer_unmap(buffer, &map);

    return GST_PAD_PROBE_OK;
}

int main() 
{
 ...
    tee_element = gst_bin_get_by_name(GST_BIN(gst_pipeline), "t"); // "t" is the tee name
    if (!tee_element)
    {
        std::cerr << "Tee element not found in pipeline!" << std::endl;
        return -1;
    }

    if (error) // error left over from gst_parse_launch()
        g_error_free(error);
    
    GstElement *conv = gst_bin_get_by_name(GST_BIN(gst_pipeline), "myconv");
    GstPad *src_pad = gst_element_get_static_pad(conv, "src");
    pad_probe_id = gst_pad_add_probe(src_pad, GST_PAD_PROBE_TYPE_BUFFER, conv_src_pad_buffer_probe, ctx, NULL);
    gst_object_unref(src_pad); 

    // Setup main loop and bus watch
    main_loop = g_main_loop_new(NULL, FALSE);
    ctx->main_loop2 = main_loop;

    // Start pipeline
    gst_element_set_state((GstElement *)gst_pipeline, GST_STATE_PLAYING);
    
    // Wait for pipeline to start
    GstStateChangeReturn ret = gst_element_get_state(GST_ELEMENT(gst_pipeline),
                                                    NULL, NULL, GST_CLOCK_TIME_NONE);
    if (ret == GST_STATE_CHANGE_FAILURE) {
        std::cerr << "Failed to start pipeline" << std::endl;
        return -1;
    }

    g_main_loop_run(main_loop);
     
     // Shutdown
    std::cout << "Graceful shutdown started..." << std::endl;

My questions:

  1. Am I correctly achieving zero-copy GPU access using this method? (I think I am not.)
  2. Is there a better or more reliable way to ensure that the buffer never touches CPU memory?
  3. I need to fetch the pointer as shown below, but via GStreamer directly. (I am trying to add pipelines dynamically, which jetson-utils doesn't support, so I started with DeepStream components.)

jetson-utils snippet: I want to do this, but directly via the GStreamer / DeepStream SDK.

#include <jetson-utils/videoSource.h>
#include <jetson-utils/cudaMappedMemory.h>
#include <jetson-utils/cudaNormalize.h>
#include <jetson-utils/cudaUtility.h>
#include <jetson-utils/cudaWarp.h>
#include <jetson-utils/cudaColorspace.h>
#include <jetson-utils/videoOptions.h>

int main(int argc, char **argv)
{
    cudaSetDevice(0);

    videoOptions opts;
    std::string video_name(argv[1]);
    if (video_name == "rear_cam")
    {
        char *fullpath = realpath("/dev/rear_cam", NULL);
        if (!fullpath)
        {
            printf("Failed to resolve /dev/rear_cam\n");
            return -1;
        }
        std::string resolvedStr(fullpath);
        free(fullpath);
        opts.resource = "v4l2://" + resolvedStr;
    }

    opts.width = 1920;
    opts.height = 1024;

    // Set framerate
    opts.frameRate = 30;

    // Set number of ring buffers
    opts.numBuffers = 8;

    // Enable zero-copy (recommended for Jetson)
    opts.zeroCopy = true;
    videoSource *video = videoSource::Create(opts);
    if (!video)
    {
        printf("Failed to create video source.\n");
        return -1;
    }

    uchar3 *img = NULL; // Capture() fills this with a CUDA-accessible pointer

    while (true)
    {
        if (!video->Capture(&img, 1000)) // 1000 = timeout in milliseconds
        {
            printf("No frame received, exiting...\n");
            break;
        }

        .... SEND TO INFERENCE >>>

        .....
    }
}

I am a newbie and not that great with this unified memory stuff :(

Thanks for your time if you have read this far,

Best,
Mohit

There is a lot of customized code here and we don’t know what happens inside it, so we can’t say whether your code is “zero-copy GPU access”. But the pipeline nvv4l2camerasrc device=/dev/video0 name=mysource ! video/x-raw(memory:NVMM), width=1920, height=1020, framerate=30/1 ! nvvidconv ! video/x-raw(memory:NVMM), format=RGBA ! tee name=t t. ! queue leaky=2 max-size-buffers=10 ! nvvidconv name=myconv ! video/x-raw(memory:NVMM),format=RGBA ! appsink name=sink emit-signals=true max-buffers=1 drop=true itself is “zero-copy GPU access”.

From your code, there is no CPU access up to this point.

You’ve already got the CUDA array in “eglFrame” (CUDA Driver API :: CUDA Toolkit Documentation). All your subsequent operations should be CUDA operations on the “pArray” in “eglFrame”; then you can guarantee that only the GPU is involved.
We don’t know what happens inside your “ctx->inference->undistort”, “ctx->inference->runInference”, … You need to check that yourself.
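
To illustrate the point, a minimal sketch of branching on the mapped frame type before running any CUDA operations (`eglFrame` is assumed to come from cuGraphicsResourceGetMappedEglFrame(); the kernel call is hypothetical):

#include <cuda.h>
#include <cudaEGL.h>
#include <cuda_runtime.h>

static void process_egl_frame(const CUeglFrame &eglFrame)
{
    if (eglFrame.frameType == CU_EGL_FRAME_TYPE_PITCH)
    {
        // Pitch-linear: frame.pPitch[0] is a device pointer a kernel can read directly.
        uchar4 *devPtr = static_cast<uchar4 *>(eglFrame.frame.pPitch[0]);
        // myRgbaKernel<<<grid, block>>>(devPtr, eglFrame.width, eglFrame.height,
        //                               eglFrame.pitch); // hypothetical kernel
        (void)devPtr;
    }
    else if (eglFrame.frameType == CU_EGL_FRAME_TYPE_ARRAY)
    {
        // Block-linear: frame.pArray[0] is a CUarray. Kernels cannot dereference it;
        // bind it to a texture/surface object or copy it out with cuMemcpy2D().
        CUarray arr = eglFrame.frame.pArray[0];
        (void)arr;
    }
}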

If your code “ctx->inference->runInference” is for inferencing, we suggest you deploy your inferencing model directly with the DeepStream APIs.

Please make sure your model is an ONNX model and that you know your model’s inputs and outputs well. Then you can follow the instructions in DeepStream SDK FAQ - Intelligent Video Analytics / DeepStream SDK - NVIDIA Developer Forums to prepare the nvinfer parameters. There are lots of ONNX model deployment samples in the DeepStream SDK; please refer to C/C++ Sample Apps Source Details — DeepStream documentation and NVIDIA-AI-IOT/deepstream_reference_apps: Samples for TensorRT/Deepstream for Tesla & Jetson.
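
To make that concrete, a minimal nvinfer configuration for an ONNX detector might look like the sketch below (file names, class count, and precision are placeholders, not values from this thread):

[property]
gpu-id=0
# placeholder paths; the engine file is generated on first run
onnx-file=model.onnx
model-engine-file=model.onnx_b1_gpu0_fp16.engine
labelfile-path=labels.txt
batch-size=1
# network-mode: 0=FP32, 1=INT8, 2=FP16
network-mode=2
num-detected-classes=4
gie-unique-id=1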

Hi, before handing it to the surface, do these two operations use CPU-side memory?

“If the buffer resides in GPU memory (e.g., memory:NVMM), mapping it for GST_MAP_READ often forces a copy to host-accessible memory (CPU RAM), breaking zero-copy.” ~ LLMs

Also, if not, which of these two approaches above is the optimal way to fetch: via appsink or via a callback probe?

GstBuffer is a GObject (GObject – 2.0), so that operation certainly runs on the CPU, but it has nothing to do with the buffer’s memory contents. The “map” here is not a memory map of the pixels. Please ignore that claim.
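
As an aside, a quick sanity check one could drop in right after gst_buffer_map() in either approach above (the fields used are from nvbufsurface.h):

// Sanity-check sketch: for memory:NVMM caps, map.data is the NvBufSurface
// descriptor, so map.size is the size of a small struct, nowhere near a
// full 1920x1020 RGBA frame (~7.8 MB).
NvBufSurface *surface = (NvBufSurface *)map.data;
g_print("map.size=%zu memType=%d width=%u pitch=%u\n",
        map.size,
        (int)surface->memType,          // NVBUF_MEM_SURFACE_ARRAY on Jetson
        surface->surfaceList[0].width,
        surface->surfaceList[0].pitch);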

What do you mean by “optimal”? These two ways are the same from the buffer-access point of view. Which one is better depends on your actual implementation.

So I cannot further optimise the way I am reading the pointer, right?

This uses NVINFER only; some resizing and pre/post-processing are on the GPU/CPU, but the main inference uses NVINFER with a TensorRT engine.

Thanks for your suggestion, I am already looking into this. I wanted to ask: how do I get output from the pipeline? For example, if after inference I want to hand the objects/detections to my tracker, how can I extract them from the pipeline? Can you link some resources?

Got that.
Last question: is there some GPU-accelerated element like appsink?

I am fairly new to GStreamer and the DeepStream SDK, and I might be asking really trivial questions. Thanks for your time!

Best,
Mohit

DeepStream already provides the “nvtracker” plugin to do object tracking inside the DeepStream pipeline. If you are not satisfied with the tracking algorithms (already hardware-accelerated) provided with it (Gst-nvtracker — DeepStream documentation), you can also customize your own nvtracker low-level tracking library (Gst-nvtracker — DeepStream documentation).
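
For orientation, a sketch of where nvtracker typically sits in such a pipeline (the nvinfer config path, mux dimensions, and display sink are placeholders, not values from this thread):

#include <gst/gst.h>

// Camera branch feeds nvstreammux, then nvinfer (detection) and nvtracker.
GstElement *build_tracking_pipeline()
{
    GError *error = NULL;
    GstElement *pipeline = gst_parse_launch(
        "nvv4l2camerasrc device=/dev/video0 ! video/x-raw(memory:NVMM) ! "
        "nvvidconv ! mux.sink_0 "
        "nvstreammux name=mux batch-size=1 width=1920 height=1020 ! "
        "nvinfer config-file-path=pgie_config.txt ! "
        "nvtracker ll-lib-file=/opt/nvidia/deepstream/deepstream/lib/"
        "libnvds_nvmultiobjecttracker.so ! "
        "nvvideoconvert ! nvdsosd ! nv3dsink",
        &error);
    if (error) g_error_free(error);
    return pipeline;
}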

All the inferencing results are stored in NvDsMetadata (MetaData in the DeepStream SDK — DeepStream documentation).
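
For illustration, a sketch (not from this thread; probe placement and the tracker hand-off are assumptions) of reading detections from the batch metadata in a pad probe downstream of nvinfer/nvtracker:

#include <gst/gst.h>
#include "gstnvdsmeta.h"

static GstPadProbeReturn
meta_probe(GstPad *pad, GstPadProbeInfo *info, gpointer u_data)
{
    GstBuffer *buf = GST_PAD_PROBE_INFO_BUFFER(info);
    NvDsBatchMeta *batch_meta = gst_buffer_get_nvds_batch_meta(buf);
    if (!batch_meta)
        return GST_PAD_PROBE_OK;

    // Walk frames in the batch, then objects in each frame.
    for (NvDsMetaList *lf = batch_meta->frame_meta_list; lf; lf = lf->next) {
        NvDsFrameMeta *frame_meta = (NvDsFrameMeta *)lf->data;
        for (NvDsMetaList *lo = frame_meta->obj_meta_list; lo; lo = lo->next) {
            NvDsObjectMeta *obj = (NvDsObjectMeta *)lo->data;
            g_print("frame %d: class %d conf %.2f bbox (%.0f, %.0f, %.0f, %.0f)\n",
                    frame_meta->frame_num, obj->class_id, obj->confidence,
                    obj->rect_params.left, obj->rect_params.top,
                    obj->rect_params.width, obj->rect_params.height);
            // ... hand obj->rect_params / obj->object_id to your tracker here
        }
    }
    return GST_PAD_PROBE_OK;
}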

Please read the DeepStream SDK user manual to get familiar with the DeepStream functions and APIs. It is not convenient to introduce them one by one in a forum post; please start with the documentation and samples.

There is no update from you for a period, so we assume this is no longer an issue and are closing this topic. If you need further support, please open a new one. Thanks.
