How to ensure GPU zero-copy buffer access with GStreamer appsink on Jetson (using NvBufSurface and CUDA)?

• Hardware Platform (Jetson / GPU): Jetson Xavier NX
• DeepStream Version: 6.3
• JetPack Version (valid for Jetson only): 5.3
• TensorRT Version: 8.5.2
• CUDA Version: 11.4
• My goal is to process frames entirely on the GPU (zero-copy) using CUDA.
• Pipeline (instantiated as sketched below): nvv4l2camerasrc device=/dev/video0 name=mysource ! video/x-raw(memory:NVMM), width=1920, height=1020, framerate=30/1 ! nvvidconv ! video/x-raw(memory:NVMM), format=RGBA ! tee name=t t. ! queue leaky=2 max-size-buffers=10 ! nvvidconv name=myconv ! video/x-raw(memory:NVMM),format=RGBA ! appsink name=sink emit-signals=true max-buffers=1 drop=true
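
For context, I create the pipeline from that string roughly like this (a simplified sketch; it also shows where the `error` variable referenced in the snippets below comes from):

#include <gst/gst.h>

GstElement *build_pipeline()
{
    GError *error = NULL;
    GstElement *pipeline = gst_parse_launch(
        "nvv4l2camerasrc device=/dev/video0 name=mysource ! "
        "video/x-raw(memory:NVMM), width=1920, height=1020, framerate=30/1 ! "
        "nvvidconv ! video/x-raw(memory:NVMM), format=RGBA ! tee name=t "
        "t. ! queue leaky=2 max-size-buffers=10 ! nvvidconv name=myconv ! "
        "video/x-raw(memory:NVMM),format=RGBA ! "
        "appsink name=sink emit-signals=true max-buffers=1 drop=true",
        &error);
    if (!pipeline)
    {
        g_printerr("Failed to parse pipeline: %s\n",
                   error ? error->message : "unknown");
        if (error) g_error_free(error);
        return NULL;
    }
    return pipeline; // `error` may still hold a non-fatal warning; freed later
}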

In my C++ code, I’m pulling the sample like this:

• Approach 1

    GstElement *sink = gst_bin_get_by_name(GST_BIN(gst_pipeline), "sink");

    tee_element = gst_bin_get_by_name(GST_BIN(gst_pipeline), "t"); // "t" is the tee name
    if (!tee_element)
    {
        std::cerr << "Tee element not found in pipeline!" << std::endl;
        return -1;
    }

    if (error) // error left over from gst_parse_launch()
        g_error_free(error);

    // Start pipeline
    gst_element_set_state((GstElement *)gst_pipeline, GST_STATE_PLAYING);

    // Wait for pipeline to reach PLAYING
    GstStateChangeReturn ret = gst_element_get_state(GST_ELEMENT(gst_pipeline),
                                                     NULL, NULL, GST_CLOCK_TIME_NONE);
    if (ret == GST_STATE_CHANGE_FAILURE) {
        std::cerr << "Failed to start pipeline" << std::endl;
        return -1;
    }

    while (true)
    {
        GstSample *sample = gst_app_sink_pull_sample(GST_APP_SINK(sink));
        if (!sample) break;

        GstBuffer *buffer = gst_sample_get_buffer(sample);
        GstMapInfo map = {0};
        if (!gst_buffer_map(buffer, &map, GST_MAP_READ))
        {
            gst_sample_unref(sample);
            continue;
        }

        // For memory:NVMM buffers this does not copy pixels: map.data points
        // at the NvBufSurface descriptor, not at frame data.
        NvBufSurface *surface = (NvBufSurface *)map.data;

        if (NvBufSurfaceMapEglImage(surface, 0) != 0)
        {
            std::cerr << "NvBufSurfaceMapEglImage failed. Skipping frame." << std::endl;
            gst_buffer_unmap(buffer, &map);
            gst_sample_unref(sample);
            continue;
        }

        CUresult status1;
        CUeglFrame eglFrame;
        CUgraphicsResource pResource = NULL;
        cudaFree(0); // no-op free that forces CUDA context creation on this thread

        if (!surface->surfaceList[0].mappedAddr.eglImage)
        {
            std::cerr << "EGL image is null. Skipping frame." << std::endl;
            NvBufSurfaceUnMapEglImage(surface, 0);
            gst_buffer_unmap(buffer, &map);
            gst_sample_unref(sample);
            continue;
        }

        status1 = cuGraphicsEGLRegisterImage(&pResource,
                                             surface->surfaceList[0].mappedAddr.eglImage,
                                             CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
        if (status1 != CUDA_SUCCESS)
        {
            printf("cuGraphicsEGLRegisterImage failed: %d\n", status1);
        }
        status1 = cuGraphicsResourceGetMappedEglFrame(&eglFrame, pResource, 0, 0);
        status1 = cuCtxSynchronize();

        if (undistort_frames)
        {
            inf.undistort(reinterpret_cast<uchar4 *>(eglFrame.frame.pPitch[0]), rectifiedimg, WIDTH, HEIGHT);
        }

       ...... SOME CUDA OPERATIONS

       ......

        inf.runInference(imgfinal, resize_mem, nchw, tmp, frame_number, mode, pitch, roll);

        status1 = cuCtxSynchronize();
        status1 = cuGraphicsUnregisterResource(pResource);

        // Release in reverse order of acquisition: the surface pointer is only
        // valid while the GstBuffer is still mapped and the sample referenced.
        NvBufSurfaceUnMapEglImage(surface, 0);
        gst_buffer_unmap(buffer, &map);
        gst_sample_unref(sample);
    }

• Approach 2

static GstPadProbeReturn
conv_src_pad_buffer_probe(GstPad *pad, GstPadProbeInfo *info,
                          gpointer u_data)
{
    // Early exit if stopping - return REMOVE to stop being called
    if (stopSignal.load()) {
        std::cout << "Probe: Stop signal received, removing probe\n";
        return GST_PAD_PROBE_REMOVE;
    }

    GstBuffer *buffer = GST_PAD_PROBE_INFO_BUFFER(info);
    GstMapInfo map = {0};
    gst_buffer_map(buffer, &map, GST_MAP_READ);

    // For memory:NVMM buffers, map.data points at the NvBufSurface descriptor.
    NvBufSurface *surface = (NvBufSurface *)map.data;

    // CUDA postprocess
    {
        AppContext *ctx = (AppContext *)u_data;

        if (NvBufSurfaceMapEglImage(surface, 0) != 0)
        {
            std::cerr << "NvBufSurfaceMapEglImage failed. Dropping frame." << std::endl;
            gst_buffer_unmap(buffer, &map);
            return GST_PAD_PROBE_DROP;
        }

        CUresult status;
        CUeglFrame eglFrame;
        CUgraphicsResource pResource = NULL;
        cudaFree(0); // no-op free that forces CUDA context creation on this thread

        if (!surface->surfaceList[0].mappedAddr.eglImage)
        {
            std::cerr << "EGL image is null. Dropping frame." << std::endl;
            NvBufSurfaceUnMapEglImage(surface, 0);
            gst_buffer_unmap(buffer, &map);
            return GST_PAD_PROBE_DROP;
        }

        status = cuGraphicsEGLRegisterImage(&pResource,
                                            surface->surfaceList[0].mappedAddr.eglImage,
                                            CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
        if (status != CUDA_SUCCESS)
        {
            printf("cuGraphicsEGLRegisterImage failed: %d\n", status);
        }
        status = cuGraphicsResourceGetMappedEglFrame(&eglFrame, pResource, 0, 0);
        status = cuCtxSynchronize();

        if (ctx->undistort_frames)
        {
            ctx->inference->undistort(reinterpret_cast<uchar4 *>(eglFrame.frame.pPitch[0]), ctx->rectifiedimg, ctx->width, ctx->height);
        }
       ...... SOME CUDA OPERATIONS

       ......

        ctx->inference->runInference(ctx->imgfinal, ctx->resize_mem, ctx->nchw, ctx->tmp, ctx->frame_number, ctx->mode, ctx->pitch, ctx->roll);

    
        status = cuCtxSynchronize();
        status = cuGraphicsUnregisterResource(pResource);

        NvBufSurfaceUnMapEglImage(surface, 0);
    }
    gst_buffer_unmap(buffer, &map);

    return GST_PAD_PROBE_OK;
}

int main() 
{
 ...
    tee_element = gst_bin_get_by_name(GST_BIN(gst_pipeline), "t"); // "t" is the tee name
    if (!tee_element)
    {
        std::cerr << "Tee element not found in pipeline!" << std::endl;
        return -1;
    }

    if (error) // error left over from gst_parse_launch()
        g_error_free(error);
    
    GstElement *conv = gst_bin_get_by_name(GST_BIN(gst_pipeline), "myconv");
    GstPad *src_pad = gst_element_get_static_pad(conv, "src");
    pad_probe_id = gst_pad_add_probe(src_pad, GST_PAD_PROBE_TYPE_BUFFER, conv_src_pad_buffer_probe, ctx, NULL);
    gst_object_unref(src_pad); 

    // Setup main loop and bus watch
    main_loop = g_main_loop_new(NULL, FALSE);
    ctx->main_loop2 = main_loop;

    // Start pipeline
    gst_element_set_state((GstElement *)gst_pipeline, GST_STATE_PLAYING);
    
    // Wait for pipeline to start
    GstStateChangeReturn ret = gst_element_get_state(GST_ELEMENT(gst_pipeline),
                                                    NULL, NULL, GST_CLOCK_TIME_NONE);
    if (ret == GST_STATE_CHANGE_FAILURE) {
        std::cerr << "Failed to start pipeline" << std::endl;
        return -1;
    }

    g_main_loop_run(main_loop);
     
     // Shutdown
    std::cout << "Graceful shutdown started..." << std::endl;

My questions:

  1. Am I correctly achieving zero-copy GPU access using this method? (I think I am not.)
  2. Is there a better or more reliable way to ensure that the buffer never touches CPU memory?
  3. I need to fetch the pointer as shown below, but via GStreamer directly. (I am trying to add pipelines dynamically, which jetson-utils doesn't support, so I started with DeepStream components.)

jetson-utils snippet: I want to do this, but directly via the GStreamer / DeepStream SDK.

#include <jetson-utils/videoSource.h>
#include <jetson-utils/cudaMappedMemory.h>
#include <jetson-utils/cudaNormalize.h>
#include <jetson-utils/cudaUtility.h>
#include <jetson-utils/cudaWarp.h>
#include <jetson-utils/cudaColorspace.h>
#include <jetson-utils/videoOptions.h>

int main(int argc, char **argv)
{
    cudaSetDevice(0);

    videoOptions opts;
    std::string video_name(argv[1]);
    if (video_name == "rear_cam")
    {
        char *fullpath = realpath("/dev/rear_cam", NULL);
        if (!fullpath)
        {
            printf("Failed to resolve /dev/rear_cam\n");
            return -1;
        }
        std::string resolvedStr(fullpath);
        free(fullpath);
        opts.resource = "v4l2://" + resolvedStr;
    }

    opts.width = 1920;
    opts.height = 1024;

    // Set framerate
    opts.frameRate = 30;

    // Set number of ring buffers
    opts.numBuffers = 8;

    // Enable zero-copy (recommended for Jetson)
    opts.zeroCopy = true;
    videoSource *video = videoSource::Create(opts);
    if (!video)
    {
        printf("Failed to create video source.\n");
        return -1;
    }

    uchar3 *img = NULL; // Capture() fills this with a CUDA-accessible pointer

    while (true)
    {
        if (!video->Capture(&img, 1000)) // 1000 = timeout in milliseconds
        {
            printf("No frame received, exiting...\n");
            break;
        }

        .... SEND TO INFERENCE >>>

        .....
    }
}

I am a newbie and not that great with this unified memory stuff :(

Thanks for your time if you have read this far,

Best,
Mohit

There is a lot of customized code here and we don’t know what happens inside it, so we can’t say whether your code is “zero-copy GPU access”. But the pipeline nvv4l2camerasrc device=/dev/video0 name=mysource ! video/x-raw(memory:NVMM), width=1920, height=1020, framerate=30/1 ! nvvidconv ! video/x-raw(memory:NVMM), format=RGBA ! tee name=t t. ! queue leaky=2 max-size-buffers=10 ! nvvidconv name=myconv ! video/x-raw(memory:NVMM),format=RGBA ! appsink name=sink emit-signals=true max-buffers=1 drop=true itself is “zero-copy GPU access”.

From your code, there is no CPU access up to this point.

You’ve already got the CUDA array in “eglFrame” (CUDA Driver API :: CUDA Toolkit Documentation). All your subsequent operations should be CUDA operations on the “pArray” in “eglFrame”; then you can guarantee that only the GPU is involved.
We don’t know what happens inside your “ctx->inference->undistort”, “ctx->inference->runInference”, … You need to check that yourself.
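
To illustrate the point, a minimal sketch of branching on the mapped frame type before running any CUDA operations (`eglFrame` is assumed to come from cuGraphicsResourceGetMappedEglFrame(); the kernel call is hypothetical):

#include <cuda.h>
#include <cudaEGL.h>
#include <cuda_runtime.h>

static void process_egl_frame(const CUeglFrame &eglFrame)
{
    if (eglFrame.frameType == CU_EGL_FRAME_TYPE_PITCH)
    {
        // Pitch-linear: frame.pPitch[0] is a device pointer a kernel can read directly.
        uchar4 *devPtr = static_cast<uchar4 *>(eglFrame.frame.pPitch[0]);
        // myRgbaKernel<<<grid, block>>>(devPtr, eglFrame.width, eglFrame.height,
        //                               eglFrame.pitch); // hypothetical kernel
        (void)devPtr;
    }
    else if (eglFrame.frameType == CU_EGL_FRAME_TYPE_ARRAY)
    {
        // Block-linear: frame.pArray[0] is a CUarray. Kernels cannot dereference it;
        // bind it to a texture/surface object or copy it out with cuMemcpy2D().
        CUarray arr = eglFrame.frame.pArray[0];
        (void)arr;
    }
}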

If your code “ctx->inference->runInference” is for inferencing, we suggest you deploy your inferencing model directly with the DeepStream APIs.

Please make sure your model is an ONNX model and that you know your model’s inputs and outputs well. Then you can follow the instructions in DeepStream SDK FAQ - Intelligent Video Analytics / DeepStream SDK - NVIDIA Developer Forums to prepare the nvinfer parameters. There are lots of ONNX model deployment samples in the DeepStream SDK; please refer to C/C++ Sample Apps Source Details — DeepStream documentation and NVIDIA-AI-IOT/deepstream_reference_apps: Samples for TensorRT/Deepstream for Tesla & Jetson.
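
To make that concrete, a minimal nvinfer configuration for an ONNX detector might look like the sketch below (file names, class count, and precision are placeholders, not values from this thread):

[property]
gpu-id=0
# placeholder paths; the engine file is generated on first run
onnx-file=model.onnx
model-engine-file=model.onnx_b1_gpu0_fp16.engine
labelfile-path=labels.txt
batch-size=1
# network-mode: 0=FP32, 1=INT8, 2=FP16
network-mode=2
num-detected-classes=4
gie-unique-id=1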

Hi, before handing it to the surface, do these two operations use CPU-side memory?

“If the buffer resides in GPU memory (e.g., memory:NVMM), mapping it for GST_MAP_READ often forces a copy to host-accessible memory (CPU RAM), breaking zero-copy.” ~ LLMs

Also, if not, which of these two approaches above is the optimal way to fetch: via appsink or via a callback probe?

GstBuffer is a GObject (GObject – 2.0), so that operation certainly runs on the CPU, but it has nothing to do with the buffer’s memory contents. The “map” here is not a memory map of the pixels. Please ignore that claim.
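
As an aside, a quick sanity check one could drop in right after gst_buffer_map() in either approach above (the fields used are from nvbufsurface.h):

// Sanity-check sketch: for memory:NVMM caps, map.data is the NvBufSurface
// descriptor, so map.size is the size of a small struct, nowhere near a
// full 1920x1020 RGBA frame (~7.8 MB).
NvBufSurface *surface = (NvBufSurface *)map.data;
g_print("map.size=%zu memType=%d width=%u pitch=%u\n",
        map.size,
        (int)surface->memType,          // NVBUF_MEM_SURFACE_ARRAY on Jetson
        surface->surfaceList[0].width,
        surface->surfaceList[0].pitch);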

What do you mean by “optimal”? These two ways are the same from the buffer-access point of view. Which one is better depends on your actual implementation.

So I cannot further optimise the way I am reading the pointer, right?

This uses NVINFER only; some resizing and pre/post-processing are on the GPU/CPU, but the main inference uses NVINFER with a TensorRT engine.

Thanks for your suggestion, I am already looking into this. I wanted to ask: how do I get output from the pipeline? For example, if after inference I want to hand the objects/detections to my tracker, how can I extract them from the pipeline? Can you link some resources?

Got that.
Last question: is there some GPU-accelerated element like appsink?

I am fairly new to GStreamer and the DeepStream SDK, and I might be asking really trivial questions. Thanks for your time!

Best,
Mohit

DeepStream already provides the “nvtracker” plugin to do object tracking inside the DeepStream pipeline. If you are not satisfied with the tracking algorithms (already hardware-accelerated) provided with it (Gst-nvtracker — DeepStream documentation), you can also customize your own nvtracker low-level tracking library (Gst-nvtracker — DeepStream documentation).
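
For orientation, a sketch of where nvtracker typically sits in such a pipeline (the nvinfer config path, mux dimensions, and display sink are placeholders, not values from this thread):

#include <gst/gst.h>

// Camera branch feeds nvstreammux, then nvinfer (detection) and nvtracker.
GstElement *build_tracking_pipeline()
{
    GError *error = NULL;
    GstElement *pipeline = gst_parse_launch(
        "nvv4l2camerasrc device=/dev/video0 ! video/x-raw(memory:NVMM) ! "
        "nvvidconv ! mux.sink_0 "
        "nvstreammux name=mux batch-size=1 width=1920 height=1020 ! "
        "nvinfer config-file-path=pgie_config.txt ! "
        "nvtracker ll-lib-file=/opt/nvidia/deepstream/deepstream/lib/"
        "libnvds_nvmultiobjecttracker.so ! "
        "nvvideoconvert ! nvdsosd ! nv3dsink",
        &error);
    if (error) g_error_free(error);
    return pipeline;
}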

All the inferencing results are stored in NvDsMetadata (MetaData in the DeepStream SDK — DeepStream documentation).
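
For illustration, a sketch (not from this thread; probe placement and the tracker hand-off are assumptions) of reading detections from the batch metadata in a pad probe downstream of nvinfer/nvtracker:

#include <gst/gst.h>
#include "gstnvdsmeta.h"

static GstPadProbeReturn
meta_probe(GstPad *pad, GstPadProbeInfo *info, gpointer u_data)
{
    GstBuffer *buf = GST_PAD_PROBE_INFO_BUFFER(info);
    NvDsBatchMeta *batch_meta = gst_buffer_get_nvds_batch_meta(buf);
    if (!batch_meta)
        return GST_PAD_PROBE_OK;

    // Walk frames in the batch, then objects in each frame.
    for (NvDsMetaList *lf = batch_meta->frame_meta_list; lf; lf = lf->next) {
        NvDsFrameMeta *frame_meta = (NvDsFrameMeta *)lf->data;
        for (NvDsMetaList *lo = frame_meta->obj_meta_list; lo; lo = lo->next) {
            NvDsObjectMeta *obj = (NvDsObjectMeta *)lo->data;
            g_print("frame %d: class %d conf %.2f bbox (%.0f, %.0f, %.0f, %.0f)\n",
                    frame_meta->frame_num, obj->class_id, obj->confidence,
                    obj->rect_params.left, obj->rect_params.top,
                    obj->rect_params.width, obj->rect_params.height);
            // ... hand obj->rect_params / obj->object_id to your tracker here
        }
    }
    return GST_PAD_PROBE_OK;
}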

Please read the DeepStream SDK user manual to get familiar with the DeepStream functions and APIs. It is not convenient to introduce them one by one in a forum post; please start with the documentation and samples.

There is no update from you for a period, so we assume this is no longer an issue and are closing this topic. If you need further support, please open a new one. Thanks.
