Passing NVMM frames to Gstreamer appsink to apply custom processing

Hello There,

TL;DR

How do you access NVMM buffers from nvarguscamerassrc within an appsink callback for furthrue processing with CUDA?

In Detail

I’m trying to port a computer vision application on to the Jetson Xavier AGX platform.

The use-case focuses on low latency and high throughput so we’re looking to implement an efficient and low overhead processing pipeline using NVIDIA’s libraries.

the basic architecture should look like this

nvarguscamerassrc (memory:NVMM) -> nvvideoconvert (memory:NVMM) -> appsink 
-> custon_algorithm
-> appsrc -> nvegltransform -> nveglglessink

Implementation is done with the GStreamer C API and I’m having trouble with figuring out how to access these NVMM buffers that are provided by the nvarguscamerasrc / nvvideoconvert plugins

I find the sample apps source / documentation around these plugins very unsatisfactory with regards of how to directly access these buffers.

For simplicity’s sake, custon_algorithm can be replaced with a simple color space transformation from the NPP library. nppiRGBToGray_8u_C3C1R.

I don’t know what is the correct way to access the NVMM buffers from the GstBuffer pointer I have.

Source
#include <gst/app/app.h>
#include <gst/gst.h>
#include <iostream>
#include <npp.h>
#include <nvbufsurface.h>
#include <opencv2/highgui.hpp>
#include <opencv4/opencv2/core.hpp>
#include <opencv4/opencv2/opencv.hpp>
#include <string>

namespace Globals {
    Npp8u *dst_addr;
    void *mapped_host_mem;
    const int WIDTH = 1280;
    const int HEIGHT = 720;
    const int FPS = 120;
    const auto U8 = sizeof(Npp8u);
    const auto DST_STEP = WIDTH;
    const auto DST_DIM_SIZE = HEIGHT * WIDTH;
    const auto DST_MEM_SIZE = U8 * DST_DIM_SIZE;
    const auto SRC_STEP = DST_STEP * 3;
    NppiSize cudaROI = {WIDTH, HEIGHT};
    const auto frame_size = cv::Size(WIDTH, HEIGHT);
    cv::VideoWriter *display = nullptr;
}

using namespace Globals;


GstFlowReturn new_sample(GstAppSink *sink, gpointer user_data) {
    GstSample *sample = gst_app_sink_pull_sample(sink);
    if (sample == nullptr) {
        return GST_FLOW_ERROR;
    }
    GstBuffer *buffer = gst_sample_get_buffer(sample);
    NvBufSurface *nvbuf = (NvBufSurface *) buffer;
    NvBufSurfaceParams *nv_buf_params = &nvbuf->surfaceList[0];
    Npp8u *src_addr = (Npp8u *) nv_buf_params->mappedAddr.addr[0];

    // ---> custom algorithm goes  here <---
    nppiRGBToGray_8u_C3C1R(src_addr, SRC_STEP, dst_addr, DST_STEP, cudaROI);

    cv::Mat ref_img = cv::Mat(HEIGHT, WIDTH, CV_8UC1, dst_addr, DST_STEP);
    *display << ref_img;
    gst_sample_unref(sample);
    return GST_FLOW_OK;
}

gboolean bus_call(GstBus *bus, GstMessage *msg, gpointer data) {
    auto *loop = (GMainLoop *) data;
    switch (GST_MESSAGE_TYPE (msg)) {
        case GST_MESSAGE_EOS:
            g_print("End of stream\n");
            g_main_loop_quit(loop);
            break;
        case GST_MESSAGE_ERROR: {
            gchar *debug;
            GError *error;
            gst_message_parse_error(msg, &error, &debug);
            g_printerr("ERROR from element %s: %s\n", GST_OBJECT_NAME (msg->src), error->message);
            if (debug) g_printerr("Error details: %s\n", debug);
            g_free(debug);
            g_error_free(error);
            g_main_loop_quit(loop);
            break;
        }
        default:
            break;
    }
    return TRUE;
}

void cleanup(GMainLoop *loop, GstElement *pipeline, guint bus_watch_id) {
    gst_element_set_state(pipeline, GST_STATE_NULL);
    gst_object_unref(GST_OBJECT (pipeline));
    g_source_remove(bus_watch_id);
    g_main_loop_unref(loop);
    cudaFreeHost(Globals::mapped_host_mem);
}

void start_pipeline() {
    display = new cv::VideoWriter(
            "appsrc ! nvvidconv ! nvegltransform ! nveglglessink",
            cv::CAP_GSTREAMER,
            cv::VideoWriter::fourcc('r', 'a', 'w', ' '),
            FPS,
            frame_size,
            true
    );
    auto err = cudaHostAlloc(&Globals::mapped_host_mem, DST_MEM_SIZE, cudaHostAllocMapped);
    if (err || !mapped_host_mem) {
        std::cout << "nullptr! Failed to allocate memory for buffer" << std::endl;
        return;
    }
    Globals::dst_addr = (Npp8u *) Globals::mapped_host_mem;
    std::cout << "Init Gstreamer" << std::endl;

    GstBus *bus = nullptr;
    gst_init(nullptr, nullptr);
    GMainLoop *loop = g_main_loop_new(nullptr, false);
    GstElement *pipeline = gst_pipeline_new("capture-pipeline");
    if (!pipeline) { std::cout << "Failed to create pipeline" << std::endl; }
    GstElement *source = gst_element_factory_make("nvarguscamerasrc", "ArgusCamera0");
    g_object_set(G_OBJECT (source), "sensor-mode", 2, NULL);
    GstElement *nvvidconv = gst_element_factory_make("nvvideoconvert", "nvvidconv0");
    GstElement *sink = gst_element_factory_make("appsink", "appsink0");

    g_object_set(sink, "emit-signals", TRUE, "async", FALSE, NULL);
    auto *appsink_callbacks = new GstAppSinkCallbacks();
    appsink_callbacks->eos = nullptr;
    appsink_callbacks->new_preroll = nullptr;
    appsink_callbacks->new_sample = new_sample;
    gst_app_sink_set_callbacks(GST_APP_SINK(sink), appsink_callbacks, (gpointer) nullptr, free);

    bus = gst_pipeline_get_bus(GST_PIPELINE (pipeline));
    auto bus_watch_id = gst_bus_add_watch(bus, bus_call, loop);
    gst_object_unref(bus);


    gst_bin_add_many(GST_BIN(pipeline), source, nvvidconv, sink, nullptr);
    gboolean link_success = gst_element_link_many(source, nvvidconv, sink, nullptr);
    if (!link_success) {
        g_printerr("Elements could not be linked: 1. Exiting.\n");
        cleanup(loop, pipeline, bus_watch_id);
        return;
    }

    gst_element_set_state(pipeline, GST_STATE_PLAYING);
    g_main_loop_run(loop);
    cleanup(loop, pipeline, bus_watch_id);
}


int main(int argc, char **argv) {
    std::cout.precision(4);
    cudaDeviceProp cuda_prop{};
    cudaGetDeviceProperties(&cuda_prop, 0);
    if (!cuda_prop.canMapHostMemory)
        exit(EXIT_FAILURE);
    start_pipeline();
    return 0;
}

As you can see from the output below, i’m experiencing a segmentation fault, probably for accessing uncharted CPU /GPU memory.

Output
Using winsys: x11 
Init Gstreamer
GST_ARGUS: Creating output stream
CONSUMER: Waiting until producer is connected...
GST_ARGUS: Available Sensor modes :
GST_ARGUS: 2592 x 1944 FR = 29.999999 fps Duration = 33333334 ; Analog Gain range min 1.000000, max 16.000000; Exposure Range min 34000, max 550385000;

GST_ARGUS: 2592 x 1458 FR = 29.999999 fps Duration = 33333334 ; Analog Gain range min 1.000000, max 16.000000; Exposure Range min 34000, max 550385000;

GST_ARGUS: 1280 x 720 FR = 120.000005 fps Duration = 8333333 ; Analog Gain range min 1.000000, max 16.000000; Exposure Range min 22000, max 358733000;

GST_ARGUS: Running with following settings:
   Camera index = 0 
   Camera mode  = 2 
   Output Stream W = 1280 H = 720 
   seconds to Run    = 0 
   Frame Rate = 120.000005 
GST_ARGUS: Setup Complete, Starting captures for 0 seconds
GST_ARGUS: Starting repeat capture requests.
CONSUMER: Producer has connected; continuing.
[1]    13109 Segmentation fault      (core dumped) ./app

Many thanks in advance.

Hi,
You can consider to run

nvarguscamerassrc (memory:NVMM) -> nvvideoconvert (memory:NVMM) -> dsexample -> nvegltransform -> nveglglessink

And apply your custom code in dsexample. The sample code is in

deepstream-5.0\sources\gst-plugins\gst-dsexample
2 Likes

Hi soof,

You may found interesting the following information about the GstCUDA framework, I think that is exactly what you are looking for. Below you will find a more detailed description, but in summary, it consists of a framework that allows to easily and optimally interface GStreamer with CUDA, guaranteeing zero memory copies and direct consumption of NVMM buffers.

You can execute any custom algorithm accelerated with CUDA to process the incoming frames and re-inject the resultant processed frames to video pipeline without the need of appsink/appsrc elements.

GstCUDA handles under the hood all the complexity and give CUDA accessible pointers to the data of the incoming frames to be processed by your custom algorithm. You can develop your own CUDA kernels or use CUDA accelerated libraries like NPP for your custom algorithm.

GstCUDA is a RidgeRun developed GStreamer plug-in enabling easy CUDA algorithm integration into GStreamer pipelines. GstCUDA offers a framework that allows users to develop custom GStreamer elements that execute any CUDA algorithm. The GstCUDA framework is a series of base classes abstracting the complexity of both CUDA and GStreamer. With GstCUDA, developers avoid writing elements from scratch, allowing the developer to focus on the algorithm logic, thus accelerating time to market.

GstCUDA offers a GStreamer plugin that contains a set of elements, that are ideal for GStreamer/CUDA quick prototyping. Those elements consist in a set of filters with different input/output pads combinations, that are run-time loadable with an external custom CUDA library that contains the algorithm to be executed on the GPU on each video frame that passes through the pipeline. GstCUDA plugin allows users to develop their own CUDA processing library, pass the library into the GstCUDA filter element that best adapts to the algorithm requirements, executes the library on the GPU, passing upstream frames from the GStreamer pipeline to the GPU and passing the modified frames downstream to the next element in the GStreamer pipeline. Those elements were created with the CUDA algorithm developer in mind - supporting quick prototyping and abstracting all GStreamer concepts. The elements are fully adaptable to different project needs, making GstCUDA a powerful tool that is essential for CUDA/GStreamer project development.

One remarkable feature of GstCUDA is that it provides a zero memory copy interface between CUDA and GStreamer on etson TX1/TX2/Nano/Xaviver-AGX/Xavier-NX platforms. This enables heavy algorithms and large amounts of data (up to 2x 4K 60fps streams) to be processed on CUDA without the performance caused by copies or memory conversions. GstCUDA provides the necessary APIs to directly handle NVMM buffers to achieve the best possible performance on Jetson TX1/TX2/Nano/Xaviver-AGX/Xavier-NX platforms. It provides a series of base classes and utilities that abstract the complexity of handle memory interface between GStreamer and CUDA, so the developer can focus on what actually gives value to the end product. GstCuda ensures an optimal performance for GStreamer/CUDA applications on Jetson platforms.

You can find detailed information about GstCUDA on the following link:

I hope this information can be useful to you.

Best regards,
-Daniel

1 Like