How to get a CUDA GpuMat into GStreamer?

Hello All.

I am new to GStreamer and am struggling to understand the basics.

After some research I learned about pipelines and how to use gst-inspect-1.0 to check for compatibility between elements.

I want to create a stream from my Jetson to another machine, in this case an Ubuntu machine on the same local network.

I managed to get a stream going from the default videotestsrc with the following two commands:

Jetson terminal:
gst-launch-1.0 videotestsrc ! videoscale ! video/x-raw,width=800,height=600 ! x264enc tune=zerolatency ! rtph264pay ! udpsink host=192.168.2.63 port=5000

Ubuntu terminal:
gst-launch-1.0 udpsrc port=5000 caps="application/x-rtp, media=video, clock-rate=90000, encoding-name=H264, payload=96" ! rtph264depay ! avdec_h264 ! autovideosink

This will open a new window on my ubuntu machine with the test video.

I was very happy to see it working. But of course I don't want to stream the videotestsrc but a live camera. For reasons outside the scope of this post I needed to work on the camera output, and I now have the image in a cv::cuda::GpuMat. I can see the camera output using cv::imshow().

I searched this forum on how to get a GpuMat into a GStreamer pipeline, but the answers were very complicated, and with my limited GStreamer experience I couldn't tell if and how I could make this work with my existing codebase.

If not possible otherwise, I could copy to CPU memory into a cv::Mat, but I would like to avoid that method as it would hurt my framerate significantly.

I have a working example of what my code looks like right now: just a simple loop with a getImage() that in the end lands in a GpuMat. I can download it into a CPU mat, making it a cv::Mat instead of a cv::cuda::GpuMat, but this would reduce the framerate a lot. However, I am so desperate I would also accept using the CPU mat…

I have added a test pipe that just shows me the videotestsrc. Somehow I need to push my matrix into that pipe, but I don't know how.
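From the examples I could find, my rough (and untested) understanding is that I would replace the videotestsrc with an appsrc and push buffers into it by hand, something like this minimal sketch (assuming a BGR cv::Mat on the CPU; width, height, framerate and host are just the placeholder values from my test pipeline):

#include <opencv2/core.hpp>
#include <gst/gst.h>
#include <gst/app/gstappsrc.h>

int main(int argc, char *argv[]){
  gst_init(&argc, &argv);
  const int width = 800, height = 600, fps = 30;

  // appsrc replaces videotestsrc; the caps must describe the raw frames we push
  GstElement *pipeline = gst_parse_launch(
      "appsrc name=mysrc is-live=true format=time "
      "caps=video/x-raw,format=BGR,width=800,height=600,framerate=30/1 "
      "! videoconvert ! x264enc tune=zerolatency ! rtph264pay "
      "! udpsink host=192.168.2.63 port=5000", NULL);
  GstElement *appsrc = gst_bin_get_by_name(GST_BIN(pipeline), "mysrc");
  gst_element_set_state(pipeline, GST_STATE_PLAYING);

  // stand-in for the frame that gpu_mat.download() would produce
  cv::Mat cpuFrame(height, width, CV_8UC3, cv::Scalar(0, 0, 0));

  for (int i = 0; i < 150; i++){
    gsize size = cpuFrame.total() * cpuFrame.elemSize();
    GstBuffer *buffer = gst_buffer_new_allocate(NULL, size, NULL);
    gst_buffer_fill(buffer, 0, cpuFrame.data, size);                    // copy the frame into the buffer
    GST_BUFFER_PTS(buffer) = gst_util_uint64_scale(i, GST_SECOND, fps);
    GST_BUFFER_DURATION(buffer) = gst_util_uint64_scale(1, GST_SECOND, fps);
    if (gst_app_src_push_buffer(GST_APP_SRC(appsrc), buffer) != GST_FLOW_OK)  // takes ownership of buffer
      break;
  }

  gst_app_src_end_of_stream(GST_APP_SRC(appsrc));
  gst_element_set_state(pipeline, GST_STATE_NULL);
  gst_object_unref(appsrc);
  gst_object_unref(pipeline);
  return 0;
}

Is that the right direction? And even then I don't see how the GpuMat could go in without downloading it to the CPU first.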

Here is the code I currently have:

#include <m3api/xiApi.h>
#include <iostream>
#include <cstring>                   // memset
#include <opencv2/highgui.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudaarithm.hpp>
#include <cuda_runtime.h>
#include <chrono>
#include <gst/gst.h>
#include <gst/app/gstappsrc.h>

#define NUMBER_OF_IMAGES 150
// Define parameters for a static white balance
#define WB_BLUE 2
#define WB_GREEN 1
#define WB_RED 1.3

using namespace std;

int main(int argc, char *argv[]){

  // Initialize XI_IMG structure
  XI_IMG image;
  memset(&image, 0, sizeof(XI_IMG));
  image.size = sizeof(XI_IMG);

  HANDLE xiH = NULL;
  XI_RETURN stat = XI_OK;

  bool downsampling = true;
  double fps_counter = 0;

  try{
    // Get device handle for the camera
    stat = xiOpenDevice(0, &xiH);
    if (stat != XI_OK)
      throw "Opening device failed";

    // camera settings
    int OCVbayer = cv::COLOR_BayerBG2BGR;                                     // Set demosaicing type
    if(downsampling == true){xiSetParamInt(xiH, XI_PRM_DOWNSAMPLING, 2);}     // Activate downsampling if needed
    xiSetParamInt(xiH, XI_PRM_IMAGE_DATA_FORMAT, XI_FRM_TRANSPORT_DATA);      // Use transport data format (so we can debayer on GPU)
    xiSetParamInt(xiH, XI_PRM_TRANSPORT_DATA_TARGET, XI_TRANSPORT_DATA_TARGET_ZEROCOPY);    // Stream camera data to zero-copy memory
    xiSetParamInt(xiH, XI_PRM_OUTPUT_DATA_BIT_DEPTH, 8);     // Using 8-bit images here
    xiSetParamInt(xiH, XI_PRM_EXPOSURE, 30 * 1000);          // Exposure in microseconds (30 ms)
    int width = -1;
    xiGetParamInt(xiH, XI_PRM_WIDTH, &width);                // Get width of image
    int height = -1;
    xiGetParamInt(xiH, XI_PRM_HEIGHT, &height);              // Get height of image

    // Start the image acquisition
    xiStartAcquisition(xiH);

    // Define pointer used for data on GPU
    // Create GpuMat for the result images
    void *imageGpu;
    cv::cuda::GpuMat gpu_mat(height, width, CV_8UC3);

    /*
    GSTREAMER
    */
    GstElement *pipeline;
    GstBus *bus;
    GstMessage *msg;

    gst_init(&argc, &argv);

    // build the pipeline here
    // somehow I want to push the GpuMat into this via appsrc
    pipeline = gst_parse_launch("videotestsrc ! videoscale ! video/x-raw,width=800,height=600 ! x264enc tune=zerolatency ! rtph264pay ! udpsink host=192.168.2.63 port=5000", NULL);

    // set it to start playing
    gst_element_set_state(pipeline, GST_STATE_PLAYING);
    
    //get the bus
    bus = gst_element_get_bus(pipeline);

    /*
    Gstreamer
    */

    // Acquire a number of images, process and render them
    //start timer
    auto begin = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < NUMBER_OF_IMAGES; i++){

      // get the image from the camera
      xiGetImage(xiH, 5000, &image);      // Get host-pointer to image data
      cudaHostGetDevicePointer(&imageGpu, image.bp, 0);      // Convert to device pointer
      cv::cuda::GpuMat gpu_mat_raw(height, width, CV_8UC1, imageGpu);       // Create GpuMat from the device pointer
      cv::cuda::demosaicing(gpu_mat_raw, gpu_mat, OCVbayer);       // Demosaic raw bayer image to color image
      cv::cuda::multiply(gpu_mat, cv::Scalar(WB_BLUE, WB_GREEN, WB_RED), gpu_mat);       // Apply static white balance by multiplying the channels

      // download to CPU mat
      // very slow and I want to avoid this
      //cv::Mat cpuFrame;
      //gpu_mat.download(cpuFrame);

      // Render image to the screen (using OpenGL)
      // I don't want to render to the screen, I want to stream
      //cv::imshow("XIMEA camera", gpu_mat);
      //cv::imshow("XIMEA camera", cpuFrame);

      fps_counter++;
      cv::waitKey(1);
    }

    //measure end time
    auto final = std::chrono::high_resolution_clock::now();
    //calc delta time and fps
    auto delta = std::chrono::duration_cast<std::chrono::milliseconds>(final-begin);
    double d = std::chrono::duration<double>(delta).count();
    double fps = fps_counter/d;
    if(downsampling){
      cout << "Downsampling 2X2 activated" << endl;
    }
    cout<<"FPS: "<< fps << endl;
    cout<<"Frame Count: "<< fps_counter << endl;
    cout<<"Time needed: "<< delta.count()/1000.0 << endl;

    // Stop image acquisition and close device
    xiStopAcquisition(xiH);
    xiCloseDevice(xiH);

    // free gstreamer resources
    gst_object_unref(bus);
    gst_element_set_state(pipeline, GST_STATE_NULL);
    gst_object_unref(pipeline);

    // Print errors
  }catch(const char* message){
    std::cerr << message << std::endl;
  }
}

Hi,
Please refer to the sample:

The script is updated:
https://github.com/AastaNV/JEP/blob/master/script/install_opencv4.5.0_Jetson.sh

Thank you for your answer. I already have OpenCV 4.5:

>>> import cv2
>>> cv2.__version__
'4.5.1-pre'

But trying to build gst_cv_gpumat.cpp I get the following error:

$ make all
Compiling: gst_cv_gpumat.cpp
g++ -I/usr/src/tegra_multimedia_api/include -I/usr/local/cuda/include -pthread -I/usr/local/include/opencv4 -I/usr/include/gstreamer-1.0 -I/usr/include/glib-2.0 -I/usr/lib/aarch64-linux-gnu/glib-2.0/include -c gst_cv_gpumat.cpp -o gst_cv_gpumat.o
gst_cv_gpumat.cpp:6:10: fatal error: nvbuf_utils.h: No such file or directory
 #include "nvbuf_utils.h"
          ^~~~~~~~~~~~~~~
compilation terminated.
Makefile:44: recipe for target 'gst_cv_gpumat.o' failed
make: *** [gst_cv_gpumat.o] Error 1

I tried finding it with the find command, but nothing turned up.

The path to the Jetson Multimedia API has changed. Turn -I/usr/src/tegra_multimedia_api/include into -I/usr/src/jetson_multimedia_api/include.
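With that change, the compile line from your log would become (assuming the rest of the Makefile stays the same):

g++ -I/usr/src/jetson_multimedia_api/include -I/usr/local/cuda/include -pthread -I/usr/local/include/opencv4 -I/usr/include/gstreamer-1.0 -I/usr/include/glib-2.0 -I/usr/lib/aarch64-linux-gnu/glib-2.0/include -c gst_cv_gpumat.cpp -o gst_cv_gpumat.o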

Depending on your case, you may also try the GStreamer NV plugin nvivafilter. You may find some examples by searching this forum.
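For example, something along these lines (an illustrative pipeline only; libnvsample_cudaprocess.so is the default sample library, which you would replace with your own CUDA processing code):

gst-launch-1.0 videotestsrc ! nvvidconv ! 'video/x-raw(memory:NVMM),format=NV12' ! nvivafilter cuda-process=true customer-lib-name=libnvsample_cudaprocess.so ! 'video/x-raw(memory:NVMM),format=RGBA' ! nvoverlaysink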


I don't have that folder either:

$ ls /usr/src/
cudnn_samples_v7  linux-headers-4.9.140-tegra-linux_x86_64         tensorrt
cudnn_samples_v8  linux-headers-4.9.140-tegra-ubuntu18.04_aarch64
glibc             nvidia

I used the following command to verify that it's nowhere:

$ sudo find / -name "*multimedia_api*"

only output:

find: ‘/run/user/1000/gvfs’: Permission denied

So it's not there.

EDIT:

I have seen that the Multimedia API is an option in the NVIDIA SDK Manager.

However, I can't just install that one alone; I would have to reinstall OpenCV 4.1.1 alongside it. Is there a way to install only the Multimedia API?

EDIT:

Am I correct in the assumption that I have to install the Multimedia API via the SDK Manager? This will take a lot of time and will probably break my OpenCV install, which I would then also have to fix and rebuild. I want to be sure before I embark on that path.

EDIT:

I have a second Jetson TX2 lying around here. I just copied
/usr/src/jetson_multimedia_api/
from one to the other and recreated the symlinks. Everything seems to work fine. Any way to test this to be sure?

gst_cv_gpumat now builds without errors after applying the changes suggested by Honey_Patouceul. Thanks again for your top-notch feedback.

When I run gst_cv_gpumat I get an error that no camera was found. What kind of camera do I need? I have a PCIe cam from XIMEA (which does not have any Linux drivers and comes with its own API, so I think it just can't find it):

$ ./gst_cv_gpumat 
Using launch string: nvarguscamerasrc name=mysource ! video/x-raw(memory:NVMM),width=1920,height=1080,framerate=30/1,format=NV12 ! nvvidconv name=myconv ! video/x-raw(memory:NVMM),format=RGBA ! nvoverlaysink 
Error generated. /dvs/git/dirty/git-master_linux/multimedia/nvgstreamer/gst-nvarguscamera/gstnvarguscamerasrc.cpp, execute:521 No cameras available

I also have another USB cam which I got for $3. When I plug in the cheap USB cam, the program fails with this output:

$ ./gst_cv_gpumat 
Using launch string: nvarguscamerasrc name=mysource ! video/x-raw(memory:NVMM),width=1920,height=1080,framerate=30/1,format=NV12 ! nvvidconv name=myconv ! video/x-raw(memory:NVMM),format=RGBA ! nvoverlaysink 
GST_ARGUS: Creating output stream
CONSUMER: Waiting until producer is connected...
GST_ARGUS: Available Sensor modes :
Segmentation fault (core dumped)

Could this be because my cheap camera is not suitable for this? It crashes after trying to list the available sensor modes, and I don't think my cheap webcam even has anything like that at all. Or is there something wrong with my approach of just copying the Multimedia API?

Does this only work with CSI cameras? I have a PCIe camera.

Not sure how easy or even possible it is to use Argus for a PCIe camera; this may require paid support.
You may use the XIMEA/OpenCV example from post #2 that reads the image into pinned memory from your proprietary camera software, then uses the GPU for debayering.
If you want to perform GPU processing, you may do it in this application after debayering.
If you want to perform CPU processing, you may do that after downloading to a CPU frame.
When processing is done, you may use an OpenCV VideoWriter for streaming the processed frames.
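As a minimal sketch of that last step (assuming your OpenCV build has GStreamer support; the pipeline string, size and framerate are placeholders to adapt):

#include <opencv2/core.hpp>
#include <opencv2/videoio.hpp>

int main(){
  const int width = 1920, height = 1080;       // must match the frames you write

  // appsrc-fed writer pipeline; encoder and sink are placeholders
  cv::VideoWriter writer(
      "appsrc ! videoconvert ! x264enc tune=zerolatency ! rtph264pay "
      "! udpsink host=192.168.2.63 port=5000",
      cv::CAP_GSTREAMER, 0, 30.0, cv::Size(width, height), true);
  if (!writer.isOpened())
    return -1;

  cv::Mat frame(height, width, CV_8UC3, cv::Scalar(0, 0, 0));   // your downloaded CPU frame goes here
  for (int i = 0; i < 300; i++)
    writer.write(frame);                       // one BGR frame per call
  writer.release();
  return 0;
}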


Hi,
In general, we install the Jetson OS and SDK components through SDK Manager and then execute the script:
JEP/install_opencv4.5.0_Jetson.sh at master · AastaNV/JEP · GitHub

Would suggest you try this.

Thank you for your reply. I already have OpenCV 4.5.1 installed. Why are you suggesting I reinstall it? That is not what this topic is about.

I have a cv::cuda::GpuMat and want to get it into GStreamer. I already have all required libraries.

Hello Honey!

I love reading your posts. Always top notch.

I have it working with cv::VideoWriter, but it is very slow. My camera can do 30 FPS with my settings, but the stream is below 5 FPS. I think it's because the VideoWriter is using the CPU to encode to H.264.

I struggled to make this work at all and am at the edge of my knowledge. Is there an obvious thing I missed? Maybe I am doing something in the GStreamer string that slows down the whole thing?

If it helps: the stream will only be sent over a direct Ethernet cable. Bandwidth is not the bottleneck and never will be. Can I somehow do less compression to make it faster, given that bandwidth is no concern?

Here is the code for anyone who finds this via Google:

#include <m3api/xiApi.h>
#include <iostream>
#include <cstring>                   // memset
#include <opencv2/highgui.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/videoio.hpp>
#include <opencv2/cudaarithm.hpp>
#include <cuda_runtime.h>
#include <chrono>

#include <gst/gst.h>
#include <gst/app/gstappsrc.h>

#include "nvbuf_utils.h"
#include <opencv2/cudacodec.hpp>

#define NUMBER_OF_IMAGES 600
// Define parameters for a static white balance
#define WB_BLUE 2
#define WB_GREEN 1
#define WB_RED 1.3

using namespace std;

int main(int argc, char *argv[]){

  // Initialize XI_IMG structure
  XI_IMG image;
  memset(&image, 0, sizeof(XI_IMG));
  image.size = sizeof(XI_IMG);

  HANDLE xiH = NULL;
  XI_RETURN stat = XI_OK;

  bool downsampling = false;
  double frame_counter = 0;

  try{
    // Get device handle for the camera
    stat = xiOpenDevice(0, &xiH);
    if (stat != XI_OK)
      throw "Opening device failed";

    // camera settings
    int OCVbayer = cv::COLOR_BayerBG2BGR;                                     // Set demosaicing type
    if(downsampling == true){xiSetParamInt(xiH, XI_PRM_DOWNSAMPLING, 2);}     // Activate downsampling if needed
    xiSetParamInt(xiH, XI_PRM_IMAGE_DATA_FORMAT, XI_FRM_TRANSPORT_DATA);      // Use transport data format (so we can debayer on GPU)
    xiSetParamInt(xiH, XI_PRM_TRANSPORT_DATA_TARGET, XI_TRANSPORT_DATA_TARGET_ZEROCOPY);    // Stream camera data to zero-copy memory
    xiSetParamInt(xiH, XI_PRM_OUTPUT_DATA_BIT_DEPTH, 8);     // Using 8-bit images here
    xiSetParamInt(xiH, XI_PRM_EXPOSURE, 30 * 1000);          // Exposure in microseconds (30 ms)
    int width = -1;
    xiGetParamInt(xiH, XI_PRM_WIDTH, &width);                // Get width of image
    int height = -1;
    xiGetParamInt(xiH, XI_PRM_HEIGHT, &height);              // Get height of image

    // Start the image acquisition
    xiStartAcquisition(xiH);

    // Define pointer used for data on GPU
    // Create GpuMat for the result images
    void *imageGpu;
    cv::cuda::GpuMat gpu_mat(height, width, CV_8UC3);
    
    cv::VideoWriter writer;
    //cv::Ptr<cv::cudacodec::VideoWriter> d_writer;
    writer.open("appsrc ! videoconvert ! videoscale ! video/x-raw,width=400,height=600 ! x264enc threads=6 ! mpegtsmux ! udpsink host=127.0.0.1 port=5000", 0, (double)100, cv::Size(2056, 1504), true);

//receiver gst-launch-1.0 -ve udpsrc port=5000 ! tsparse ! tsdemux ! h264parse ! avdec_h264 ! videoconvert ! autovideosink


    if (!writer.isOpened()) {
        printf("video writer can't be opened\n");
        return -1;
    }

    // Acquire a number of images, process and render them
    //start timer for fps calculation
    auto begin = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < NUMBER_OF_IMAGES; i++){

      // get the image from the camera
      xiGetImage(xiH, 5000, &image);      // Get host-pointer to image data
      cudaHostGetDevicePointer(&imageGpu, image.bp, 0);      // Convert to device pointer
      cv::cuda::GpuMat gpu_mat_raw(height, width, CV_8UC1, imageGpu);       // Create GpuMat from the device pointer
      cv::cuda::demosaicing(gpu_mat_raw, gpu_mat, OCVbayer);       // Demosaic raw bayer image to color image
      cv::cuda::multiply(gpu_mat, cv::Scalar(WB_BLUE, WB_GREEN, WB_RED), gpu_mat);       // Apply static white balance by multiplying the channels

      // download to CPU mat
      // very slow and I want to avoid this
      cv::Mat cpuFrame;
      gpu_mat.download(cpuFrame);

      writer << cpuFrame;
      //writer << gpu_mat; // doesn't work; VideoWriter expects a CPU Mat

      frame_counter++;
      cv::waitKey(1);
    }

    //measure end time for fps calculation
    auto final = std::chrono::high_resolution_clock::now();
    //calc delta time and fps
    auto delta = std::chrono::duration_cast<std::chrono::milliseconds>(final-begin);
    double d = std::chrono::duration<double>(delta).count();
    double fps = frame_counter/d;
    if(downsampling){
      cout << "Downsampling 2X2 activated" << endl;
    }
    cout<<"FPS: "<< fps << endl;
    cout<<"Frame Count: "<< frame_counter << endl;
    cout<<"Time needed: "<< delta.count()/1000.0 << endl;

    // Stop image acquisition and close device
    xiStopAcquisition(xiH);
    xiCloseDevice(xiH);

    // Print errors
  }catch(const char* message){
    std::cerr << message << std::endl;
  }
}

Yes, your writer pipeline uses x264enc, which is CPU-only. You may try this for RTP/UDP streaming.
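For instance, creating the writer with the Jetson HW encoder could look like this (a sketch only; caps, host and size are illustrative and may need adjusting):

// Sketch: swap the CPU x264enc for the HW encoder nvv4l2h264enc.
// width/height must match the frames you write.
cv::VideoWriter writer(
    "appsrc ! video/x-raw, format=BGR ! queue ! videoconvert "
    "! video/x-raw, format=BGRx ! nvvidconv ! nvv4l2h264enc insert-vui=1 "
    "! h264parse ! rtph264pay pt=96 config-interval=1 "
    "! udpsink host=224.1.2.1 port=5000 auto-multicast=true",
    cv::CAP_GSTREAMER, 0, 30.0, cv::Size(width, height), true);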


Works fine now!
I can get a 30 FPS color video on my other machine. I use ffplay to look at the stream; VLC was kind of slow while ffplay was blazing fast. I have about 200 ms of delay, but that's almost not noticeable in my use case. I am very happy with the result!

However, it only works with downsampling activated. This feature halves my 4K cam output (2x2 downsampling). If I want to stream the full 4K, the code freezes in the following line (I waited for 10 minutes):

gst_udpsink.write(cpuFrame);

Nonetheless, I am happy with the result and have marked your answer as the correct solution!

Here is the code I ended up with, for anyone interested:

#include <m3api/xiApi.h>
#include <iostream>
#include <cstring>                   // memset
#include <opencv2/highgui.hpp>
#include <opencv2/videoio.hpp>       // cv::VideoWriter
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudaarithm.hpp>
#include <cuda_runtime.h>
#include <chrono>

//example program runs with a limited amount of frames
#define NUMBER_OF_IMAGES 60000
// Define parameters for a static white balance
#define WB_BLUE 2
#define WB_GREEN 1
#define WB_RED 1.3

using namespace std;

int main(int argc, char *argv[]){

  // Initialize XI_IMG structure
  XI_IMG image;
  memset(&image, 0, sizeof(XI_IMG));
  image.size = sizeof(XI_IMG);

  HANDLE xiH = NULL;
  XI_RETURN stat = XI_OK;

  //activate downsampling
  bool downsampling = true;
  //should the camera feed be rendered on the local screen?
  bool render_x = true;

  double frame_counter = 0;

  try{
    // Get device handle for the camera
    stat = xiOpenDevice(0, &xiH);
    if (stat != XI_OK)
      throw "Opening device failed";

    /* camera settings */
    int OCVbayer = cv::COLOR_BayerBG2BGR;                                     // Set demosaicing type
    if(downsampling){xiSetParamInt(xiH, XI_PRM_DOWNSAMPLING, 2);}             // Activate downsampling if needed
    xiSetParamInt(xiH, XI_PRM_IMAGE_DATA_FORMAT, XI_FRM_TRANSPORT_DATA);      // Use transport data format (so we can debayer on GPU)
    xiSetParamInt(xiH, XI_PRM_TRANSPORT_DATA_TARGET, XI_TRANSPORT_DATA_TARGET_ZEROCOPY);    // Stream camera data to zero-copy memory
    xiSetParamInt(xiH, XI_PRM_OUTPUT_DATA_BIT_DEPTH, 8);     // Using 8-bit images here
    xiSetParamInt(xiH, XI_PRM_EXPOSURE, 30 * 1000);          // Exposure in microseconds (30 ms)
    int width = -1;
    xiGetParamInt(xiH, XI_PRM_WIDTH, &width);                // Get width of image
    int height = -1;
    xiGetParamInt(xiH, XI_PRM_HEIGHT, &height);              // Get height of image

    // Start the image acquisition
    xiStartAcquisition(xiH);

    // Define pointer used for data on GPU
    // Create GpuMat for the result images
    void *imageGpu;
    cv::cuda::GpuMat gpu_mat(height, width, CV_8UC3);

    // Create a GUI window with OpenGL support
    if (render_x){
      cv::namedWindow("XIMEA camera", cv::WINDOW_OPENGL);
      cv::resizeWindow("XIMEA camera", 1600, 900);
      //cv::resizeWindow("XIMEA camera", width/3, height/3);
    }


    /* video writer to RTP/UDP sink multicast at 224.1.2.1 */
    cv::VideoWriter gst_udpsink("appsrc ! video/x-raw, format=BGR, pixel-aspect-ratio=1/1 ! queue ! videoconvert ! video/x-raw, format=BGRx ! nvvidconv ! nvv4l2h264enc insert-vui=1 ! video/x-h264, stream-format=byte-stream, alignment=au ! h264parse ! video/x-h264, stream-format=byte-stream ! rtph264pay pt=96 config-interval=1 ! application/x-rtp, media=video, encoding-name=H264 ! udpsink host=224.1.2.1 port=5000 auto-multicast=true ", 0, 40, cv::Size (width, height));

    if (!gst_udpsink.isOpened ()) {
      std::cout << "Failed to open gst_udpsink writer." << std::endl;
      return (-8);
    }

    /*
    Run on another machine in the same network: 
    (make sure firewall isn't blocking multicast)
    
    sdp file 'test.sdp':
    m=video 5000 RTP/AVP 96
    c=IN IP4 224.1.2.1
    a=rtpmap:96 H264/90000


    receive with ffplay:
    $ ffplay -fflags nobuffer -flags low_delay -framedrop -protocol_whitelist file,udp,rtp test.sdp
    */


    // Acquire a number of images, process and render them
    //start timer for fps calculation
    auto begin = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < NUMBER_OF_IMAGES; i++){

      // get the image from the camera
      xiGetImage(xiH, 5000, &image);      // Get host-pointer to image data
      cudaHostGetDevicePointer(&imageGpu, image.bp, 0);      // Convert to device pointer
      cv::cuda::GpuMat gpu_mat_raw(height, width, CV_8UC1, imageGpu);       // Create GpuMat from the device pointer
      cv::cuda::demosaicing(gpu_mat_raw, gpu_mat, OCVbayer);       // Demosaic raw bayer image to color image
      cv::cuda::multiply(gpu_mat, cv::Scalar(WB_BLUE, WB_GREEN, WB_RED), gpu_mat);       // Apply static white balance by multiplying the channels

      // download to CPU mat
      // very slow and I want to avoid this
      cv::Mat cpuFrame;
      gpu_mat.download(cpuFrame);

      // push the frame into the GStreamer pipeline
      gst_udpsink.write(cpuFrame);
      //gst_udpsink.write(gpu_mat); // doesn't work; VideoWriter expects a CPU Mat

      if (render_x){
        // Render image to the screen (using OpenGL)
        //cv::imshow("XIMEA camera", gpu_mat);
        cv::imshow("XIMEA camera", cpuFrame);
      }

      frame_counter++;
      cv::waitKey(1);
    }

    //measure end time for fps calculation
    auto finish_time = std::chrono::high_resolution_clock::now();
    //calc delta time and fps
    auto delta = std::chrono::duration_cast<std::chrono::milliseconds>(finish_time-begin);
    double d = std::chrono::duration<double>(delta).count();
    double fps = frame_counter/d;
    if(downsampling){cout << "Downsampling 2X2 activated" << endl;}
    cout<<"FPS: "<< fps << endl;
    cout<<"Frame Count: "<< frame_counter << endl;
    cout<<"Time needed: "<< delta.count()/1000.0 << endl;

    gst_udpsink.release();
    // Stop image acquisition and close device
    xiStopAcquisition(xiH);
    xiCloseDevice(xiH);

    // Print errors
  }catch(const char* message){
    std::cerr << message << std::endl;
  }
}