How to create an OpenCV GpuMat from nvstream?

Hi,
Please apply below code to get_converted_mat() in gstdsexample.cpp

#include <cudaEGL.h>
#include <opencv2/cudafilters.hpp>
#ifdef __aarch64__
  // To use the converted buffer in CUDA, create an EGLImage and then use
  // CUDA-EGL interop APIs
  if (USE_EGLIMAGE) {
    if (NvBufSurfaceMapEglImage (dsexample->inter_buf, 0) !=0 ) {
      goto error;
    }

    // dsexample->inter_buf->surfaceList[0].mappedAddr.eglImage
    // Use interop APIs cuGraphicsEGLRegisterImage and
    // cuGraphicsResourceGetMappedEglFrame to access the buffer in CUDA
#if 1
    static bool create_filter = true;
    static cv::Ptr< cv::cuda::Filter > filter;
    CUresult status;
    CUeglFrame eglFrame;
    CUgraphicsResource pResource = NULL;
    cudaFree(0);
    status = cuGraphicsEGLRegisterImage(&pResource,
		dsexample->inter_buf->surfaceList[0].mappedAddr.eglImage,
                CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
    status = cuGraphicsResourceGetMappedEglFrame(&eglFrame, pResource, 0, 0);
    status = cuCtxSynchronize();
    if (create_filter) {
        filter = cv::cuda::createSobelFilter(CV_8UC4, CV_8UC4, 1, 0, 3, 1, cv::BORDER_DEFAULT);
        //filter = cv::cuda::createGaussianFilter(CV_8UC4, CV_8UC4, cv::Size(31,31), 0, 0, cv::BORDER_DEFAULT);
        create_filter = false;
    }
    cv::cuda::GpuMat d_mat(dsexample->processing_height, dsexample->processing_width, CV_8UC4, eglFrame.frame.pPitch[0]);
    filter->apply (d_mat, d_mat);
    status = cuCtxSynchronize();
    status = cuGraphicsUnregisterResource(pResource);

    // apply back to the original buffer
    transform_params.src_rect = &dst_rect;
    transform_params.dst_rect = &src_rect;
    NvBufSurfTransform (dsexample->inter_buf, &ip_surf, &transform_params);
#endif
    // Destroy the EGLImage
    NvBufSurfaceUnMapEglImage (dsexample->inter_buf, 0);
  }
#endif

Makefile

# Remove opencv in PKGS
PKGS:= gstreamer-1.0 gstreamer-base-1.0 gstreamer-video-1.0
# Add opencv4 to CFLAGS and LIBS
CFLAGS+= -I /usr/local/include/opencv4
LIBS+=-L/usr/local/lib -lopencv_core -lopencv_highgui -lopencv_imgproc -lopencv_videoio -lopencv_cudafilters

Enable dsexample in the config file:

[ds-example]
enable=1
processing-width=640
processing-height=480
full-frame=1
unique-id=15
gpu-id=0

Thanks a lot. It worked!

Hello DaneLLL,
I modified your example to use the remap() function of OpenCV.

cv::cuda::remap(d_mat, d_mat, dsexample->mat1, dsexample->mat2, cv::INTER_CUBIC, cv::BORDER_CONSTANT, cv::Scalar(0.f));

The code runs without errors, but the output has some weird artifacts in the top half of the image, while the bottom half seems correct.

On the CPU the remap() function works fine.

Was this function ever applied successfully with CUDA?

Hi,
We don't have experience with this function. Other users may share their experience.

As a reminder, you need to call map/unmap/SyncForCpu/SyncForDevice appropriately when processing across CPU and GPU. Please check the APIs in

deepstream_sdk_v4.0.1_jetson\sources\includes\nvbufsurface.h
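A minimal sketch of that CPU-side map/sync pattern, assuming a single RGBA surface in the batch and a CPU-mappable memory type (surf is a placeholder for your NvBufSurface pointer):

// Sketch: CPU access to surface index 0, plane 0 of an RGBA NvBufSurface
if (NvBufSurfaceMap (surf, 0, 0, NVBUF_MAP_READ_WRITE) != 0)
  return GST_FLOW_ERROR;   // or goto error, as in gstdsexample

// Make pending GPU writes visible to the CPU
NvBufSurfaceSyncForCpu (surf, 0, 0);

cv::Mat cpu_mat (surf->surfaceList[0].planeParams.height[0],
                 surf->surfaceList[0].planeParams.width[0],
                 CV_8UC4,
                 surf->surfaceList[0].mappedAddr.addr[0],
                 surf->surfaceList[0].planeParams.pitch[0]);
// ... process cpu_mat with OpenCV on the CPU ...

// Flush CPU writes back for the GPU, then unmap
NvBufSurfaceSyncForDevice (surf, 0, 0);
NvBufSurfaceUnMap (surf, 0, 0);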

I have a question regarding this. As a starting point I used gstdsexample and integrated the code you proposed. As a second step I would like to get rid of the example code I don't need (all the transforming and scaling). So I thought using input_buf directly should work, but I get some errors. This is my get_converted_mat function:

static GstFlowReturn
get_converted_mat (GstDsExample * dsexample, NvBufSurface *input_buf, gint idx)
{
  dsexample->inter_buf = input_buf;
  if (NvBufSurfaceMap (dsexample->inter_buf, 0, 0, NVBUF_MAP_READ) != 0){
    goto error;
  }
#ifdef __aarch64__
  // Cache the mapped data for CPU access
  NvBufSurfaceSyncForCpu (dsexample->inter_buf, 0, 0);
  if (USE_EGLIMAGE) {
    if (NvBufSurfaceMapEglImage (dsexample->inter_buf, 0) !=0 ) {
      goto error;
    }


    #if 1
    CUresult status;
    CUeglFrame eglFrame;
    CUgraphicsResource pResource = NULL;
    cudaFree(0);
    status = cuGraphicsEGLRegisterImage(&pResource,
		dsexample->inter_buf->surfaceList[0].mappedAddr.eglImage,
                CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
    status = cuGraphicsResourceGetMappedEglFrame(&eglFrame, pResource, 0, 0);
    status = cuCtxSynchronize();

    cv::cuda::GpuMat d_mat(dsexample->processing_height, dsexample->processing_width, CV_8UC4, eglFrame.frame.pPitch[0]);
    static bool create_filter = true;
    static cv::Ptr< cv::cuda::Filter > filter;
    if (create_filter) {
      filter = cv::cuda::createSobelFilter(CV_8UC4, CV_8UC4, 1, 0, 3, 1, cv::BORDER_DEFAULT);
      //filter = cv::cuda::createGaussianFilter(CV_8UC4, CV_8UC4, cv::Size(31,31), 0, 0, cv::BORDER_DEFAULT);
      create_filter = false;
    }
    filter->apply (d_mat, d_mat);
    status = cuCtxSynchronize();
    status = cuGraphicsUnregisterResource(pResource);
    #endif
    NvBufSurfaceUnMapEglImage (dsexample->inter_buf, 0);
  }
  #endif
  NvBufSurfaceSyncForDevice (dsexample->inter_buf, 0, 0);

  if (NvBufSurfaceUnMap (dsexample->inter_buf, 0, 0)){
    goto error;
  }

  /* We will first convert only the Region of Interest (the entire frame or the
   * object bounding box) to RGB and then scale the converted RGB frame to
   * processing resolution. */
  return GST_FLOW_OK;

error:
  return GST_FLOW_ERROR;
}

I get the following error.
what(): OpenCV(4.1.1) /home/nvidia/opencv4.1.1/opencv_contrib-4.1.1/modules/cudafilters/src/cuda/row_filter.hpp:172: error: (-217:Gpu API call) unspecified launch failure in function ‘caller’

So what am I doing wrong? It certainly has something to do with the buffer…

Thanks


Hi,
The flow of using dsexample is:
1. Convert input_buf [NV12] to dsexample->inter_buf [RGBA]
2. Apply the Sobel filter to dsexample->inter_buf [RGBA]
3. Convert dsexample->inter_buf [RGBA] back to input_buf [NV12]

If you would like to apply the filter to an RGBA buffer, you may access the buffers in a probe function such as analytics_done_buf_prob(). Item 5 in the FAQ is a sample for your reference.
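A rough outline of those three steps, following the stock gstdsexample code (src_rect, dst_rect, ip_surf and transform_params are the variables from the example; treat this as a sketch of the order of operations, not a drop-in replacement):

/* 1. Scale/convert the ROI of input_buf [NV12] into inter_buf [RGBA] */
transform_params.src_rect = &src_rect;
transform_params.dst_rect = &dst_rect;
NvBufSurfTransform (&ip_surf, dsexample->inter_buf, &transform_params);

/* 2. Map inter_buf as an EGLImage and run the CUDA filter on it
 *    (the Sobel code shown earlier in this thread) */

/* 3. Copy the processed inter_buf [RGBA] back into the original buffer */
transform_params.src_rect = &dst_rect;
transform_params.dst_rect = &src_rect;
NvBufSurfTransform (dsexample->inter_buf, &ip_surf, &transform_params);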

Hi, DaneLLL
I use DeepStream SDK 5.0 and JetPack 4.4 on a Jetson Nano.
When I compile dsexample by running make command, I get the following error:

nvidia@nvidia-desktop:/opt/nvidia/deepstream/deepstream-5.0/sources/gst-plugins/gst-dsexample$ make
-fPIC -DDS_VERSION="5.0.0" -I /usr/local/cuda-10.2/include -I ../../includes -I /usr/local/include/opencv4 -pthread -I/usr/include/gstreamer-1.0 -I/usr/include/orc-0.4 -I/usr/include/gstreamer-1.0 -I/usr/include/glib-2.0 -I/usr/lib/aarch64-linux-gnu/glib-2.0/include
g++ -o libnvdsgst_dsexample.so gstdsexample.o -shared -Wl,-no-undefined -L/usr/local/lib -lopencv_core -lopencv_highgui -lopencv_imgproc -lopencv_videoio -lopencv_cudafilters -L dsexample_lib -ldsexample -L/usr/local/cuda-10.2/lib64/ -lcudart -ldl -lnppc -lnppig -lnpps -lnppicc -lnppidei -L/opt/nvidia/deepstream/deepstream-5.0/lib/ -lnvdsgst_helper -lnvdsgst_meta -lnvds_meta -lnvbufsurface -lnvbufsurftransform -Wl,-rpath,/opt/nvidia/deepstream/deepstream-5.0/lib/ -lgstvideo-1.0 -lgstbase-1.0 -lgstreamer-1.0 -lgobject-2.0 -lglib-2.0
gstdsexample.o: In function `get_converted_mat(_GstDsExample*, NvBufSurface*, int, _NvOSD_RectParams*, double&, int, int)':
gstdsexample.cpp:(.text+0x1e7c): undefined reference to `cuGraphicsEGLRegisterImage'
gstdsexample.cpp:(.text+0x1e94): undefined reference to `cuGraphicsResourceGetMappedEglFrame'
gstdsexample.cpp:(.text+0x1e9c): undefined reference to `cuCtxSynchronize'
gstdsexample.cpp:(.text+0x1f98): undefined reference to `cuCtxSynchronize'
gstdsexample.cpp:(.text+0x1fa4): undefined reference to `cuGraphicsUnregisterResource'
collect2: error: ld returned 1 exit status
Makefile:80: recipe for target ‘libnvdsgst_dsexample.so’ failed
make: *** [libnvdsgst_dsexample.so] Error 1

Do I need to install any additional packages?

I solved this problem by additionally adding the following line to the Makefile:

-L /usr/local/cuda-10.2/lib64/stubs/ -lcuda
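In the stock Makefile that roughly amounts to extending the LIBS line, for example (the path assumes the JetPack CUDA 10.2 install location):

# CUDA driver API stubs resolve cuGraphicsEGLRegisterImage and friends at link time
LIBS+= -L /usr/local/cuda-10.2/lib64/stubs/ -lcuda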


Hi, DaneLLL
How can I change this code to work with dGPU?

user@user:/opt/nvidia/deepstream/deepstream-5.0/sources/gst-plugins/gst-dsexample$ gst-launch-1.0 filesrc location=sample_cam6.mp4 ! qtdemux ! h264parse ! nvv4l2decoder ! m.sink_0 nvstreammux name=m batch-size=1 width=1280 height=720 ! nvvideoconvert ! dsexample full-frame=1 ! nvdsosd ! nveglglessink
Setting pipeline to PAUSED …
Pipeline is PREROLLING …
Got context from element ‘eglglessink0’: gst.egl.EGLDisplay=context, display=(GstEGLDisplay)NULL;
terminate called after throwing an instance of ‘cv::Exception’
what(): OpenCV(4.3.0) /home/user/opencv_contrib-4.3.0/modules/cudafilters/src/cuda/row_filter.hpp:172: error: (-217:Gpu API call) an illegal memory access was encountered in function ‘caller’

Aborted (core dumped)

Hi @Rusli,

I had a similar problem once. As far as I remember, all that EGL interoperability stuff doesn't work (or works differently) on dGPU; to find this information one has to dig really deep in the docs. Check the topic "Run time and documentation problems with cuGraphicsEGLRegisterImage".

On dGPU, in get_converted_mat() you can access the data in the following way:

...
NvBufSurface ip_surf;
ip_surf = *input_buf;
ip_surf.numFilled = ip_surf.batchSize = 1;
ip_surf.surfaceList = &(input_buf->surfaceList[idx]);
uint8_t * gpu_pitched_image = (unsigned char *) ip_surf.surfaceList[0].dataPtr; // for NV12 format
...

When you use it, take the memory pitch into account; you can find it in the surface parameters structure:

NvBufSurfaceParams * surf_params = & ip_surf.surfaceList[0];
int data_pitch = surf_params->planeParams.pitch[0];
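As an illustration only (assuming an NV12 surface, so just the luma plane is wrapped; for an RGBA surface you would use CV_8UC4 instead):

// Wrap the Y plane of the NV12 surface without copying;
// data_pitch is passed as the GpuMat step so row padding is respected
cv::cuda::GpuMat y_plane (surf_params->height, surf_params->width,
                          CV_8UC1, gpu_pitched_image, data_pitch);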

See NVIDIA DeepStream SDK Developer Guide — DeepStream 6.1.1 Release documentation

Sorry, I don't have much time to go into more detail here. Please ask if you have any questions.


Thanks!
Yes, I also learned from the documentation that cuGraphicsEGLRegisterImage is supported only on Tegra. How should I use gpu_pitched_image and data_pitch?

Hi rus8, very helpful hint. Could you be more specific to help me further? I think I need to change the following code to be able to run on dGPU:

        CUresult status;
        CUeglFrame eglFrame;
        CUgraphicsResource pResource = NULL;
        cudaFree(0);
        status = cuGraphicsEGLRegisterImage(&pResource,
    		surface->surfaceList[0].mappedAddr.eglImage,
                    CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
        status = cuGraphicsResourceGetMappedEglFrame(&eglFrame, pResource, 0, 0);
        status = cuCtxSynchronize();

        cv::cuda::GpuMat d_mat(dsexample->processing_height, dsexample->processing_width, CV_8UC4, eglFrame.frame.pPitch[0]);

In particular, how do I convert these two lines to run on dGPU:

status = cuGraphicsEGLRegisterImage(&pResource,
surface->surfaceList[0].mappedAddr.eglImage,
CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);

and

cv::cuda::GpuMat d_mat(dsexample->processing_height, dsexample->processing_width, CV_8UC4, eglFrame.frame.pPitch[0]);

Your help will be greatly appreciated. Thanks.

Hi @ynjiun and @Rusli,

The DeepStream buffer flowing inside the GStreamer pipeline on dGPU stores data differently from the one on Jetson, as I mentioned earlier. You can't use CUeglFrame in DeepStream on dGPU; please check NvBufSurface::memType and NvBufSurfaceMapEglImage.

On dGPU you can get a pointer to device memory: that is gpu_pitched_image in my answer above. So you should be able to construct a GpuMat using this pointer and the dimension information that is part of NvBufSurfaceParams mentioned in my previous answer (see NVIDIA DeepStream SDK Developer Guide — DeepStream 6.1.1 Release documentation).
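For example, if nvvideoconvert in front of the element gives you an RGBA buffer, the wrap could look roughly like this (a sketch, untested here; idx is the surface index within the batch):

NvBufSurfaceParams *p = &surface->surfaceList[idx];
/* RGBA, pitch-linear device memory: wrap it directly, no copy */
cv::cuda::GpuMat d_mat (p->height, p->width, CV_8UC4,
                        p->dataPtr, p->planeParams.pitch[0]);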

Hope it helps :)


Hi rus8, very helpful. I think I might be very close but just missing a tiny step.

This is what I learned from your post, boiled down to this line:

cv::cuda::GpuMat d_mat(dsexample->processing_height, dsexample->processing_width, CV_8UC4,
        		surface->surfaceList[0].dataPtr,data_pitch);

It compiles without error, but when I run it I get this error:

terminate called after throwing an instance of 'cv::Exception'
  what():  OpenCV(4.3.0) /home/cuda-opencv/install/opencv-4.3.0/modules/core/include/opencv2/core/cuda/common.hpp:102: error: (-217:Gpu API call) invalid texture reference in function 'bindTexture'

Does this imply the CV_8UC4 type is not correct? Or is it a data_pitch issue?

Thanks again for your help.

Attached below please find my code:

/**
 * Called when element receives an input buffer from upstream element.
 */
static GstFlowReturn
gst_dsexample_transform_ip (GstBaseTransform * btrans, GstBuffer * inbuf)
{
  GstDsExample *dsexample = GST_DSEXAMPLE (btrans);
  GstMapInfo in_map_info;
  GstFlowReturn flow_ret = GST_FLOW_ERROR;

  NvBufSurface *surface = NULL;

  dsexample->frame_num++;
  CHECK_CUDA_STATUS (cudaSetDevice (dsexample->gpu_id),
      "Unable to set cuda device");

  memset (&in_map_info, 0, sizeof (in_map_info));
  if (!gst_buffer_map (inbuf, &in_map_info, GST_MAP_READ)) {
    g_print ("Error: Failed to map gst buffer\n");
    goto error;
  }

  surface = (NvBufSurface *) in_map_info.data;

  if (CHECK_NVDS_MEMORY_AND_GPUID (dsexample, surface))
    goto error;


#ifdef __aarch64__
  /*Tegra: To use the converted buffer in CUDA, create an EGLImage and then use
   * CUDA-EGL interop APIs */
  if (USE_EGLIMAGE) {
    if (NvBufSurfaceMapEglImage (surface, 0) !=0 ) { 
      goto error;
    }
    /* dsexample->inter_buf->surfaceList[0].mappedAddr.eglImage
     * Use interop APIs cuGraphicsEGLRegisterImage and
     * cuGraphicsResourceGetMappedEglFrame to access the buffer in CUDA */
    #if 1
        CUresult status;
        CUeglFrame eglFrame;
        CUgraphicsResource pResource = NULL;
        cudaFree(0);
        status = cuGraphicsEGLRegisterImage(&pResource,
    		surface->surfaceList[0].mappedAddr.eglImage,
                    CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
        status = cuGraphicsResourceGetMappedEglFrame(&eglFrame, pResource, 0, 0);
        status = cuCtxSynchronize();
        cv::cuda::GpuMat d_mat(dsexample->processing_height, dsexample->processing_width, CV_8UC4, eglFrame.frame.pPitch[0]);
        //filter->apply (d_mat, d_mat);
        undistort(dsexample,d_mat);
        status = cuCtxSynchronize();
        status = cuGraphicsUnregisterResource(pResource);
    #endif
    /* Destroy the EGLImage */
   //NvBufSurfaceUnMapEglImage (dsexample->inter_buf, 0);
    NvBufSurfaceUnMapEglImage (surface, 0);
  }
#else
  /* dGPU: is this section of code correct to run on dGPU? */
  if (USE_EGLIMAGE) {
    surface->numFilled = 1;
    surface->batchSize = 1;
    #if 1
        CUresult status;
        NvBufSurfaceParams * surf_params = & surface->surfaceList[0];
        int data_pitch = surf_params->planeParams.pitch[0];
        uint8_t * gpu_pitched_image = (unsigned char *) surface->surfaceList[0].dataPtr;

        cudaFree(0);
        status = cuCtxSynchronize();

        cv::cuda::GpuMat d_mat(dsexample->processing_height, dsexample->processing_width, CV_8UC4,
        		surface->surfaceList[0].dataPtr,data_pitch);

        //filter->apply (d_mat, d_mat);
        undistort(dsexample,d_mat);
        status = cuCtxSynchronize();
    #endif
  }
#endif

  flow_ret = GST_FLOW_OK;

error:
  gst_buffer_unmap (inbuf, &in_map_info);
  return flow_ret;

Thanks a lot for your help again.

Hi @ynjiun

I think you are right about the type being used in the GpuMat constructor. What is the frame type in your DeepStream pipeline?

nvbuf-memory-type=3

I am running in a Docker container with a 1080 Ti GPU. The pipeline looks like:

gst-launch-1.0 --gst-debug-level=0 filesrc location= /data/agx/ar.h264 ! h264parse ! nvv4l2decoder ! nvvideoconvert nvbuf-memory-type=3 ! m.sink_0 nvstreammux name=m batch-size=1 width=1920 height=1080 ! undistort full-frame=1 processing-width=1920 processing-height=1080 ! nvinfer config-file-path= /opt/nvidia/deepstream/deepstream-5.0/sources/apps/sample_apps/deepstream-test1/dstest1_pgie_config.txt ! nvvideoconvert ! nvdsosd ! nveglglessink

Actually, I am able to get the CPU version running according to your hints:

  #else
    /* dGPU: CPU version is running. */
    surface->numFilled = surface->batchSize = 1;
    in_mat =
	  cv::Mat (dsexample->processing_height, dsexample->processing_width,
	  CV_8UC4, surface->surfaceList[0].dataPtr,
	  surface->surfaceList[0].planeParams.pitch[0]);
    undistort(dsexample,in_mat);
  #endif

But when I try to upload to / download from a GpuMat, like this:

  #else
    /* dGPU: GPU version is not running. */
    surface->numFilled = surface->batchSize = 1;
    in_mat =
	  cv::Mat (dsexample->processing_height, dsexample->processing_width,
	  CV_8UC4, surface->surfaceList[0].dataPtr,
	  surface->surfaceList[0].planeParams.pitch[0]);
   d_mat.upload(in_mat);
    undistort(dsexample,d_mat); //gpu version undistort
   d_mat.download(in_mat);
  #endif

The same error as before occurs.

Looks like an OpenCV issue; I don't use it much, so I can't really help you here.

Have you tried nvbuf-memory-type=2? What is your color format?

Hey there,
Sorry for re-opening the thread.
I used the same method, without applying the transformations, because I used nvvideoconvert before the plugin to convert the buffer to RGBA format. My code successfully blurs the image and everything works. However, the blurring (only the filter.apply(d_mat, d_mat) invocation) is much (~5x) slower than applying the same kernel to an image of the same size outside the DeepStream platform in a normal script. I don't get what causes this.

One of my assumptions was the buffer layout: maybe the NvBufSurface uses the pitch layout, which is less efficient than block linear. I really don't get it. I am applying a CUDA filter to a cv::cuda::GpuMat. Why would it be so slow?
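For reference, the layout flag can be read from the surface itself (a sketch; surface is assumed to be the mapped NvBufSurface from gst_buffer_map):

// Check whether the surface is pitch-linear or block-linear
if (surface->surfaceList[0].layout == NVBUF_LAYOUT_PITCH)
  g_print ("pitch-linear layout\n");
else if (surface->surfaceList[0].layout == NVBUF_LAYOUT_BLOCK_LINEAR)
  g_print ("block-linear layout\n");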

Do you have any input on this?

I have created a topic with more information here.

Thank you in advance.