Gstdsexample plugin is slow: does GaussianBlur run on GPU?

Dane, thank you for your advice. Per your recommendation above I reduce the gst_dsexample_transform_ip to bare minimum just for experimenting GaussianBlur filter in Cuda as below:

static GstFlowReturn
gst_dsexample_transform_ip (GstBaseTransform * btrans, GstBuffer * inbuf)
{
  GstDsExample *dsexample = GST_DSEXAMPLE (btrans);
  GstMapInfo in_map_info;
  GstFlowReturn flow_ret = GST_FLOW_ERROR;

  NvBufSurface *surface = NULL;

  dsexample->frame_num++;
  CHECK_CUDA_STATUS (cudaSetDevice (dsexample->gpu_id),
      "Unable to set cuda device");

  memset (&in_map_info, 0, sizeof (in_map_info));
  if (!gst_buffer_map (inbuf, &in_map_info, GST_MAP_READ)) {
    g_print ("Error: Failed to map gst buffer\n");
    goto error;
  }

  surface = (NvBufSurface *) in_map_info.data;

  if (CHECK_NVDS_MEMORY_AND_GPUID (dsexample, surface))
    goto error;

//////////////////////cuda filter experiment//////////////////////
#ifdef __aarch64__
  /* To use the converted buffer in CUDA, create an EGLImage and then use
   * CUDA-EGL interop APIs */
  if (USE_EGLIMAGE) {
    if (NvBufSurfaceMapEglImage (surface, 0) !=0 ) {
      goto error;
    }

    /* dsexample->inter_buf->surfaceList[0].mappedAddr.eglImage
     * Use interop APIs cuGraphicsEGLRegisterImage and
     * cuGraphicsResourceGetMappedEglFrame to access the buffer in CUDA */
    #if 1
        //static bool create_filter = true;
        //static cv::Ptr< cv::cuda::Filter > filter;
        CUresult status;
        CUeglFrame eglFrame;
        CUgraphicsResource pResource = NULL;
        cudaFree(0);
        status = cuGraphicsEGLRegisterImage(&pResource,
    		surface->surfaceList[0].mappedAddr.eglImage,
                    CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
        status = cuGraphicsResourceGetMappedEglFrame(&eglFrame, pResource, 0, 0);
        status = cuCtxSynchronize();

        cv::cuda::GpuMat d_mat(dsexample->processing_height, dsexample->processing_width, CV_8UC4, eglFrame.frame.pPitch[0]);

        filter->apply (d_mat, d_mat);

        status = cuCtxSynchronize();
        status = cuGraphicsUnregisterResource(pResource);

    #endif
    /* Destroy the EGLImage */
    NvBufSurfaceUnMapEglImage (dsexample->inter_buf, 0);
  }
#endif

/////////////////////end of experiment////////////////////////////

  flow_ret = GST_FLOW_OK;

error:
  gst_buffer_unmap (inbuf, &in_map_info);
  return flow_ret;
}

I was able to “make” and “sudo make install” successfully, when I run the pipeline, couple things I observed:

  1. now is very fast. When run even in “30W ALL” mode, never drop a frame any more => that’s very good.
  2. however the behaviour of filter act funny: it only filter (blur) the top 1/4 of the frame and bottom 3/4 frame are not filtered (not blur).

Question: am I manipulating the “surface” (eglFrame) correctly? if not, how would this in-place transformation (inbuf → filter → inbuf without copying) be done?

Thank you very much for your help again.

P.S. house keeping changes:

//create filter in gst_dsexample_start
static gboolean
gst_dsexample_start (GstBaseTransform * btrans)
{
....
    filter = cv::cuda::createGaussianFilter(CV_8UC4, CV_8UC4, cv::Size(31,31), 0, 0, cv::BORDER_DEFAULT);
....
}

and declare filter variable in gstdsexample.h
cv::Ptr<cv::cuda::Filter> filter;