nvivafilter customer-lib implementation (code available)

Thank you for sharing your experience here.
I’m developing with a Basler camera on a Jetson Xavier.
By searching this forum with keywords such as gstreamer, nvivafilter, GpuMat and OpenCV,
I was able to get the basics working. However, I’m a little confused about how to modify the image-processing part.
I’ve attached the code and my modified makefile below.

In the example below, if I use option 1, the pipeline shows a black image, but if I use option 2, it shows a green image, which is what I intended.
Option 1 is only an example; what I actually need are operations such as cv::cuda::remap, cv::cuda::resize, etc.

Can I get some advice here?
Thank you.

/*
 * Example from the question: builds a GpuMat over the frame memory at pdata
 * and applies two test operations to it, labelled "1." and "2." below.
 * According to the post, variant 1 yields a black image and variant 2 yields
 * the expected green image.
 *
 * pdata  : pointer handed to the GpuMat constructor as its data pointer
 * width  : used as the GpuMat column count
 * height : used as the GpuMat row count
 */
static void cv_process(void *pdata, int32_t width, int32_t height)
{
    /* Create a GpuMat (height rows, width columns, CV_8UC4) with data pointer pdata */
    cv::cuda::GpuMat d_mat(height, width, CV_8UC4, pdata);
	
 		
    /* Processing example */
    /// 1. Rotate with the same GpuMat as source and destination. The requested
    ///    size is (height, width), i.e. swapped relative to d_mat.
    ///    NOTE(review): a reply in this thread suspects that rotate reallocates
    ///    its output, so the result would not end up in pdata - confirm.
    cv::cuda::rotate(d_mat, d_mat, cv::Size(height, width), 45.0);

    /// 2. Fill d_mat with the constant (0, 255, 0); alpha is not given explicitly.
    d_mat.setTo(cv::Scalar(0,255,0));
	
}

opencv_cudaprocess.so

/*
 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdio.h>
#include <stdlib.h>

#include <iostream>

#include <cuda.h>
#include <opencv2/opencv.hpp>
///#include <opencv2/gpu/gpu.hpp>

#include "cudaEGL.h"

/* Everything between the guards below gets C linkage when compiled as C++. */
#if defined(__cplusplus)
/* Declared here, but neither defined nor called in the visible part of this file. */
extern "C" void Handle_EGLImage (EGLImageKHR image);
extern "C" {
#endif

/**
  * Surface color formats, passed to the pre/post-process callbacks through
  * their sformat arrays.
  * NOTE(review): judging by the names only, Y8 is a luma plane, U8_V8 a
  * chroma plane and RGBA a packed 4-channel surface - confirm.
  */
typedef enum {
  COLOR_FORMAT_Y8 = 0,
  COLOR_FORMAT_U8_V8,
  COLOR_FORMAT_RGBA,
  COLOR_FORMAT_NONE
} ColorFormat;

/**
  * Table of callbacks that init() fills in.
  */
typedef struct {
  /**
  * cuda-process API
  *
  * @param image   : EGL Image to process
  * @param userPtr : pointer to user-allocated data; the user is responsible for freeing it
  */
  void (*fGPUProcess) (EGLImageKHR image, void ** userPtr);

  /**
  * pre-process API
  *
  * @param sBaseAddr  : Mapped Surfaces(YUV) pointers
  * @param smemsize   : surfaces size array
  * @param swidth     : surfaces width array
  * @param sheight    : surfaces height array
  * @param spitch     : surfaces pitch array
  * @param sformat    : surfaces format array
  * @param nsurfcount : surfaces count
  * @param userPtr    : pointer to user-allocated data; the user is responsible for freeing it
  */
  void (*fPreProcess)(void **sBaseAddr,
                      unsigned int *smemsize,
                      unsigned int *swidth,
                      unsigned int *sheight,
                      unsigned int *spitch,
                      ColorFormat *sformat,
                      unsigned int nsurfcount,
                      void ** userPtr);

  /**
  * post-process API
  *
  * @param sBaseAddr  : Mapped Surfaces(YUV) pointers
  * @param smemsize   : surfaces size array
  * @param swidth     : surfaces width array
  * @param sheight    : surfaces height array
  * @param spitch     : surfaces pitch array
  * @param sformat    : surfaces format array
  * @param nsurfcount : surfaces count
  * @param userPtr    : pointer to user-allocated data; the user is responsible for freeing it
  */
  void (*fPostProcess)(void **sBaseAddr,
                      unsigned int *smemsize,
                      unsigned int *swidth,
                      unsigned int *sheight,
                      unsigned int *spitch,
                      ColorFormat *sformat,
                      unsigned int nsurfcount,
                      void ** userPtr);
} CustomerFunction;

/**
  * Entry point defined at the end of this file; stores this library's
  * callbacks into pFuncs.
  * NOTE(review): presumably looked up by nvivafilter in the library given via
  * customer-lib-name - confirm.
  */
void init (CustomerFunction * pFuncs);

#if defined(__cplusplus)
}
#endif


/**
  * Placeholder for the custom pre-process hook.
  * This implementation is intentionally empty: it receives the surface
  * descriptors but neither reads nor modifies any of them.
  *
  * @param sBaseAddr  : Mapped Surfaces pointers
  * @param smemsize   : surfaces size array
  * @param swidth     : surfaces width array
  * @param sheight    : surfaces height array
  * @param spitch     : surfaces pitch array
  * @param sformat    : surfaces format array
  * @param nsurfcount : surfaces count
  * @param usrptr     : user data pointer
  */
static void pre_process (
    void         **sBaseAddr,
    unsigned int  *smemsize,
    unsigned int  *swidth,
    unsigned int  *sheight,
    unsigned int  *spitch,
    ColorFormat   *sformat,
    unsigned int   nsurfcount,
    void         **usrptr)
{
  /* Nothing is done yet; the casts only mark the arguments as deliberately unused. */
  (void) sBaseAddr;
  (void) smemsize;
  (void) swidth;
  (void) sheight;
  (void) spitch;
  (void) sformat;
  (void) nsurfcount;
  (void) usrptr;

  /* custom pre-processing goes here */
}

/**
  * Placeholder for the custom post-process hook.
  * This implementation is intentionally empty: it receives the surface
  * descriptors but neither reads nor modifies any of them.
  *
  * @param sBaseAddr  : Mapped Surfaces pointers
  * @param smemsize   : surfaces size array
  * @param swidth     : surfaces width array
  * @param sheight    : surfaces height array
  * @param spitch     : surfaces pitch array
  * @param sformat    : surfaces format array
  * @param nsurfcount : surfaces count
  * @param usrptr     : user data pointer
  */
static void post_process (
    void         **sBaseAddr,
    unsigned int  *smemsize,
    unsigned int  *swidth,
    unsigned int  *sheight,
    unsigned int  *spitch,
    ColorFormat   *sformat,
    unsigned int   nsurfcount,
    void         **usrptr)
{
  /* Nothing is done yet; the casts only mark the arguments as deliberately unused. */
  (void) sBaseAddr;
  (void) smemsize;
  (void) swidth;
  (void) sheight;
  (void) spitch;
  (void) sformat;
  (void) nsurfcount;
  (void) usrptr;

  /* custom post-processing goes here */
}



static void cv_process(void *pdata, int32_t width, int32_t height)
{
    /* Create a GpuMat with data pointer */   
    cv::cuda::GpuMat d_mat(height, width, CV_8UC4, pdata);
	
 		
    /* Apply Sobel filter */
    //cv::gpu::Sobel(d_mat, d_mat, CV_8UC4, 1, 1, 1, 1, 1, cv::BORDER_DEFAULT);	
}

/**
  * Performs CUDA Operations on egl image.
  *
  * @param image : EGL image
  */
static void
gpu_process (EGLImageKHR image, void ** usrptr)
{
  CUresult status;
  CUeglFrame eglFrame;
  CUgraphicsResource pResource = NULL;

  cudaFree(0);
  status = cuGraphicsEGLRegisterImage(&pResource, image, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
  if (status != CUDA_SUCCESS) {
    printf("cuGraphicsEGLRegisterImage failed : %d \n", status);
    return;
  }

  status = cuGraphicsResourceGetMappedEglFrame( &eglFrame, pResource, 0, 0);
  if (status != CUDA_SUCCESS) {
    printf ("cuGraphicsSubResourceGetMappedArray failed\n");
  }

  status = cuCtxSynchronize();
  if (status != CUDA_SUCCESS) {
    printf ("cuCtxSynchronize failed \n");
  }

  if (eglFrame.frameType == CU_EGL_FRAME_TYPE_PITCH) {
    if (eglFrame.eglColorFormat == CU_EGL_COLOR_FORMAT_ABGR) {
	/* Apply CV gpu processing */
	cv_process(eglFrame.frame.pPitch[0], eglFrame.width, eglFrame.height);
    } else {
	printf ("Invalid eglcolorformat for opencv\n");
	std::cout<<eglFrame.eglColorFormat << ", " <<CU_EGL_COLOR_FORMAT_RGBA<<std::endl;
    }
  }
  else {
     printf ("Invalid frame type for opencv\n");
  }

  status = cuCtxSynchronize();
  if (status != CUDA_SUCCESS) {
    printf ("cuCtxSynchronize failed after memcpy \n");
  }

  status = cuGraphicsUnregisterResource(pResource);
  if (status != CUDA_SUCCESS) {
    printf("cuGraphicsEGLUnRegisterResource failed: %d \n", status);
  }
}

/**
  * Library entry point: stores this file's three callbacks in the table
  * that the caller passes in.
  *
  * @param pFuncs : callback table to fill in
  */
extern "C" void
init (CustomerFunction * pFuncs)
{
  CustomerFunction &table = *pFuncs;

  table.fGPUProcess  = gpu_process;
  table.fPreProcess  = pre_process;
  table.fPostProcess = post_process;
}

command

gst-launch-1.0 -v pylonsrc fps=30 ! bayer2rgb ! nvvidconv ! video/x-raw(memory:NVMM), width=1920, height=1200, format=I420, framerate=30/1 ! nvivafilter customer-lib-name=./lib-gst-custom-opencv_cudaprocess.so cuda-process=true ! 'video/x-raw(memory:NVMM), format=RGBA' ! nvegltransform ! nveglglessink

makefile

###############################################################################
#
# Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
###############################################################################

# Location of the CUDA Toolkit
# CUDA_PATH uses ?= and can therefore be overridden from the environment or
# the make command line.
CUDA_PATH ?= /usr/local/cuda-10.0
# INCLUDE_DIR is not referenced by any rule in the visible part of this makefile.
INCLUDE_DIR = /usr/include
LIB_DIR = /usr/lib/aarch64-linux-gnu
TEGRA_LIB_DIR = /usr/lib/aarch64-linux-gnu/tegra

# This is the typical install path of opencv4tegra
OPENCV_DIR = /usr/local

# For hardfp
#LIB_DIR = /usr/lib/arm-linux-gnueabihf
#TEGRA_LIB_DIR = /usr/lib/arm-linux-gnueabihf/tegra

# Host OS name in upper and lower case, as reported by uname -s
OSUPPER = $(shell uname -s 2>/dev/null | tr "[:lower:]" "[:upper:]")
OSLOWER = $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")

# Host word size and architecture derived from uname -m.
# OS_ARCH selects the code generation flags further down.
OS_SIZE = $(shell uname -m | sed -e "s/i.86/32/" -e "s/x86_64/64/" -e "s/armv7l/32/")
OS_ARCH = $(shell uname -m | sed -e "s/i386/i686/")

# Host compiler, and nvcc driving it through -ccbin
GCC ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(GCC)

# internal flags
NVCCFLAGS   := --shared
CCFLAGS     := -fPIC
# OpenCV include path and the OpenCV modules to link against.
# NOTE(review): replies in this thread call cv::cuda::cvtColor, which is
# provided by the cudaimgproc module as far as I know; that module is not
# listed here - confirm whether -lopencv_cudaimgproc has to be added.
CVCCFLAGS:=-I$(OPENCV_DIR)/include
CVLDFLAGS:=-L$(OPENCV_DIR)/lib -lopencv_core -lopencv_cudafilters -lopencv_cudawarping -lopencv_imgcodecs -lopencv_highgui

LDFLAGS     :=

# Extra user flags
EXTRA_NVCCFLAGS   ?=
EXTRA_LDFLAGS     ?=
EXTRA_CCFLAGS     ?=

# Target ABI (64-bit ARM); the hardfp alternative is kept commented out below
override abi := aarch64
LDFLAGS += --dynamic-linker=/lib/ld-linux-aarch64.so.1

# For hardfp
#override abi := gnueabihf
#LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
#CCFLAGS += -mfloat-abi=hard

# Cross-compilation settings, active only when ARMv7=1 is given.
# TARGET_FS, if set, is used as the sysroot.
ifeq ($(ARMv7),1)
NVCCFLAGS += -target-cpu-arch ARM
ifneq ($(TARGET_FS),)
CCFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/$(abi)-linux-gnu

# For hardfp
#LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-$(abi)

endif
endif

# Debug build flags
# Run "make dbg=1" to add -g -G; TARGET is only set here and is not used by
# any rule in the visible part of this makefile.
dbg = 0
ifeq ($(dbg),1)
      NVCCFLAGS += -g -G
      TARGET := debug
else
      TARGET := release
endif

# Compile flags: nvcc flags, plus the host compiler flags forwarded with -Xcompiler
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

# Link flags: all compile flags, plus the linker flags forwarded with -Xlinker
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
INCLUDES  := -I./
LIBRARIES := -L$(LIB_DIR) -lEGL -lGLESv2
LIBRARIES += -L$(TEGRA_LIB_DIR) -lcuda -lrt
################################################################################

# CUDA code generation flags
# Comments are kept on their own lines: with ":=" a trailing "# ..." comment
# would leave the whitespace in front of it inside the variable value.
ifneq ($(OS_ARCH),armv7l)
GENCODE_SM10    := -gencode arch=compute_10,code=sm_10
endif
GENCODE_SM20    := -gencode arch=compute_20,code=sm_20
GENCODE_SM30    := -gencode arch=compute_30,code=sm_30
GENCODE_SM32    := -gencode arch=compute_32,code=sm_32
GENCODE_SM35    := -gencode arch=compute_35,code=sm_35
GENCODE_SM50    := -gencode arch=compute_50,code=sm_50
GENCODE_SMXX    := -gencode arch=compute_50,code=compute_50
# for TX1
GENCODE_SM53    := -gencode arch=compute_53,code=compute_53
# for TX2
GENCODE_SM62    := -gencode arch=compute_62,code=compute_62
# for AGX Xavier
GENCODE_SM72    := -gencode arch=compute_72,code=compute_72

ifeq ($(OS_ARCH),armv7l)
GENCODE_FLAGS   ?= $(GENCODE_SM32)
else
# Supports TX1 (5.3), TX2 (6.2) and AGX Xavier (7.2) -like architectures.
# The first entry used to be spelled $(GEGENCODE_SM53), an undefined variable
# that expanded to nothing, so no code was generated for TX1.
GENCODE_FLAGS   ?= $(GENCODE_SM53) $(GENCODE_SM62) $(GENCODE_SM72)
endif

# Target rules
# all, build, clean and clobber do not name files; declaring them phony keeps
# them working even if a file with one of these names exists.
.PHONY: all build clean clobber

all: build

build: lib-gst-custom-opencv_cudaprocess.so

# Compile the CUDA/C++ source into an object file
gst-custom-opencv_cudaprocess.o : gst-custom-opencv_cudaprocess.cu
	$(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(CVCCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

# Link the shared library that is passed to nvivafilter via customer-lib-name
lib-gst-custom-opencv_cudaprocess.so : gst-custom-opencv_cudaprocess.o
	$(NVCC) $(ALL_LDFLAGS) $(CVLDFLAGS) $(GENCODE_FLAGS) -o $@ $^ $(LIBRARIES)

# -f: do not fail when the files are already gone
clean:
	rm -f lib-gst-custom-opencv_cudaprocess.so gst-custom-opencv_cudaprocess.o

clobber: clean

I think there might be two issues with cv::cuda::rotate:

  • The main one is that it reallocates the buffer for its output. Any data written outside the buffer at pdata will be lost.
  • The second one is that rotate may not work correctly with an alpha channel.

You may try:

/*
 * Rotates the RGBA frame stored at pdata and writes the result back to pdata.
 * The frame is converted to RGB, rotated into a separate GpuMat and converted
 * back to RGBA, so that rotate never writes directly into the buffer at pdata.
 *
 * pdata  : pointer to the 8-bit, 4-channel frame; also receives the result
 * width  : frame width in pixels
 * height : frame height in pixels
 */
static void cv_process_RGBA(void *pdata, int32_t width, int32_t height)
{
    /* Create a RGBA GpuMat with data pointer */
    cv::cuda::GpuMat d_Mat_RGBA(height, width, CV_8UC4, pdata);

    /* Convert into a RGB GpuMat */
    cv::cuda::GpuMat d_Mat_RGB(height, width, CV_8UC3);
    cv::cuda::cvtColor(d_Mat_RGBA, d_Mat_RGB, cv::COLOR_RGBA2RGB);

    /* Apply rotation into a new RGB GpuMat of the same size */
    cv::cuda::GpuMat d_rot;
    cv::cuda::rotate(d_Mat_RGB, d_rot, cv::Size(d_Mat_RGB.cols, d_Mat_RGB.rows), 10.0);

    /* Convert rotated RGB back into RGBA in device for output */
    cv::cuda::cvtColor(d_rot, d_Mat_RGBA, cv::COLOR_RGB2RGBA);

    /* If OpenCV had to reallocate d_Mat_RGBA, the result is no longer in the
       buffer at pdata and would be lost without any visible error. */
    if(d_Mat_RGBA.data != pdata)
        std::cerr << "Error reallocated buffer for d_Mat_RGBA" << std::endl;
}

Thank you so much @Honey_Patouceul.
With this code it works fine.

So the target buffer that I should write the output to is pdata, right?

Thank you. I will do more test and share it here.

@Honey_Patouceul

I managed to render the remapped (warped) image in real time using this method.

The problem is that if I don’t resize the result to the original input size, it doesn’t work (it shows only the original image).
If I do resize it to the original size, it shows the warped image scaled to the camera resolution (the original size).

Is it possible to output a frame whose size differs from the original input resolution?

so current processing is like below.

static void cv_process(void *pdata, int32_t width, int32_t height)
{
    
    /* Create a RGBA GpuMat with data pointer */
    cv::cuda::GpuMat d_Mat_RGBA(height, width, CV_8UC4, pdata);

    /* Convert into a RGB GpuMat */
    cv::cuda::GpuMat d_Mat_RGB(height, width, CV_8UC3);
    cv::cuda::GpuMat d_Mat_RGB_remap(d_xmap.size(), CV_8UC3);

    cv::cuda::cvtColor(d_Mat_RGBA, d_Mat_RGB, cv::COLOR_RGBA2RGB);

    cv::cuda::remap(d_Mat_RGB, d_Mat_RGB_remap, d_xmap, d_ymap, cv::INTER_LINEAR, cv::BORDER_REPLICATE);
    cv::cuda::resize(d_Mat_RGB_remap, d_Mat_RGB, cv::Size(width, height));
    cv::cuda::cvtColor(d_Mat_RGB, d_Mat_RGBA, cv::COLOR_RGB2RGBA);

}

One thing you should pay attention to is the final buffer of d_Mat_RGBA. If it has been reallocated (which the last cvtColor will probably do when the width/height differ), the result will be lost. You may add a check before the end of the function:

if(d_Mat_RGBA.data != pdata)
	std::cerr << "Error reallocated buffer for d_Mat_RGBA" << std::endl;

My understanding is that this plugin prepares the output image and lets you modify its pixels before it is output, but does not let you change the frame size (as its name says, it is a filter).

You might also reconsider your processing so that it can be done at a single resolution.

If you don’t need dynamic rescaling, you might try using a bigger output resolution. In the following example, a double-width buffer is used for processing a 640x480 frame, so it becomes 1280x480. This is done with the nvvidconv ROI options.
In nvivafilter, we take the left half of the image and rotate it into the right half.
Then another nvvidconv crops out the right half only:

gst-launch-1.0 videotestsrc ! video/x-raw, width=640, height=480, framerate=30/1 ! nvvidconv top=0 left=0 right=1280 bottom=480 ! 'video/x-raw(memory:NVMM), width=1280, height=480' ! nvivafilter customer-lib-name=./lib-gst-custom-opencv_cudaprocess.so cuda-process=true ! 'video/x-raw(memory:NVMM),format=(string)RGBA,width=1280,height=480' ! nvvidconv left=640 top=0 right=1280 bottom=480 ! 'video/x-raw(memory:NVMM), format=I420, width=640, height=480' ! nvvidconv ! video/x-raw,format=I420 ! xvimagesink

With this rotate processing:

static void cv_process_RGBA(void *pdata, int32_t width, int32_t height)
{
    /* Create a GpuMat with data pointer */
    cv::cuda::GpuMat d_Mat_RGBA(height, width, CV_8UC4, pdata);

    /* Convert into a RGB GpuMat */
    cv::cuda::GpuMat d_Mat_RGB(height, width, CV_8UC3);
    cv::cuda::cvtColor(d_Mat_RGBA, d_Mat_RGB, cv::COLOR_RGBA2RGB);

    /* Rotate left half of image into right half */
    cv::Rect leftRoi(0, 0, width/2, height);
    cv::Rect rightRoi(width/2, 0, width/2, height);
    cv::cuda::rotate(d_Mat_RGB(leftRoi), d_Mat_RGB(rightRoi), cv::Size(d_Mat_RGB.cols/2, d_Mat_RGB.rows), 10.0);

    /* Convert back to RGBA in device for output */
    cv::cuda::cvtColor(d_Mat_RGB, d_Mat_RGBA, cv::COLOR_RGB2RGBA);

    if(d_Mat_RGBA.data != pdata)
	std::cerr << "Error reallocated buffer for d_Mat_RGBA" << std::endl;
}

Thank you for introducing me to this concept.

I found that the remapped image is usually (I’m not sure whether always) smaller than the original image, so I managed to copy the remapped data so that it fits within the original buffer size.

Thank you.