Could you apply the patch below to the open-source code and check whether performance improves in your environment?
--- sources\gst-plugins\gst-nvdewarper\nvdewarper.cpp
***************
*** 41,46 ****
--- 41,47 ----
#include "cudaEGL.h"
#endif
+ #define USE_CUDA_STREAM
/* Dewarper #defines */
#ifndef M_PI
***************
*** 245,250 ****
--- 246,252 ----
/* Test measurement with 10 iterations */
#ifdef USE_CUDA_STREAM
warper.warp(nvdewarper->stream, srcTex, dstBuffer, cuDstRowBytes);
+ cudaStreamSynchronize(nvdewarper->stream);
#else
warper.warp(0, srcTex, dstBuffer, cuDstRowBytes);
cudaErr = cudaDeviceSynchronize();