How to convert Deepstream NvBufSurface to VPIImage(3.2)?

I am trying to convert an NvBufSurface frame to a VPIImage so that I can apply cropped bounding-box enhancements and render them as a PIP (Picture-in-Picture) in the bottom right-hand corner of the output RTSP video stream. Currently I do this with OpenCV operations, but that requires moving the NvBufSurface from GPU to CPU and back to GPU, which adds a lot of overhead.

Is there a way to convert the NvBufSurface to a VPIImage properly while keeping it in GPU memory to do the image operations (EqualizeHistogram and Median Filter)?

Today I do this with the following three methods, using OpenCV for the digital image enhancement and for adding the Picture-in-Picture (PIP) to the original frame:

/*
 * Crops the object described by crop_rect_params (plus 25% padding on each
 * side) out of the batched input surface at `idx`, scales it to the plugin's
 * processing resolution via NvBufSurfTransform, and converts the mapped
 * RGBA result into a BGR cv::Mat in pip->cvmat for CPU-side enhancement.
 *
 * @param pip              plugin instance (gpu_id, cuda_stream, inter_buf, cvmat)
 * @param input_buf        batched input surface
 * @param idx              index of the frame within the batch
 * @param crop_rect_params object bounding box to crop
 * @param ratio            [out] aspect-preserving scale factor actually applied
 * @param input_width      full frame width, used to clamp the padded crop
 * @param input_height     full frame height, used to clamp the padded crop
 * @return GST_FLOW_OK on success, GST_FLOW_ERROR otherwise
 */
static GstFlowReturn get_converted_mat(GstPip *pip, NvBufSurface *input_buf, gint idx,
                                       NvOSD_RectParams *crop_rect_params, gdouble &ratio, gint input_width,
                                       gint input_height) {
    NvBufSurfTransform_Error err;
    NvBufSurfTransformConfigParams transform_config_params;
    NvBufSurfTransformParams transform_params;
    NvBufSurfTransformRect src_rect;
    NvBufSurfTransformRect dst_rect;
    NvBufSurface ip_surf;
    cv::Mat in_mat;
    ip_surf = *input_buf;

    /* Build a single-surface view of the batch so the transform only
     * touches the frame at `idx`. */
    ip_surf.numFilled = ip_surf.batchSize = 1;
    ip_surf.surfaceList = &(input_buf->surfaceList[idx]);

    /* Reject degenerate crops up front, BEFORE any division below.
     * (Previously this check ran only after hdest/wdest and ratio had
     * already divided by src_width/src_height.) Early return is used
     * instead of `goto error` so no initialized declaration is skipped. */
    if ((crop_rect_params->width == 0) || (crop_rect_params->height == 0)) {
        NVGSTDS_ERR_MSG_V("Crop rect params dimensions are zero");
        return GST_FLOW_ERROR;
    }

    /* The hardware transform requires even offsets and sizes. */
    gint src_left = GST_ROUND_UP_2((unsigned int)crop_rect_params->left);
    gint src_top = GST_ROUND_UP_2((unsigned int)crop_rect_params->top);
    gint src_width = GST_ROUND_DOWN_2((unsigned int)crop_rect_params->width);
    gint src_height = GST_ROUND_DOWN_2((unsigned int)crop_rect_params->height);

    /* Extra padding around the object: 25% of width and height per side.
     * (The factor is 0.25; an earlier comment incorrectly said 12.5%.) */
    gint padding_width = src_width * 0.25;
    gint padding_height = src_height * 0.25;

    /* Adjust left, top, width, and height while ensuring they remain
     * within the frame bounds. */
    src_left = std::max(0, src_left - padding_width);
    src_top = std::max(0, src_top - padding_height);
    src_width = std::min(input_width - src_left, src_width + 2 * padding_width);
    src_height = std::min(input_height - src_top, src_height + 2 * padding_height);

    /* Rounding a 1-pixel box down to even can still yield a zero size. */
    if (src_width <= 0 || src_height <= 0) {
        NVGSTDS_ERR_MSG_V("Crop rect params dimensions are zero");
        return GST_FLOW_ERROR;
    }

    /* Maintain aspect ratio when fitting into the processing resolution. */
    double hdest = pip->processing_width * src_height / (double)src_width;
    double wdest = pip->processing_height * src_width / (double)src_height;
    guint dest_width, dest_height;

    if (hdest <= pip->processing_height) {
        dest_width = pip->processing_width;
        dest_height = hdest;
    } else {
        dest_width = wdest;
        dest_height = pip->processing_height;
    }

    /* Configure transform session parameters for the transformation */
    transform_config_params.compute_mode = NvBufSurfTransformCompute_Default;
    transform_config_params.gpu_id = pip->gpu_id;
    transform_config_params.cuda_stream = pip->cuda_stream;

    /* Set the transform session parameters for the conversions executed in
     * this thread. */
    err = NvBufSurfTransformSetSessionParams(&transform_config_params);
    if (err != NvBufSurfTransformError_Success) {
        NVGSTDS_ERR_MSG_V("NvBufSurfTransformSetSessionParams failed with error %d", err);
        goto error;
    }

    /* Calculate scaling ratio while maintaining aspect ratio */
    ratio = MIN(1.0 * dest_width / src_width, 1.0 * dest_height / src_height);

#ifdef __aarch64__
    if (ratio <= 1.0 / 16 || ratio >= 16.0) {
        /* Currently cannot scale by ratio > 16 or < 1/16 for Jetson */
        NVGSTDS_ERR_MSG_V("Cannot scale by ratio > 16 or < 1/16 for Jetson.");
        goto error;
    }
#endif
    /* Set the transform ROIs for source and destination.
     * NvBufSurfTransformRect field order is {top, left, width, height}. */
    src_rect = {(guint)src_top, (guint)src_left, (guint)src_width, (guint)src_height};
    dst_rect = {0, 0, (guint)dest_width, (guint)dest_height};

    /* Set the transform parameters */
    transform_params.src_rect = &src_rect;
    transform_params.dst_rect = &dst_rect;
    transform_params.transform_flag =
        NVBUFSURF_TRANSFORM_FILTER | NVBUFSURF_TRANSFORM_CROP_SRC | NVBUFSURF_TRANSFORM_CROP_DST;
    transform_params.transform_filter = NvBufSurfTransformInter_Default;

    /* Clear the intermediate buffer so letterbox padding stays black. */
    NvBufSurfaceMemSet(pip->inter_buf, 0, 0, 0);

    /* Transformation: scaling + format conversion, if any. */
    err = NvBufSurfTransform(&ip_surf, pip->inter_buf, &transform_params);
    if (err != NvBufSurfTransformError_Success) {
        NVGSTDS_ERR_MSG_V("NvBufSurfTransform failed with error %d while converting buffer", err);
        goto error;
    }
    /* Map the buffer so that it can be accessed by CPU */
    if (NvBufSurfaceMap(pip->inter_buf, 0, 0, NVBUF_MAP_READ) != 0) {
        goto error;
    }
    if (pip->inter_buf->memType == NVBUF_MEM_SURFACE_ARRAY) {
        /* Sync hardware-written data into the CPU cache before reading. */
        NvBufSurfaceSyncForCpu(pip->inter_buf, 0, 0);
    }

    /* Wrap the mapped RGBA data (no copy) honoring the surface pitch, then
     * convert to BGR. Can be skipped if the downstream algorithm can handle
     * padded RGBA data directly. */
    in_mat = cv::Mat(pip->processing_height, pip->processing_width, CV_8UC4,
                     pip->inter_buf->surfaceList[0].mappedAddr.addr[0], pip->inter_buf->surfaceList[0].pitch);

#if (CV_MAJOR_VERSION >= 4)
    cv::cvtColor(in_mat, *pip->cvmat, cv::COLOR_RGBA2BGR);
#else
    cv::cvtColor(in_mat, *pip->cvmat, CV_RGBA2BGR);
#endif

    if (NvBufSurfaceUnMap(pip->inter_buf, 0, 0)) {
        goto error;
    }

    /* We will first convert only the Region of Interest (the entire frame or
     * the object bounding box) to RGB and then scale the converted RGB frame to
     * processing resolution. */
    return GST_FLOW_OK;

error:
    return GST_FLOW_ERROR;
}

/*
 * Calculates an object's PIP (picture-in-picture) region: a rectangle of
 * `scale` times the frame size, anchored at the bottom-right corner of the
 * frame whose dimensions are read from the surface's plane parameters.
 *
 * @param surface    batched surface; frame size comes from planeParams
 * @param frame_meta supplies batch_id to select the frame in the batch
 * @param pipWidth   [out] PIP width in pixels
 * @param pipHeight  [out] PIP height in pixels
 * @param pipPosX    [out] left edge of the PIP within the frame
 * @param pipPosY    [out] top edge of the PIP within the frame
 * @param scale      fraction of the frame each PIP dimension occupies
 *                   (defaults to 0.2, the previously hard-coded value)
 * @return GST_FLOW_OK, or GST_FLOW_ERROR if the region is zero-sized
 */
static GstFlowReturn calculate_object_region(NvBufSurface *surface, NvDsFrameMeta *frame_meta, int &pipWidth,
                                             int &pipHeight, int &pipPosX, int &pipPosY,
                                             double scale = 0.2) {
    /* Hoist the repeated surfaceList lookup. */
    const auto &plane = surface->surfaceList[frame_meta->batch_id].planeParams;
    const int frame_width = static_cast<int>(plane.width[0]);
    const int frame_height = static_cast<int>(plane.height[0]);

    pipWidth = static_cast<int>(frame_width * scale);
    pipHeight = static_cast<int>(frame_height * scale);
    pipPosX = frame_width - pipWidth;
    pipPosY = frame_height - pipHeight;

    if ((pipWidth == 0) || (pipHeight == 0)) {
        NVGSTDS_ERR_MSG_V("PIP dimensions are zero");
        return GST_FLOW_ERROR;
    }

    return GST_FLOW_OK;
}

/*
 * Digitally enhances the cropped object image (pip->cvmat) and composites it
 * as a picture-in-picture into the bottom-right corner of the frame (in_mat).
 * The enhanced PIP is also cached per source together with a timestamp.
 */
static GstFlowReturn digitally_enhance_object(GstPip *pip, NvBufSurface *surface, NvDsFrameMeta *frame_meta,
                                              cv::Mat &in_mat) {
    // Brightness/contrast boost: out = 1.2 * in + 25, keeping the input depth.
    cv::Mat boosted;
    pip->cvmat->convertTo(boosted, -1, 1.2, 25);

    // Work out where the PIP goes and how big it is.
    int pipWidth, pipHeight, pipPosX, pipPosY;
    if (calculate_object_region(surface, frame_meta, pipWidth, pipHeight, pipPosX, pipPosY) != GST_FLOW_OK) {
        return GST_FLOW_ERROR;
    }

    // Scale the boosted crop to the PIP size.
    cv::resize(boosted, boosted, cv::Size(pipWidth, pipHeight));

    // Denoise: median blur first, then an edge-preserving bilateral pass.
    cv::medianBlur(boosted, boosted, 3);
    cv::Mat denoised;
    cv::bilateralFilter(boosted, denoised, 9, 75, 75);

    // Sharpen with a standard 3x3 kernel (coefficients sum to 1, so overall
    // brightness is preserved).
    cv::Mat sharpenKernel = (cv::Mat_<float>(3, 3) << -1, -1, -1, -1, 9, -1, -1, -1, -1);
    cv::Mat sharpened;
    cv::filter2D(denoised, sharpened, -1, sharpenKernel);

    // Adaptive histogram equalization (CLAHE) on the luminance only, then
    // back to BGR. NOTE(review): `clahe` is defined elsewhere in this
    // translation unit — confirm it is initialized before first use.
    cv::Mat gray, equalized;
    cv::cvtColor(sharpened, gray, cv::COLOR_BGR2GRAY);
    clahe->setClipLimit(2.0);
    clahe->apply(gray, equalized);
    cv::cvtColor(equalized, sharpened, cv::COLOR_GRAY2BGR);

    // Composite into the frame. copyTo requires matching types, so the PIP
    // gets an alpha channel first — assumes in_mat is 4-channel (BGRA);
    // verify against the caller.
    cv::Mat pipWithAlpha;
    cv::cvtColor(sharpened, pipWithAlpha, cv::COLOR_BGR2BGRA);
    cv::Rect pipRect(pipPosX, pipPosY, pipWidth, pipHeight);
    pipWithAlpha.copyTo(in_mat(pipRect));

    // Cache a deep copy of the enhanced PIP and its timestamp per source.
    pip->cached_frames[frame_meta->source_id] = pipWithAlpha.clone();
    pip->cached_timestamps[frame_meta->source_id] = std::chrono::steady_clock::now();

    return GST_FLOW_OK;
}

The output looks like so: