How to Use an Image Fusion Model in DeepStream

I referred to the code of Deepstream sample code snippet - Intelligent Video Analytics / DeepStream SDK - NVIDIA Developer Forums, and I converted the output of nvinfer into a 768x1024 cv::Mat. I tried to place this image in the lower left corner of the nvbufsurface, but encountered some data conversion issues. Below is my code.

static GstPadProbeReturn
tiler_src_pad_buffer_probe (GstPad * pad, GstPadProbeInfo * info,
    gpointer u_data)
{
  GstBuffer *buf = (GstBuffer *) info->data;
  guint num_rects = 0; 
  NvDsObjectMeta *obj_meta = NULL;
  guint vehicle_count = 0;
  guint person_count = 0;
  NvDsMetaList * l_frame = NULL;
  NvDsMetaList * l_obj = NULL;
  NvDsDisplayMeta *display_meta = NULL;

  NvDsBatchMeta *batch_meta = gst_buffer_get_nvds_batch_meta (buf);


  // Get original raw data
  GstMapInfo in_map_info;
  if (!gst_buffer_map (buf, &in_map_info, GST_MAP_READ)) {
      g_print ("Error: Failed to map gst buffer\n");
      gst_buffer_unmap (buf, &in_map_info);
      return GST_PAD_PROBE_OK;
  }
  NvBufSurface *surface = (NvBufSurface *)in_map_info.data;

  for (l_frame = batch_meta->frame_meta_list; l_frame != NULL;
    l_frame = l_frame->next) {
      NvDsFrameMeta *frame_meta = (NvDsFrameMeta *) (l_frame->data);

    NvDsUserMetaList *usrMetaList = frame_meta->frame_user_meta_list;
    if (usrMetaList != NULL) {
      NvDsUserMeta *usrMetaData = (NvDsUserMeta *) usrMetaList->data;

      if(usrMetaData->base_meta.meta_type == NVDSINFER_TENSOR_OUTPUT_META){
          // NvDsFrameMeta *frame_meta = (NvDsFrameMeta *) (l_frame->data);
          //TODO for cuda device memory we need to use cudamemcpy
          NvBufSurfaceMap (surface, -1, -1, NVBUF_MAP_READ);
          /* Cache the mapped data for CPU access */
          NvBufSurfaceSyncForCpu (surface, 0, 0); //will do nothing for unified memory type on dGPU
          guint surface_height = surface->surfaceList[frame_meta->batch_id].height;
          guint surface_width = surface->surfaceList[frame_meta->batch_id].width;

          //Create Mat from NvMM memory, refer opencv API for how to create a Mat
          cv::Mat nv12_mat = cv::Mat(surface_height*3/2, surface_width, CV_8UC1, surface->surfaceList[frame_meta->batch_id].mappedAddr.addr[0],
          surface->surfaceList[frame_meta->batch_id].pitch);

          NvDsInferTensorMeta *meta = (NvDsInferTensorMeta *) usrMetaData->user_meta_data;
          for (unsigned int i = 0; i < meta->num_output_layers; i++) {
            NvDsInferLayerInfo *info = &meta->output_layers_info[i];
            info->buffer = meta->out_buf_ptrs_host[i];
            if (meta->out_buf_ptrs_dev[i]) {
              cudaMemcpy (meta->out_buf_ptrs_host[i], meta->out_buf_ptrs_dev[i],
                  info->inferDims.numElements * 4, cudaMemcpyDeviceToHost);
            }
          }

          //Create image from NVDSINFER_TENSOR_OUTPUT_META
          int ch = meta->output_layers_info->inferDims.d[0];
          int fusion_height = meta->output_layers_info->inferDims.d[1];
          int fusion_width = meta->output_layers_info->inferDims.d[2];
          int o_count = meta->output_layers_info->inferDims.numElements;
          int onechannel_size = fusion_height * fusion_width;
          float *outputCoverageBuffer =(float *) meta->output_layers_info[0].buffer;
          cv::Mat fusion_mat; 
          using image_type = uint8_t;
          int image_format = CV_8UC1;
          image_type* uint8Buffer = (image_type *)malloc(o_count * sizeof(image_type));
          image_type* uint8Buffer_C1 = (image_type *)malloc(onechannel_size * sizeof(image_type));
          image_type* uint8Buffer_C2 = (image_type *)malloc(onechannel_size * sizeof(image_type));
          image_type* uint8Buffer_C3 = (image_type *)malloc(onechannel_size * sizeof(image_type));
          
          for(int o_index=0; o_index < o_count; o_index++){
            uint8Buffer[o_index] = static_cast<uint8_t>(std::min(std::max(outputCoverageBuffer[o_index] * 255.0f, 0.0f), 255.0f));
          }

          for(int o_index=0; o_index < onechannel_size; o_index++){
            uint8Buffer_C1[o_index] = uint8Buffer[o_index];
            uint8Buffer_C2[o_index] = uint8Buffer[o_index + onechannel_size];
            uint8Buffer_C3[o_index] = uint8Buffer[o_index + 2 * onechannel_size];

          }

          std::vector<cv::Mat> channels;
          for(int idx=2;idx>=0;idx--){
            cv::Mat dumpimg;
            if (idx == 0) dumpimg = cv::Mat(fusion_height, fusion_width, image_format, uint8Buffer_C1);
            else if (idx == 1) dumpimg = cv::Mat(fusion_height, fusion_width, image_format, uint8Buffer_C2);
            else dumpimg = cv::Mat(fusion_height, fusion_width, image_format, uint8Buffer_C3);
            channels.emplace_back(dumpimg);
          }
          cv::merge(channels, fusion_mat);
          
            NvBufSurface *inter_buf = nullptr;
            NvBufSurfaceCreateParams create_params;
            create_params.gpuId  = surface->gpuId;
            create_params.width  = surface_height*3/2;
            create_params.height = surface_width;
            create_params.size = 0;
            create_params.colorFormat = NVBUF_COLOR_FORMAT_RGBA;
            create_params.layout = NVBUF_LAYOUT_PITCH;
          #ifdef __aarch64__
            create_params.memType = NVBUF_MEM_DEFAULT;
          #else
            create_params.memType = NVBUF_MEM_CUDA_UNIFIED;
          #endif
            //Create another scratch RGBA NvBufSurface
            if (NvBufSurfaceCreate (&inter_buf, 1,
              &create_params) != 0) {
              GST_ERROR ("Error: Could not allocate internal buffer ");
              return GST_PAD_PROBE_OK;
            }
            if(NvBufSurfaceMap (inter_buf, 0, -1, NVBUF_MAP_READ_WRITE) != 0)
              std::cout << "map error" << std::endl;
            NvBufSurfaceSyncForCpu (inter_buf, 0, 0);
            cv::Mat trans_mat = cv::Mat(surface_height*3/2, surface_width, CV_8UC4, inter_buf->surfaceList[frame_meta->batch_id].mappedAddr.addr[0],
          inter_buf->surfaceList[0].pitch);
          nv12_mat.copyTo(trans_mat);

          cv::Mat dstROI = trans_mat(cv::Rect(0, fusion_height, fusion_mat.cols, fusion_mat.rows));
          cv::cvtColor(fusion_mat, fusion_mat, cv::COLOR_BGRA2RGBA);
          // 将源矩阵复制到目标矩阵的ROI区域
          fusion_mat.copyTo(dstROI);
          char file_name[128];
          sprintf(file_name, "fusion_stream%2d_%03d_2.png", frame_meta->source_id, frame_number);
          cv::imwrite(file_name, fusion_mat);


          NvBufSurfaceSyncForDevice(inter_buf, 0, 0);
          inter_buf->numFilled = 1;
          NvBufSurfTransformConfigParams transform_config_params;
          NvBufSurfTransformParams transform_params;
          NvBufSurfTransformRect src_rect;
          NvBufSurfTransformRect dst_rect;
          cudaStream_t cuda_stream;
          CHECK_CUDA_STATUS (cudaStreamCreate (&cuda_stream),
            "Could not create cuda stream");
          transform_config_params.compute_mode = NvBufSurfTransformCompute_Default;
          transform_config_params.gpu_id = surface->gpuId;
          transform_config_params.cuda_stream = cuda_stream;
          /* Set the transform session parameters for the conversions executed in this
            * thread. */
          NvBufSurfTransform_Error err = NvBufSurfTransformSetSessionParams (&transform_config_params);
          if (err != NvBufSurfTransformError_Success) {
            std::cout <<"NvBufSurfTransformSetSessionParams failed with error "<< err << std::endl;
            return GST_PAD_PROBE_OK;
          }
          /* Set the transform ROIs for source and destination, only do the color format conversion*/
          src_rect = {0, 0, surface_height*3/2, surface_width};
          dst_rect = {0, 0, surface_height*3/2, surface_width};

          /* Set the transform parameters */
          transform_params.src_rect = &src_rect;
          transform_params.dst_rect = &dst_rect;
          transform_params.transform_flag =
            NVBUFSURF_TRANSFORM_FILTER | NVBUFSURF_TRANSFORM_CROP_SRC |
              NVBUFSURF_TRANSFORM_CROP_DST;
          transform_params.transform_filter = NvBufSurfTransformInter_Default;

          /* Transformation format conversion, Transform rotated RGBA mat to NV12 memory in original input surface*/
          err = NvBufSurfTransform (inter_buf, surface, &transform_params);
          if (err != NvBufSurfTransformError_Success) {
            std::cout<< "NvBufSurfTransform failed with error %d while converting buffer" << err <<std::endl;
            return GST_PAD_PROBE_OK;
          }
          // nvds_copy_obj_meta();
          NvBufSurfaceUnMap(inter_buf, 0, 0);
        }
    }
    NvBufSurfaceUnMap(surface, 0, 0);

  }
  frame_number++;
  return GST_PAD_PROBE_OK;
}

NvBufSurfaceSyncForDevice(inter_buf, 0, 0) and NvBufSurfaceSyncForCpu(inter_buf, 0, 0) always return -1 and the error nvbufsurface: Wrong buffer index (0). My model output can be the Y channel of YUV or an RGB image; in this code, I am using a model that outputs an RGB image, and the model’s output is correct. The data in the nv12 format of nvbufsurface mapped to cv::Mat only includes one channel, while the model output is saved in the format of a three-channel RGB image. Is it feasible to directly use OpenCV for format conversion? Or should I use a model that outputs the Y channel?