Training SegFormer with Nv-DinoV2 backbone on Segmentation Task

Could you run the fan_base backbone first (before switching to vit_large_nvdinov2)?
Docker: nvcr.io/nvidia/tao/tao-toolkit:5.5.0-pyt
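
To open a shell inside that container, something like the following should work (a minimal sketch; adjust --gpus and the volume mount to your own setup and paths):

$ docker run --gpus all -it --rm \
    -v /localhome/local-morganh:/localhome/local-morganh \
    nvcr.io/nvidia/tao/tao-toolkit:5.5.0-pyt /bin/bash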

An example YAML spec is below. The pretrained model is from https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tao/models/pretrained_segformer_imagenet/files?version=fan_hybrid_base_in22k_1k_384.
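
If you have the NGC CLI installed, the checkpoint can also be fetched from the command line (a sketch; downloading from the URL above in a browser works just as well):

$ ngc registry model download-version "nvidia/tao/pretrained_segformer_imagenet:fan_hybrid_base_in22k_1k_384"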

$ cat fanbase.yaml
results_dir: /localhome/local-morganh/segformer/fanbase

train:
  num_gpus: 1
  exp_config:
      manual_seed: 49
  checkpoint_interval: 200
  logging_interval: 10
  max_iters: 20000
  resume_training_checkpoint_path: null
  validate: True
  validation_interval: 10
  trainer:
      find_unused_parameters: True
      sf_optim:
        lr: 0.00006
evaluate:
  checkpoint: /localhome/local-morganh/segformer/fanbase/train/iter_20000.pth
model:
  input_height: 672
  input_width: 672
  pretrained_model_path: /localhome/local-morganh/segformer/fan_hybrid_base_in22k_1k_384.pth  #https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tao/models/pretrained_segformer_imagenet/files?version=fan_hybrid_base_in22k_1k_384
  #pretrained_model_path: null
  backbone:
    type: "fan_base_16_p4_hybrid"
    #type: "fan_large_16_p4_hybrid"
    #type: "vit_huge_nvclip_14_siglip"
    # type: "vit_base_nvclip_16_siglip"
    #type: "vit_large_nvdinov2"
    #type: "mit_b5"

dataset:
  input_type: "grayscale"
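  # mean/std of 127.5 normalize 8-bit pixel values from [0, 255] to roughly [-1, 1]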
  img_norm_cfg:
        mean:
          - 127.5
          - 127.5
          - 127.5
        std:
          - 127.5
          - 127.5
          - 127.5
        to_rgb: True
  data_root: /tao-pt/tao-experiments
  train_dataset:
      img_dir:
        - /localhome/local-morganh/segformer/39_password_Div8-rd-nvidia/crop_NG/train
      ann_dir:
        - /localhome/local-morganh/segformer/39_password_Div8-rd-nvidia/crop_mask/train
      pipeline:
        augmentation_config:
          random_crop:
            #crop_size:
            #  - 672
            #  - 672
            cat_max_ratio: 0.75
          resize:
            img_scale:
              - 672
              - 1024
            ratio_range:
              - 0.5
              - 2.0
          random_flip:
            prob: 0.5
  val_dataset:
      img_dir: /localhome/local-morganh/segformer/39_password_Div8-rd-nvidia/crop_NG/val
      ann_dir: /localhome/local-morganh/segformer/39_password_Div8-rd-nvidia/crop_mask/val
      #img_dir: /localhome/local-morganh/segformer/39_password_Div8-rd-nvidia/crop_NG/train
      #ann_dir: /localhome/local-morganh/segformer/39_password_Div8-rd-nvidia/crop_mask/train
  test_dataset:
      #NG images
      img_dir: /localhome/local-morganh/segformer/39_password_Div8-rd-nvidia/crop_NG/val
      ann_dir: /localhome/local-morganh/segformer/39_password_Div8-rd-nvidia/crop_mask/val
      #OK images
      #img_dir: /localhome/local-morganh/segformer/39_password_Div8-rd-nvidia/crop_NG/val_nomask
      #ann_dir: /localhome/local-morganh/segformer/39_password_Div8-rd-nvidia/crop_mask/val_nomask
      #full image
      #img_dir: /localhome/local-morganh/segformer/39_password_Div8-rd-nvidia/NG
      #ann_dir: /localhome/local-morganh/segformer/39_password_Div8-rd-nvidia/mask_NG
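  # Per the palette below: mask pixel value 0 maps to background (label_id 0),
  # and 255 maps to foreground (label_id 1)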
  palette:
    - seg_class: background
      rgb:
        - 0
        - 0
        - 0
      label_id: 0
      mapping_class: background
    - seg_class: foreground
      rgb:
        - 255
        - 255
        - 255
      label_id: 1
      mapping_class: foreground
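  # repeat_data_times repeats the dataset within each epoch (handy for small datasets)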
  repeat_data_times: 500
  batch_size: 8
  workers_per_gpu: 1

export:
  input_height: 672
  input_width: 672
  input_channel: 3
  onnx_file: "${results_dir}/iter_500.onnx"

gen_trt_engine:
  input_width: 672
  input_height: 672
  tensorrt:
    data_type: FP32
    workspace_size: 1024
    min_batch_size: 1
    opt_batch_size: 1
    max_batch_size: 1


Run training:
$ segformer train -e /localhome/local-morganh/segformer/fanbase.yaml

Results (e.g., checkpoints such as iter_20000.pth and training logs) are written under the folder: /localhome/local-morganh/segformer/fanbase/train/
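
After training, the same spec file drives the rest of the pipeline. A rough sketch of the follow-up steps (the evaluate section above already points at iter_20000.pth; the export.checkpoint and gen_trt_engine.onnx_file overrides here are assumptions, adjust them to your run):

$ segformer evaluate -e /localhome/local-morganh/segformer/fanbase.yaml
$ segformer export -e /localhome/local-morganh/segformer/fanbase.yaml \
    export.checkpoint=/localhome/local-morganh/segformer/fanbase/train/iter_20000.pth
$ segformer gen_trt_engine -e /localhome/local-morganh/segformer/fanbase.yaml \
    gen_trt_engine.onnx_file=/localhome/local-morganh/segformer/fanbase/iter_500.onnx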