Executing `tao model detectnet_v2 train` fails

I am sorry for asking several times, but I am still stuck with TAO detectnet_v2.

I have a lot of annotation data in YOLO format, together with the corresponding image data.

3 0.498760 0.490741 0.859623 0.768519
0 0.728671 0.213955 0.252976 0.082672
1 0.478423 0.380622 0.730655 0.085317
2 0.544643 0.530093 0.416667 0.184524

However, I was told that TAO Toolkit cannot read YOLO format data, so I converted the YOLO format data to KITTI format data.

card 0 0 0 99.28583999999995 204.44448 1337.14296 1680.0009599999998 0 0 0 0 0 0 0 
barcode 0 0 0 867.1435199999999 331.42848000000004 1231.42896 490.15871999999996 0 0 0 0 0 0 0
name 0 0 0 162.85751999999994 648.8899200000001 1215.00072 812.69856 0 0 0 0 0 0 0
price 0 0 0 484.28567999999996 840.63552 1084.2861599999999 1194.9216 0 0 0 0 0 0 0

Then I used the TAO dataset converter tool to convert the KITTI data to TFRecords files.

# Spec file for the dataset conversion (KITTI -> TFRecords)
!cat $LOCAL_SPECS_DIR/spec_tfrecords_kitti.txt
kitti_config {
  root_directory_path: "/workspace/tao-experiments/data/"
  image_dir_name: "training/images_aikata"
  label_dir_name: "training/labels_aikata"
  image_extension: ".jpg"
  partition_mode: "random"
  num_partitions: 2
  val_split: 20
  num_shards: 10
}
image_directory_path: "/workspace/tao-experiments/data/"
target_class_mapping {
  key: "barcode"
  value: "barcode"
}
target_class_mapping {
  key: "name"
  value: "name"
}
target_class_mapping {
  key: "price"
  value: "price"
}
target_class_mapping {
  key: "card"
  value: "card"
}
target_class_mapping {
  key: "card7p"
  value: "card7p"
}
target_class_mapping {
  key: "unknown"
  value: "unknown"
}

# Spec file for training
!cat $LOCAL_SPECS_DIR/spec_train_kitti.txt
random_seed: 42
dataset_config {
  data_sources {
    tfrecords_path: "/workspace/tao-experiments/data/tfrecords_aikata/*"
    image_directory_path: "/workspace/tao-experiments/data/training"
  }
  image_extension: "jpg"
  target_class_mapping {
    key: "barcode"
    value: "barcode"
  }
  target_class_mapping {
    key: "name"
    value: "name"
  }
  target_class_mapping {
    key: "price"
    value: "price"
  }
  target_class_mapping {
    key: "card"
    value: "card"
  }
  target_class_mapping {
    key: "card7p"
    value: "card7p"
  }
  target_class_mapping {
    key: "unknown"
    value: "unknown"
  }
  validation_fold: 0
}
model_config {
  pretrained_model_file: "/workspace/tao-experiments/detectnet_v2_test/pretrained_resnet18/pretrained_detectnet_v2_vresnet18/resnet18.hdf5"
  num_layers: 18
  use_batch_norm: true
  objective_set {
    bbox {
      scale: 35.0
      offset: 0.5
    }
    cov {
    }
  }
  arch: "resnet"
}
augmentation_config {
  preprocessing {
    output_image_width: 1440
    output_image_height: 1920
    min_bbox_width: 1.0
    min_bbox_height: 1.0
    output_image_channel: 3
  }
  spatial_augmentation {
    hflip_probability: 0.5
    zoom_min: 1.0
    zoom_max: 1.0
    translate_max_x: 8.0
    translate_max_y: 8.0
  }
  color_augmentation {
    hue_rotation_max: 25.0
    saturation_shift_max: 0.20000000298
    contrast_scale_max: 0.10000000149
    contrast_center: 0.5
  }
}
postprocessing_config {
  target_class_config {
    key: "barcode"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00499999988824
        dbscan_eps: 0.20000000298
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "name"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00499999988824
        dbscan_eps: 0.15000000596
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "price"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "card"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "card7p"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "unknown"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
}
evaluation_config {
  validation_period_during_training: 10
  first_validation_epoch: 30
  minimum_detection_ground_truth_overlap {
    key: "barcode"
    value: 0.699999988079
  }
  minimum_detection_ground_truth_overlap {
    key: "name"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "price"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "card"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "card7p"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "unknown"
    value: 0.5
  }
  evaluation_box_config {
    key: "barcode"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "name"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "price"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "card"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "card7p"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "unknown"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  average_precision_mode: INTEGRATE
}
cost_function_config {
  target_classes {
    name: "barcode"
    class_weight: 1.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "name"
    class_weight: 8.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 1.0
    }
  }
  target_classes {
    name: "price"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "card"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "card7p"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "unknown"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  enable_autoweighting: false
  max_objective_weight: 0.999899983406
  min_objective_weight: 9.99999974738e-05
}
training_config {
  batch_size_per_gpu: 4
  num_epochs: 120
  learning_rate {
    soft_start_annealing_schedule {
      min_learning_rate: 5e-07
      max_learning_rate: 5e-05
      soft_start: 0.10000000149
      annealing: 0.699999988079
    }
  }
  regularizer {
    type: L1
    weight: 3.00000002618e-09
  }
  optimizer {
    adam {
      epsilon: 9.99999993923e-09
      beta1: 0.899999976158
      beta2: 0.999000012875
    }
  }
  cost_scaling {
    initial_exponent: 20.0
    increment: 0.005
    decrement: 1.0
  }
  visualizer{
    enabled: true
    num_images: 3
    scalar_logging_frequency: 50
    infrequent_logging_frequency: 5
    target_class_config {
      key: "barcode"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "name"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "price"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "card"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "card7p"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "unknown"
      value: {
        coverage_threshold: 0.005
      }
    }
    clearml_config{
      project: "TAO Toolkit ClearML Demo"
      task: "detectnet_v2_resnet18_clearml"
      tags: "detectnet_v2"
      tags: "training"
      tags: "resnet18"
      tags: "unpruned"
    }
    wandb_config{
      project: "TAO Toolkit Wandb Demo"
      name: "detectnet_v2_resnet18_wandb"
      tags: "detectnet_v2"
      tags: "training"
      tags: "resnet18"
      tags: "unpruned"
    }
  }
  checkpoint_interval: 10
}
bbox_rasterizer_config {
  target_class_config {
    key: "barcode"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 0.40000000596
      cov_radius_y: 0.40000000596
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "name"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "price"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "card"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "card7p"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "unknown"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  deadzone_radius: 0.400000154972

# TAO Dataset Converter
!tao model detectnet_v2 dataset_convert -d /workspace/tao-experiments/detectnet_v2/specs/spec_tfrecords_kitti.txt \
                                        -o /workspace/tao-experiments/data/tfrecords_aikata
...                              
2024-06-03 09:04:18,874 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 105: Class map. 
Label in GT: Label in tfrecords file 
b'card': b'card'
b'name': b'name'
b'price': b'price'
b'barcode': b'barcode'
b'card7p': b'card7p'
b'unknown': b'unknown'
For the dataset_config in the experiment_spec, please use labels in the tfrecords file, while writing the classmap.

2024-06-03 09:04:18,874 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 114: Tfrecords generation complete.
Execution status: PASS
2024-06-03 18:04:23,922 [TAO Toolkit] [INFO] nvidia_tao_cli.components.docker_handler.docker_handler 363: Stopping container.                                        

So I thought the dataset was ready for training with TAO and ran the following command, but it failed.

!tao model detectnet_v2 train -e $SPECS_DIR/spec_train_kitti.txt \
                        -r $USER_EXPERIMENT_DIR/experiment_dir_unpruned \
                        -k aikata \
                        -n resnet18_detector \
                        --gpus $NUM_GPUS
2024-06-03 18:10:45,022 [TAO Toolkit] [INFO] root 160: Registry: ['nvcr.io']
2024-06-03 18:10:45,086 [TAO Toolkit] [INFO] nvidia_tao_cli.components.instance_handler.local_instance 360: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:5.0.0-tf1.15.5
2024-06-03 18:10:45,321 [TAO Toolkit] [INFO] nvidia_tao_cli.components.docker_handler.docker_handler 301: Printing tty value True
2024-06-03 09:10:46.078291: I tensorflow/stream_executor/platform/default/dso_loader.cc:50] Successfully opened dynamic library libcudart.so.12
2024-06-03 09:10:46,128 [TAO Toolkit] [WARNING] tensorflow 40: Deprecation warnings have been disabled. Set TF_ENABLE_DEPRECATION_WARNINGS=1 to re-enable them.
Using TensorFlow backend.
2024-06-03 09:10:47,178 [TAO Toolkit] [WARNING] tensorflow 43: TensorFlow will not use sklearn by default. This improves performance in some cases. To enable sklearn export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:10:47,205 [TAO Toolkit] [WARNING] tensorflow 42: TensorFlow will not use Dask by default. This improves performance in some cases. To enable Dask export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:10:47,209 [TAO Toolkit] [WARNING] tensorflow 43: TensorFlow will not use Pandas by default. This improves performance in some cases. To enable Pandas export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:10:48,220 [TAO Toolkit] [WARNING] matplotlib 500: Matplotlib created a temporary config/cache directory at /tmp/matplotlib-9thmcfat because the default path (/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.
2024-06-03 09:10:48,390 [TAO Toolkit] [INFO] matplotlib.font_manager 1633: generated new fontManager
WARNING:tensorflow:Deprecation warnings have been disabled. Set TF_ENABLE_DEPRECATION_WARNINGS=1 to re-enable them.
Using TensorFlow backend.
WARNING:tensorflow:TensorFlow will not use sklearn by default. This improves performance in some cases. To enable sklearn export the environment variable  TF_ALLOW_IOLIBS=1.
...
2024-06-03 09:10:53,326 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 133: Loading weights from pretrained model file. /workspace/tao-experiments/detectnet_v2_test/pretrained_resnet18/pretrained_detectnet_v2_vresnet18/resnet18.hdf5
2024-06-03 09:10:53,326 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer input_1 weights set from pre-trained model.
2024-06-03 09:10:53,446 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer conv1 weights set from pre-trained model.
2024-06-03 09:10:53,560 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer bn_conv1 weights set from pre-trained model.
2024-06-03 09:10:53,560 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer activation_1 weights set from pre-trained model.
2024-06-03 09:10:53,675 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1a_conv_1 weights set from pre-trained model.
2024-06-03 09:10:53,815 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1a_bn_1 weights set from pre-trained model.
2024-06-03 09:10:53,962 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1a_conv_2 weights set from pre-trained model.
2024-06-03 09:10:54,099 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1a_conv_shortcut weights set from pre-trained model.
2024-06-03 09:10:54,225 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1a_bn_2 weights set from pre-trained model.
2024-06-03 09:10:54,342 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1a_bn_shortcut weights set from pre-trained model.
2024-06-03 09:10:54,342 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer add_1 weights set from pre-trained model.
2024-06-03 09:10:54,455 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1b_conv_1 weights set from pre-trained model.
2024-06-03 09:10:54,632 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1b_bn_1 weights set from pre-trained model.
2024-06-03 09:10:54,770 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1b_conv_2 weights set from pre-trained model.
2024-06-03 09:10:54,885 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1b_bn_2 weights set from pre-trained model.
2024-06-03 09:10:54,885 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer add_2 weights set from pre-trained model.
2024-06-03 09:10:54,996 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2a_conv_1 weights set from pre-trained model.
2024-06-03 09:10:55,111 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2a_bn_1 weights set from pre-trained model.
2024-06-03 09:10:55,225 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2a_conv_2 weights set from pre-trained model.
2024-06-03 09:10:55,337 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2a_conv_shortcut weights set from pre-trained model.
2024-06-03 09:10:55,454 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2a_bn_2 weights set from pre-trained model.
2024-06-03 09:10:55,572 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2a_bn_shortcut weights set from pre-trained model.
2024-06-03 09:10:55,572 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer add_3 weights set from pre-trained model.
2024-06-03 09:10:55,685 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2b_conv_1 weights set from pre-trained model.
2024-06-03 09:10:55,808 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2b_bn_1 weights set from pre-trained model.
2024-06-03 09:10:55,931 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2b_conv_2 weights set from pre-trained model.
2024-06-03 09:10:56,051 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2b_bn_2 weights set from pre-trained model.
2024-06-03 09:10:56,051 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer add_4 weights set from pre-trained model.
2024-06-03 09:10:56,168 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3a_conv_1 weights set from pre-trained model.
2024-06-03 09:10:56,360 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3a_bn_1 weights set from pre-trained model.
2024-06-03 09:10:56,570 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3a_conv_2 weights set from pre-trained model.
2024-06-03 09:10:56,770 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3a_conv_shortcut weights set from pre-trained model.
2024-06-03 09:10:56,969 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3a_bn_2 weights set from pre-trained model.
2024-06-03 09:10:57,139 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3a_bn_shortcut weights set from pre-trained model.
2024-06-03 09:10:57,139 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer add_5 weights set from pre-trained model.
2024-06-03 09:10:57,278 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3b_conv_1 weights set from pre-trained model.
2024-06-03 09:10:57,399 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3b_bn_1 weights set from pre-trained model.
2024-06-03 09:10:57,534 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3b_conv_2 weights set from pre-trained model.
2024-06-03 09:10:57,749 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3b_bn_2 weights set from pre-trained model.
2024-06-03 09:10:57,749 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer add_6 weights set from pre-trained model.
2024-06-03 09:10:57,887 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4a_conv_1 weights set from pre-trained model.
2024-06-03 09:10:58,022 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4a_bn_1 weights set from pre-trained model.
2024-06-03 09:10:58,152 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4a_conv_2 weights set from pre-trained model.
2024-06-03 09:10:58,271 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4a_conv_shortcut weights set from pre-trained model.
2024-06-03 09:10:58,402 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4a_bn_2 weights set from pre-trained model.
2024-06-03 09:10:58,537 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4a_bn_shortcut weights set from pre-trained model.
2024-06-03 09:10:58,537 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer add_7 weights set from pre-trained model.
2024-06-03 09:10:58,689 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4b_conv_1 weights set from pre-trained model.
2024-06-03 09:10:58,903 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4b_bn_1 weights set from pre-trained model.
2024-06-03 09:10:59,074 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4b_conv_2 weights set from pre-trained model.
2024-06-03 09:10:59,198 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4b_bn_2 weights set from pre-trained model.
2024-06-03 09:10:59,198 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer add_8 weights set from pre-trained model.
2024-06-03 09:10:59,274 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.objectives.bbox_objective 78: Default L1 loss function will be used.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_1 (InputLayer)            (None, 3, 1920, 1440 0                                            
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 64, 960, 720) 9472        input_1[0][0]                    
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, 64, 960, 720) 256         conv1[0][0]                      
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 64, 960, 720) 0           bn_conv1[0][0]                   
__________________________________________________________________________________________________
block_1a_conv_1 (Conv2D)        (None, 64, 480, 360) 36928       activation_1[0][0]               
__________________________________________________________________________________________________
block_1a_bn_1 (BatchNormalizati (None, 64, 480, 360) 256         block_1a_conv_1[0][0]            
__________________________________________________________________________________________________
block_1a_relu_1 (Activation)    (None, 64, 480, 360) 0           block_1a_bn_1[0][0]              
__________________________________________________________________________________________________
block_1a_conv_2 (Conv2D)        (None, 64, 480, 360) 36928       block_1a_relu_1[0][0]            
__________________________________________________________________________________________________
block_1a_conv_shortcut (Conv2D) (None, 64, 480, 360) 4160        activation_1[0][0]               
__________________________________________________________________________________________________
block_1a_bn_2 (BatchNormalizati (None, 64, 480, 360) 256         block_1a_conv_2[0][0]            
__________________________________________________________________________________________________
block_1a_bn_shortcut (BatchNorm (None, 64, 480, 360) 256         block_1a_conv_shortcut[0][0]     
__________________________________________________________________________________________________
add_1 (Add)                     (None, 64, 480, 360) 0           block_1a_bn_2[0][0]              
                                                                 block_1a_bn_shortcut[0][0]       
__________________________________________________________________________________________________
block_1a_relu (Activation)      (None, 64, 480, 360) 0           add_1[0][0]                      
__________________________________________________________________________________________________
block_1b_conv_1 (Conv2D)        (None, 64, 480, 360) 36928       block_1a_relu[0][0]              
__________________________________________________________________________________________________
block_1b_bn_1 (BatchNormalizati (None, 64, 480, 360) 256         block_1b_conv_1[0][0]            
__________________________________________________________________________________________________
block_1b_relu_1 (Activation)    (None, 64, 480, 360) 0           block_1b_bn_1[0][0]              
__________________________________________________________________________________________________
block_1b_conv_2 (Conv2D)        (None, 64, 480, 360) 36928       block_1b_relu_1[0][0]            
__________________________________________________________________________________________________
block_1b_bn_2 (BatchNormalizati (None, 64, 480, 360) 256         block_1b_conv_2[0][0]            
__________________________________________________________________________________________________
add_2 (Add)                     (None, 64, 480, 360) 0           block_1b_bn_2[0][0]              
                                                                 block_1a_relu[0][0]              
__________________________________________________________________________________________________
block_1b_relu (Activation)      (None, 64, 480, 360) 0           add_2[0][0]                      
__________________________________________________________________________________________________
block_2a_conv_1 (Conv2D)        (None, 128, 240, 180 73856       block_1b_relu[0][0]              
__________________________________________________________________________________________________
block_2a_bn_1 (BatchNormalizati (None, 128, 240, 180 512         block_2a_conv_1[0][0]            
__________________________________________________________________________________________________
block_2a_relu_1 (Activation)    (None, 128, 240, 180 0           block_2a_bn_1[0][0]              
__________________________________________________________________________________________________
block_2a_conv_2 (Conv2D)        (None, 128, 240, 180 147584      block_2a_relu_1[0][0]            
__________________________________________________________________________________________________
block_2a_conv_shortcut (Conv2D) (None, 128, 240, 180 8320        block_1b_relu[0][0]              
__________________________________________________________________________________________________
block_2a_bn_2 (BatchNormalizati (None, 128, 240, 180 512         block_2a_conv_2[0][0]            
__________________________________________________________________________________________________
block_2a_bn_shortcut (BatchNorm (None, 128, 240, 180 512         block_2a_conv_shortcut[0][0]     
__________________________________________________________________________________________________
add_3 (Add)                     (None, 128, 240, 180 0           block_2a_bn_2[0][0]              
                                                                 block_2a_bn_shortcut[0][0]       
__________________________________________________________________________________________________
block_2a_relu (Activation)      (None, 128, 240, 180 0           add_3[0][0]                      
__________________________________________________________________________________________________
block_2b_conv_1 (Conv2D)        (None, 128, 240, 180 147584      block_2a_relu[0][0]              
__________________________________________________________________________________________________
block_2b_bn_1 (BatchNormalizati (None, 128, 240, 180 512         block_2b_conv_1[0][0]            
__________________________________________________________________________________________________
block_2b_relu_1 (Activation)    (None, 128, 240, 180 0           block_2b_bn_1[0][0]              
__________________________________________________________________________________________________
block_2b_conv_2 (Conv2D)        (None, 128, 240, 180 147584      block_2b_relu_1[0][0]            
__________________________________________________________________________________________________
block_2b_bn_2 (BatchNormalizati (None, 128, 240, 180 512         block_2b_conv_2[0][0]            
__________________________________________________________________________________________________
add_4 (Add)                     (None, 128, 240, 180 0           block_2b_bn_2[0][0]              
                                                                 block_2a_relu[0][0]              
__________________________________________________________________________________________________
block_2b_relu (Activation)      (None, 128, 240, 180 0           add_4[0][0]                      
__________________________________________________________________________________________________
block_3a_conv_1 (Conv2D)        (None, 256, 120, 90) 295168      block_2b_relu[0][0]              
__________________________________________________________________________________________________
block_3a_bn_1 (BatchNormalizati (None, 256, 120, 90) 1024        block_3a_conv_1[0][0]            
__________________________________________________________________________________________________
block_3a_relu_1 (Activation)    (None, 256, 120, 90) 0           block_3a_bn_1[0][0]              
__________________________________________________________________________________________________
block_3a_conv_2 (Conv2D)        (None, 256, 120, 90) 590080      block_3a_relu_1[0][0]            
__________________________________________________________________________________________________
block_3a_conv_shortcut (Conv2D) (None, 256, 120, 90) 33024       block_2b_relu[0][0]              
__________________________________________________________________________________________________
block_3a_bn_2 (BatchNormalizati (None, 256, 120, 90) 1024        block_3a_conv_2[0][0]            
__________________________________________________________________________________________________
block_3a_bn_shortcut (BatchNorm (None, 256, 120, 90) 1024        block_3a_conv_shortcut[0][0]     
__________________________________________________________________________________________________
add_5 (Add)                     (None, 256, 120, 90) 0           block_3a_bn_2[0][0]              
                                                                 block_3a_bn_shortcut[0][0]       
__________________________________________________________________________________________________
block_3a_relu (Activation)      (None, 256, 120, 90) 0           add_5[0][0]                      
__________________________________________________________________________________________________
block_3b_conv_1 (Conv2D)        (None, 256, 120, 90) 590080      block_3a_relu[0][0]              
__________________________________________________________________________________________________
block_3b_bn_1 (BatchNormalizati (None, 256, 120, 90) 1024        block_3b_conv_1[0][0]            
__________________________________________________________________________________________________
block_3b_relu_1 (Activation)    (None, 256, 120, 90) 0           block_3b_bn_1[0][0]              
__________________________________________________________________________________________________
block_3b_conv_2 (Conv2D)        (None, 256, 120, 90) 590080      block_3b_relu_1[0][0]            
__________________________________________________________________________________________________
block_3b_bn_2 (BatchNormalizati (None, 256, 120, 90) 1024        block_3b_conv_2[0][0]            
__________________________________________________________________________________________________
add_6 (Add)                     (None, 256, 120, 90) 0           block_3b_bn_2[0][0]              
                                                                 block_3a_relu[0][0]              
__________________________________________________________________________________________________
block_3b_relu (Activation)      (None, 256, 120, 90) 0           add_6[0][0]                      
__________________________________________________________________________________________________
block_4a_conv_1 (Conv2D)        (None, 512, 120, 90) 1180160     block_3b_relu[0][0]              
__________________________________________________________________________________________________
block_4a_bn_1 (BatchNormalizati (None, 512, 120, 90) 2048        block_4a_conv_1[0][0]            
__________________________________________________________________________________________________
block_4a_relu_1 (Activation)    (None, 512, 120, 90) 0           block_4a_bn_1[0][0]              
__________________________________________________________________________________________________
block_4a_conv_2 (Conv2D)        (None, 512, 120, 90) 2359808     block_4a_relu_1[0][0]            
__________________________________________________________________________________________________
block_4a_conv_shortcut (Conv2D) (None, 512, 120, 90) 131584      block_3b_relu[0][0]              
__________________________________________________________________________________________________
block_4a_bn_2 (BatchNormalizati (None, 512, 120, 90) 2048        block_4a_conv_2[0][0]            
__________________________________________________________________________________________________
block_4a_bn_shortcut (BatchNorm (None, 512, 120, 90) 2048        block_4a_conv_shortcut[0][0]     
__________________________________________________________________________________________________
add_7 (Add)                     (None, 512, 120, 90) 0           block_4a_bn_2[0][0]              
                                                                 block_4a_bn_shortcut[0][0]       
__________________________________________________________________________________________________
block_4a_relu (Activation)      (None, 512, 120, 90) 0           add_7[0][0]                      
__________________________________________________________________________________________________
block_4b_conv_1 (Conv2D)        (None, 512, 120, 90) 2359808     block_4a_relu[0][0]              
__________________________________________________________________________________________________
block_4b_bn_1 (BatchNormalizati (None, 512, 120, 90) 2048        block_4b_conv_1[0][0]            
__________________________________________________________________________________________________
block_4b_relu_1 (Activation)    (None, 512, 120, 90) 0           block_4b_bn_1[0][0]              
__________________________________________________________________________________________________
block_4b_conv_2 (Conv2D)        (None, 512, 120, 90) 2359808     block_4b_relu_1[0][0]            
__________________________________________________________________________________________________
block_4b_bn_2 (BatchNormalizati (None, 512, 120, 90) 2048        block_4b_conv_2[0][0]            
__________________________________________________________________________________________________
add_8 (Add)                     (None, 512, 120, 90) 0           block_4b_bn_2[0][0]              
                                                                 block_4a_relu[0][0]              
__________________________________________________________________________________________________
block_4b_relu (Activation)      (None, 512, 120, 90) 0           add_8[0][0]                      
__________________________________________________________________________________________________
output_bbox (Conv2D)            (None, 24, 120, 90)  12312       block_4b_relu[0][0]              
__________________________________________________________________________________________________
output_cov (Conv2D)             (None, 6, 120, 90)   3078        block_4b_relu[0][0]              
==================================================================================================
Total params: 11,210,718
Trainable params: 11,200,990
Non-trainable params: 9,728
__________________________________________________________________________________________________
2024-06-03 09:10:59,297 [TAO Toolkit] [INFO] root 2102: DetectNet V2 model built.
2024-06-03 09:10:59,298 [TAO Toolkit] [INFO] root 2102: Building rasterizer.
2024-06-03 09:10:59,298 [TAO Toolkit] [INFO] root 2102: Rasterizers built.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/training/training_proto_utilities.py:102: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.
...
INFO:tensorflow:Graph was finalized.
2024-06-03 09:11:10,956 [TAO Toolkit] [INFO] tensorflow 240: Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpjaunkjin/model.ckpt-120
2024-06-03 09:11:10,958 [TAO Toolkit] [INFO] tensorflow 1284: Restoring parameters from /tmp/tmpjaunkjin/model.ckpt-120
2024-06-03 09:11:11,739 [TAO Toolkit] [INFO] root 2102: Restoring from checkpoint failed. This is most likely due to a Variable name or other graph key that is missing from the checkpoint. Please ensure that you have not altered the graph expected based on the checkpoint. Original error:

2 root error(s) found.
  (0) Not found: Key cost_sums/barcode-bbox not found in checkpoint
	 [[node save/RestoreV2 (defined at /tensorflow_core/python/framework/ops.py:1748) ]]
  (1) Not found: Key cost_sums/barcode-bbox not found in checkpoint
	 [[node save/RestoreV2 (defined at /tensorflow_core/python/framework/ops.py:1748) ]]
	 [[save/RestoreV2/_793]]
0 successful operations.
0 derived errors ignored.
...
During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/saver.py", line 1300, in restore
    names_to_keys = object_graph_key_mapping(save_path)
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/saver.py", line 1618, in object_graph_key_mapping
    object_graph_string = reader.get_tensor(trackable.OBJECT_GRAPH_PROTO_KEY)
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/pywrap_tensorflow_internal.py", line 915, in get_tensor
    return CheckpointReader_GetTensor(self, compat.as_bytes(tensor_str))
tensorflow.python.framework.errors_impl.NotFoundError: Key _CHECKPOINTABLE_OBJECT_GRAPH not found in checkpoint
...
2024-06-03 09:11:12,086 [TAO Toolkit] [ERROR] tensorflow 70: ==================================
Object was never used (type <class 'tensorflow.python.framework.ops.Tensor'>):
<tf.Tensor 'IsVariableInitialized_302:0' shape=() dtype=bool>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "/usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/training/utilities.py", line 154, in get_singular_monitored_session
    return tf.train.SingularMonitoredSession(hooks=hooks,  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1100, in __init__
    super(SingularMonitoredSession, self).__init__(  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py", line 727, in __init__
    self._sess = self._coordinated_creator.create_session()  File "/usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/core/hooks/hooks.py", line 286, in begin
    self._variables_initialized.append(  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/util/tf_should_use.py", line 198, in wrapped
    return _add_should_use_warning(fn(*args, **kwargs))
==================================
Execution status: FAIL
2024-06-03 18:11:16,861 [TAO Toolkit] [INFO] nvidia_tao_cli.components.docker_handler.docker_handler 363: Stopping container.

It shows FAIL, but I cannot find any clue in the stack trace.
What might be the reason for the failure?
Is it because of my spec files or the KITTI files?

Is conversion successful? Can you share the log?

Can you share the command line? And also the training spec file?

@Morganh
Thank you for your reply.

Is conversion successful? Can you share the log?

This is the dataset_convert log.

!tao model detectnet_v2 dataset_convert -d /workspace/tao-experiments/detectnet_v2/specs/spec_tfrecords_kitti.txt \
                                        -o /workspace/tao-experiments/data/tfrecords_aikata
2024-06-03 18:04:13,384 [TAO Toolkit] [INFO] root 160: Registry: ['nvcr.io']
2024-06-03 18:04:13,447 [TAO Toolkit] [INFO] nvidia_tao_cli.components.instance_handler.local_instance 360: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:5.0.0-tf1.15.5
2024-06-03 18:04:13,675 [TAO Toolkit] [INFO] nvidia_tao_cli.components.docker_handler.docker_handler 301: Printing tty value True
2024-06-03 09:04:14.419146: I tensorflow/stream_executor/platform/default/dso_loader.cc:50] Successfully opened dynamic library libcudart.so.12
2024-06-03 09:04:14,453 [TAO Toolkit] [WARNING] tensorflow 40: Deprecation warnings have been disabled. Set TF_ENABLE_DEPRECATION_WARNINGS=1 to re-enable them.
Using TensorFlow backend.
2024-06-03 09:04:15,574 [TAO Toolkit] [WARNING] tensorflow 43: TensorFlow will not use sklearn by default. This improves performance in some cases. To enable sklearn export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:04:15,602 [TAO Toolkit] [WARNING] tensorflow 42: TensorFlow will not use Dask by default. This improves performance in some cases. To enable Dask export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:04:15,605 [TAO Toolkit] [WARNING] tensorflow 43: TensorFlow will not use Pandas by default. This improves performance in some cases. To enable Pandas export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:04:16,602 [TAO Toolkit] [WARNING] matplotlib 500: Matplotlib created a temporary config/cache directory at /tmp/matplotlib-pn4f6fy2 because the default path (/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.
2024-06-03 09:04:16,771 [TAO Toolkit] [INFO] matplotlib.font_manager 1633: generated new fontManager
WARNING:tensorflow:Deprecation warnings have been disabled. Set TF_ENABLE_DEPRECATION_WARNINGS=1 to re-enable them.
Using TensorFlow backend.
WARNING:tensorflow:TensorFlow will not use sklearn by default. This improves performance in some cases. To enable sklearn export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:04:18,053 [TAO Toolkit] [WARNING] tensorflow 43: TensorFlow will not use sklearn by default. This improves performance in some cases. To enable sklearn export the environment variable  TF_ALLOW_IOLIBS=1.
WARNING:tensorflow:TensorFlow will not use Dask by default. This improves performance in some cases. To enable Dask export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:04:18,080 [TAO Toolkit] [WARNING] tensorflow 42: TensorFlow will not use Dask by default. This improves performance in some cases. To enable Dask export the environment variable  TF_ALLOW_IOLIBS=1.
WARNING:tensorflow:TensorFlow will not use Pandas by default. This improves performance in some cases. To enable Pandas export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:04:18,083 [TAO Toolkit] [WARNING] tensorflow 43: TensorFlow will not use Pandas by default. This improves performance in some cases. To enable Pandas export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:04:18,443 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.build_converter 87: Instantiating a kitti converter
2024-06-03 09:04:18,445 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.kitti_converter_lib 176: Num images in
Train: 1035	Val: 258
2024-06-03 09:04:18,445 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.kitti_converter_lib 197: Validation data in partition 0. Hence, while choosing the validationset during training choose validation_fold 0.
2024-06-03 09:04:18,446 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 0
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/dataio/dataset_converter_lib.py:181: The name tf.python_io.TFRecordWriter is deprecated. Please use tf.io.TFRecordWriter instead.

2024-06-03 09:04:18,446 [TAO Toolkit] [WARNING] tensorflow 137: From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/dataio/dataset_converter_lib.py:181: The name tf.python_io.TFRecordWriter is deprecated. Please use tf.io.TFRecordWriter instead.

2024-06-03 09:04:18,457 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 1
2024-06-03 09:04:18,465 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 2
2024-06-03 09:04:18,473 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 3
2024-06-03 09:04:18,481 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 4
2024-06-03 09:04:18,489 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 5
2024-06-03 09:04:18,497 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 6
2024-06-03 09:04:18,505 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 7
2024-06-03 09:04:18,513 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 8
2024-06-03 09:04:18,521 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 9
2024-06-03 09:04:18,531 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 250: 
Wrote the following numbers of objects:
b'card': 126
b'name': 155
b'price': 157
b'barcode': 162
b'card7p': 33
b'unknown': 394

2024-06-03 09:04:18,531 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 0
2024-06-03 09:04:18,564 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 1
2024-06-03 09:04:18,599 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 2
2024-06-03 09:04:18,634 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 3
2024-06-03 09:04:18,669 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 4
2024-06-03 09:04:18,703 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 5
2024-06-03 09:04:18,736 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 6
2024-06-03 09:04:18,770 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 7
2024-06-03 09:04:18,804 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 8
2024-06-03 09:04:18,838 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 9
2024-06-03 09:04:18,873 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 250: 
Wrote the following numbers of objects:
b'card': 1316
b'name': 1477
b'barcode': 1494
b'price': 1465
b'card7p': 194
b'unknown': 176

2024-06-03 09:04:18,874 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 89: Cumulative object statistics
2024-06-03 09:04:18,874 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 250: 
Wrote the following numbers of objects:
b'card': 1442
b'name': 1632
b'price': 1622
b'barcode': 1656
b'card7p': 227
b'unknown': 570

2024-06-03 09:04:18,874 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 105: Class map. 
Label in GT: Label in tfrecords file 
b'card': b'card'
b'name': b'name'
b'price': b'price'
b'barcode': b'barcode'
b'card7p': b'card7p'
b'unknown': b'unknown'
For the dataset_config in the experiment_spec, please use labels in the tfrecords file, while writing the classmap.

2024-06-03 09:04:18,874 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 114: Tfrecords generation complete.
Execution status: PASS
2024-06-03 18:04:23,922 [TAO Toolkit] [INFO] nvidia_tao_cli.components.docker_handler.docker_handler 363: Stopping container.

Can you share the command line?

I’m sorry, I wrote several command lines earlier, but this is the command line I actually ran.

!tao model detectnet_v2 train -e $SPECS_DIR/spec_train_kitti.txt \
                        -r $USER_EXPERIMENT_DIR/experiment_dir_unpruned \
                        -k aikata \
                        -n resnet18_detector \
                        --gpus $NUM_GPUS

2024-06-04 03:07:12,293 [TAO Toolkit] [WARNING] tensorflow 137: From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/training/utilities.py:154: The name tf.train.SingularMonitoredSession is deprecated. Please use tf.compat.v1.train.SingularMonitoredSession instead.

INFO:tensorflow:Graph was finalized.
2024-06-04 03:07:13,725 [TAO Toolkit] [INFO] tensorflow 240: Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpkvz33fwr/model.ckpt-120
2024-06-04 03:07:13,728 [TAO Toolkit] [INFO] tensorflow 1284: Restoring parameters from /tmp/tmpkvz33fwr/model.ckpt-120
2024-06-04 03:07:14,562 [TAO Toolkit] [INFO] root 2102: Restoring from checkpoint failed. This is most likely due to a Variable name or other graph key that is missing from the checkpoint. Please ensure that you have not altered the graph expected based on the checkpoint. Original error:

2 root error(s) found.
  (0) Not found: Key cost_sums/barcode-bbox not found in checkpoint
	 [[node save/RestoreV2 (defined at /tensorflow_core/python/framework/ops.py:1748) ]]
  (1) Not found: Key cost_sums/barcode-bbox not found in checkpoint
	 [[node save/RestoreV2 (defined at /tensorflow_core/python/framework/ops.py:1748) ]]
	 [[save/RestoreV2/_301]]
0 successful operations.
0 derived errors ignored.

Original stack trace for 'save/RestoreV2':
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 1046, in <module>
    main()
  File "/decorator.py", line 232, in fun
    return caller(func, *(extras + args), **kw)
  File "/nvidia_tao_tf1/cv/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
    return_args = fn(*args, **kwargs)
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 1024, in main
    run_experiment(
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 887, in run_experiment
    train_gridbox(results_dir, experiment_spec, output_model_file_name, input_model_file_name,
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 760, in train_gridbox
    run_training_loop(experiment_spec, results_dir, gridbox_model, hooks, steps_per_epoch,
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 227, in run_training_loop
    with get_singular_monitored_session(keras_models=gridbox_model.get_keras_training_model(),
  File "/nvidia_tao_tf1/cv/detectnet_v2/training/utilities.py", line 154, in get_singular_monitored_session
    return tf.train.SingularMonitoredSession(hooks=hooks,
  File "/tensorflow_core/python/training/monitored_session.py", line 1100, in __init__
    super(SingularMonitoredSession, self).__init__(
  File "/tensorflow_core/python/training/monitored_session.py", line 727, in __init__
    self._sess = self._coordinated_creator.create_session()
  File "/tensorflow_core/python/training/monitored_session.py", line 878, in create_session
    self.tf_sess = self._session_creator.create_session()
  File "/tensorflow_core/python/training/monitored_session.py", line 638, in create_session
    self._scaffold.finalize()
  File "/tensorflow_core/python/training/monitored_session.py", line 229, in finalize
    self._saver = training_saver._get_saver_or_default()  # pylint: disable=protected-access
  File "/tensorflow_core/python/training/saver.py", line 599, in _get_saver_or_default
    saver = Saver(sharded=True, allow_empty=True)
  File "/tensorflow_core/python/training/saver.py", line 828, in __init__
    self.build()
  File "/tensorflow_core/python/training/saver.py", line 840, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "/tensorflow_core/python/training/saver.py", line 868, in _build
    self.saver_def = self._builder._build_internal(  # pylint: disable=protected-access
  File "/tensorflow_core/python/training/saver.py", line 501, in _build_internal
    restore_op = self._AddShardedRestoreOps(filename_tensor, per_device,
  File "/tensorflow_core/python/training/saver.py", line 375, in _AddShardedRestoreOps
    self._AddRestoreOps(
  File "/tensorflow_core/python/training/saver.py", line 327, in _AddRestoreOps
    all_tensors = self.bulk_restore(filename_tensor, saveables, preferred_shard,
  File "/tensorflow_core/python/training/saver.py", line 575, in bulk_restore
    return io_ops.restore_v2(filename_tensor, names, slices, dtypes)
  File "/tensorflow_core/python/ops/gen_io_ops.py", line 1693, in restore_v2
    _, _, _op = _op_def_lib._apply_op_helper(
  File "/tensorflow_core/python/framework/op_def_library.py", line 792, in _apply_op_helper
    op = g.create_op(op_type_name, inputs, dtypes=None, name=scope,
  File "/tensorflow_core/python/util/deprecation.py", line 513, in new_func
    return func(*args, **kwargs)
  File "/tensorflow_core/python/framework/ops.py", line 3356, in create_op
    return self._create_op_internal(op_type, inputs, dtypes, input_types, name,
  File "/tensorflow_core/python/framework/ops.py", line 3418, in _create_op_internal
    ret = Operation(
  File "/tensorflow_core/python/framework/ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py", line 1365, in _do_call
    return fn(*args)
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py", line 1349, in _run_fn
    return self._call_tf_sessionrun(options, feed_dict, fetch_list,
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py", line 1441, in _call_tf_sessionrun
    return tf_session.TF_SessionRun_wrapper(self._session, options, feed_dict,
tensorflow.python.framework.errors_impl.NotFoundError: 2 root error(s) found.
  (0) Not found: Key cost_sums/barcode-bbox not found in checkpoint
	 [[{{node save/RestoreV2}}]]
  (1) Not found: Key cost_sums/barcode-bbox not found in checkpoint
	 [[{{node save/RestoreV2}}]]
	 [[save/RestoreV2/_301]]
0 successful operations.
0 derived errors ignored.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/saver.py", line 1289, in restore
    sess.run(self.saver_def.restore_op_name,
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py", line 955, in run
    result = self._run(None, fetches, feed_dict, options_ptr,
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py", line 1179, in _run
    results = self._do_run(handle, final_targets, final_fetches,
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py", line 1358, in _do_run
    return self._do_call(_run_fn, feeds, fetches, targets, options,
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py", line 1384, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.NotFoundError: 2 root error(s) found.
  (0) Not found: Key cost_sums/barcode-bbox not found in checkpoint
	 [[node save/RestoreV2 (defined at /tensorflow_core/python/framework/ops.py:1748) ]]
  (1) Not found: Key cost_sums/barcode-bbox not found in checkpoint
	 [[node save/RestoreV2 (defined at /tensorflow_core/python/framework/ops.py:1748) ]]
	 [[save/RestoreV2/_301]]
0 successful operations.
0 derived errors ignored.

Original stack trace for 'save/RestoreV2':
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 1046, in <module>
    main()
  File "/decorator.py", line 232, in fun
    return caller(func, *(extras + args), **kw)
  File "/nvidia_tao_tf1/cv/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
    return_args = fn(*args, **kwargs)
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 1024, in main
    run_experiment(
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 887, in run_experiment
    train_gridbox(results_dir, experiment_spec, output_model_file_name, input_model_file_name,
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 760, in train_gridbox
    run_training_loop(experiment_spec, results_dir, gridbox_model, hooks, steps_per_epoch,
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 227, in run_training_loop
    with get_singular_monitored_session(keras_models=gridbox_model.get_keras_training_model(),
  File "/nvidia_tao_tf1/cv/detectnet_v2/training/utilities.py", line 154, in get_singular_monitored_session
    return tf.train.SingularMonitoredSession(hooks=hooks,
  File "/tensorflow_core/python/training/monitored_session.py", line 1100, in __init__
    super(SingularMonitoredSession, self).__init__(
  File "/tensorflow_core/python/training/monitored_session.py", line 727, in __init__
    self._sess = self._coordinated_creator.create_session()
  File "/tensorflow_core/python/training/monitored_session.py", line 878, in create_session
    self.tf_sess = self._session_creator.create_session()
  File "/tensorflow_core/python/training/monitored_session.py", line 638, in create_session
    self._scaffold.finalize()
  File "/tensorflow_core/python/training/monitored_session.py", line 229, in finalize
    self._saver = training_saver._get_saver_or_default()  # pylint: disable=protected-access
  File "/tensorflow_core/python/training/saver.py", line 599, in _get_saver_or_default
    saver = Saver(sharded=True, allow_empty=True)
  File "/tensorflow_core/python/training/saver.py", line 828, in __init__
    self.build()
  File "/tensorflow_core/python/training/saver.py", line 840, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "/tensorflow_core/python/training/saver.py", line 868, in _build
    self.saver_def = self._builder._build_internal(  # pylint: disable=protected-access
  File "/tensorflow_core/python/training/saver.py", line 501, in _build_internal
    restore_op = self._AddShardedRestoreOps(filename_tensor, per_device,
  File "/tensorflow_core/python/training/saver.py", line 375, in _AddShardedRestoreOps
    self._AddRestoreOps(
  File "/tensorflow_core/python/training/saver.py", line 327, in _AddRestoreOps
    all_tensors = self.bulk_restore(filename_tensor, saveables, preferred_shard,
  File "/tensorflow_core/python/training/saver.py", line 575, in bulk_restore
    return io_ops.restore_v2(filename_tensor, names, slices, dtypes)
  File "/tensorflow_core/python/ops/gen_io_ops.py", line 1693, in restore_v2
    _, _, _op = _op_def_lib._apply_op_helper(
  File "/tensorflow_core/python/framework/op_def_library.py", line 792, in _apply_op_helper
    op = g.create_op(op_type_name, inputs, dtypes=None, name=scope,
  File "/tensorflow_core/python/util/deprecation.py", line 513, in new_func
    return func(*args, **kwargs)
  File "/tensorflow_core/python/framework/ops.py", line 3356, in create_op
    return self._create_op_internal(op_type, inputs, dtypes, input_types, name,
  File "/tensorflow_core/python/framework/ops.py", line 3418, in _create_op_internal
    ret = Operation(
  File "/tensorflow_core/python/framework/ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()


2024-06-04 03:07:15,155 [TAO Toolkit] [ERROR] tensorflow 70: ==================================
Object was never used (type <class 'tensorflow.python.framework.ops.Tensor'>):
<tf.Tensor 'IsVariableInitialized_302:0' shape=() dtype=bool>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "/usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/training/utilities.py", line 154, in get_singular_monitored_session
    return tf.train.SingularMonitoredSession(hooks=hooks,  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1100, in __init__
    super(SingularMonitoredSession, self).__init__(  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py", line 727, in __init__
    self._sess = self._coordinated_creator.create_session()  File "/usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/core/hooks/hooks.py", line 286, in begin
    self._variables_initialized.append(  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/util/tf_should_use.py", line 198, in wrapped
    return _add_should_use_warning(fn(*args, **kwargs))
==================================
Execution status: FAIL
2024-06-04 12:07:18,760 [TAO Toolkit] [INFO] nvidia_tao_cli.components.docker_handler.docker_handler 363: Stopping container.

Could you also share the training spec file?

Here’s the training spec file.

!cat $LOCAL_SPECS_DIR/spec_train_kitti.txt
random_seed: 42
dataset_config {
  data_sources {
    tfrecords_path: "/workspace/tao-experiments/data/tfrecords_aikata/*"
    image_directory_path: "/workspace/tao-experiments/data/training"
  }
  image_extension: "jpg"
  target_class_mapping {
    key: "barcode"
    value: "barcode"
  }
  target_class_mapping {
    key: "name"
    value: "name"
  }
  target_class_mapping {
    key: "price"
    value: "price"
  }
  target_class_mapping {
    key: "card"
    value: "card"
  }
  target_class_mapping {
    key: "card7p"
    value: "card7p"
  }
  target_class_mapping {
    key: "unknown"
    value: "unknown"
  }
  validation_fold: 0
}
model_config {
  pretrained_model_file: "/workspace/tao-experiments/detectnet_v2_test/pretrained_resnet18/pretrained_detectnet_v2_vresnet18/resnet18.hdf5"
  num_layers: 18
  use_batch_norm: true
  objective_set {
    bbox {
      scale: 35.0
      offset: 0.5
    }
    cov {
    }
  }
  arch: "resnet"
}
augmentation_config {
  preprocessing {
    output_image_width: 1440
    output_image_height: 1920
    min_bbox_width: 1.0
    min_bbox_height: 1.0
    output_image_channel: 3
  }
  spatial_augmentation {
    hflip_probability: 0.5
    zoom_min: 1.0
    zoom_max: 1.0
    translate_max_x: 8.0
    translate_max_y: 8.0
  }
  color_augmentation {
    hue_rotation_max: 25.0
    saturation_shift_max: 0.20000000298
    contrast_scale_max: 0.10000000149
    contrast_center: 0.5
  }
}
postprocessing_config {
  target_class_config {
    key: "barcode"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00499999988824
        dbscan_eps: 0.20000000298
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "name"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00499999988824
        dbscan_eps: 0.15000000596
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "price"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "card"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "card7p"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "unknown"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
}
evaluation_config {
  validation_period_during_training: 10
  first_validation_epoch: 30
  minimum_detection_ground_truth_overlap {
    key: "barcode"
    value: 0.699999988079
  }
  minimum_detection_ground_truth_overlap {
    key: "name"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "price"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "card"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "card7p"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "unknown"
    value: 0.5
  }
  evaluation_box_config {
    key: "barcode"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "name"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "price"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "card"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "card7p"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "unknown"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  average_precision_mode: INTEGRATE
}
cost_function_config {
  target_classes {
    name: "barcode"
    class_weight: 1.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "name"
    class_weight: 8.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 1.0
    }
  }
  target_classes {
    name: "price"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "card"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "card7p"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "unknown"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  enable_autoweighting: false
  max_objective_weight: 0.999899983406
  min_objective_weight: 9.99999974738e-05
}
training_config {
  batch_size_per_gpu: 4
  num_epochs: 120
  learning_rate {
    soft_start_annealing_schedule {
      min_learning_rate: 5e-07
      max_learning_rate: 5e-05
      soft_start: 0.10000000149
      annealing: 0.699999988079
    }
  }
  regularizer {
    type: L1
    weight: 3.00000002618e-09
  }
  optimizer {
    adam {
      epsilon: 9.99999993923e-09
      beta1: 0.899999976158
      beta2: 0.999000012875
    }
  }
  cost_scaling {
    initial_exponent: 20.0
    increment: 0.005
    decrement: 1.0
  }
  visualizer{
    enabled: true
    num_images: 3
    scalar_logging_frequency: 50
    infrequent_logging_frequency: 5
    target_class_config {
      key: "barcode"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "name"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "price"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "card"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "card7p"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "unknown"
      value: {
        coverage_threshold: 0.005
      }
    }
    clearml_config{
      project: "TAO Toolkit ClearML Demo"
      task: "detectnet_v2_resnet18_clearml"
      tags: "detectnet_v2"
      tags: "training"
      tags: "resnet18"
      tags: "unpruned"
    }
    wandb_config{
      project: "TAO Toolkit Wandb Demo"
      name: "detectnet_v2_resnet18_wandb"
      tags: "detectnet_v2"
      tags: "training"
      tags: "resnet18"
      tags: "unpruned"
    }
  }
  checkpoint_interval: 10
}
bbox_rasterizer_config {
  target_class_config {
    key: "barcode"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 0.40000000596
      cov_radius_y: 0.40000000596
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "name"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "price"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "card"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "card7p"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "unknown"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  deadzone_radius: 0.400000154972
}

INFO

env: NUM_GPUS=1
env: USER_EXPERIMENT_DIR=/workspace/tao-experiments/detectnet_v2
env: DATA_DOWNLOAD_DIR=/workspace/tao-experiments/data
env: SPECS_DIR=/workspace/tao-experiments/detectnet_v2/specs
{
    "Mounts": [
        {
            "source": "/home/ym7/tao-jupyter/getting_started_v5.3.0/notebooks/tao_launcher_starter_kit/detectnet_v2_test",
            "destination": "/workspace/tao-experiments"
        },
        {
            "source": "/home/ym7/tao-jupyter/getting_started_v5.3.0/notebooks/tao_launcher_starter_kit/detectnet_v2_test/specs",
            "destination": "/workspace/tao-experiments/detectnet_v2/specs"
        }
    ],
    "DockerOptions": {
        "user": "1000:1000"
    }
}

!ls -rlt $LOCAL_EXPERIMENT_DIR/pretrained_resnet18/pretrained_detectnet_v2_vresnet18
-rw-r--r-- 1 ym7 ym7 93345248 May 19 12:48 resnet18.hdf5

$ ll detectnet_v2_test/pretrained_resnet18/pretrained_detectnet_v2_vresnet18/
-rw-r--r-- 1 ym7 ym7 93345248 May 19 12:48 resnet18.hdf5

$ ll data/tfrecords_aikata
-rw-r--r-- 1 ym7 ym7  21611 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00000-of-00010
-rw-r--r-- 1 ym7 ym7  19896 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00001-of-00010
-rw-r--r-- 1 ym7 ym7  19934 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00002-of-00010
-rw-r--r-- 1 ym7 ym7  19034 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00003-of-00010
-rw-r--r-- 1 ym7 ym7  20116 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00004-of-00010
-rw-r--r-- 1 ym7 ym7  19864 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00005-of-00010
-rw-r--r-- 1 ym7 ym7  20102 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00006-of-00010
-rw-r--r-- 1 ym7 ym7  21005 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00007-of-00010
-rw-r--r-- 1 ym7 ym7  20952 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00008-of-00010
-rw-r--r-- 1 ym7 ym7  25980 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00009-of-00010
-rw-r--r-- 1 ym7 ym7  91719 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00000-of-00010
-rw-r--r-- 1 ym7 ym7  94916 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00001-of-00010
-rw-r--r-- 1 ym7 ym7  94321 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00002-of-00010
-rw-r--r-- 1 ym7 ym7  98650 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00003-of-00010
-rw-r--r-- 1 ym7 ym7  92281 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00004-of-00010
-rw-r--r-- 1 ym7 ym7  92103 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00005-of-00010
-rw-r--r-- 1 ym7 ym7  96474 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00006-of-00010
-rw-r--r-- 1 ym7 ym7  95012 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00007-of-00010
-rw-r--r-- 1 ym7 ym7  97688 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00008-of-00010
-rw-r--r-- 1 ym7 ym7 100730 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00009-of-00010

$ ll data/training/
drwxrwxrwx 2 ym7 ym7 237568 Jun  1  2012 image_2/
drwxrwxrwx 2 ym7 ym7  61440 Jun  3 18:03 images_aikata/
drwxrwxrwx 2 ym7 ym7  32768 May 19 18:48 images_val/
drwxrwxrwx 2 ym7 ym7 245760 May 21  2015 label_2/
drwxrwxrwx 2 ym7 ym7  73728 Jun  3 18:01 labels_aikata/
drwxrwxrwx 2 ym7 ym7  36864 May 19 18:48 labels_val/

$ ll data
drwxrwxrwx 4 ym7 ym7        4096 Jun  3 15:42 testing/
drwxrwxrwx 2 ym7 ym7        4096 May 17 07:06 test_samples/
drwxrwxrwx 2 ym7 ym7        4096 Jun  3 18:05 tfrecords_aikata/
drwxrwxrwx 8 ym7 ym7        4096 May 31 11:09 training/

@Morganh
I am sorry; after I set up the environment again step by step, the problem seems to be solved.

It seems tao model detectnet_v2 train is now running, although it eventually runs out of GPU memory.

2024-06-04 03:58:54,445 [TAO Toolkit] [INFO] root 2102: Training graph built.
2024-06-04 03:58:54,445 [TAO Toolkit] [INFO] root 2102: Building validation graph.
2024-06-04 03:58:54,446 [TAO Toolkit] [INFO] nvidia_tao_tf1.blocks.multi_source_loader.data_loader 175: Serial augmentation enabled = False
2024-06-04 03:58:54,446 [TAO Toolkit] [INFO] nvidia_tao_tf1.blocks.multi_source_loader.data_loader 177: Pseudo sharding enabled = False
2024-06-04 03:58:54,446 [TAO Toolkit] [INFO] nvidia_tao_tf1.blocks.multi_source_loader.data_loader 269: Max Image Dimensions (all sources): (0, 0)
2024-06-04 03:58:54,446 [TAO Toolkit] [INFO] nvidia_tao_tf1.blocks.multi_source_loader.data_loader 380: number of cpus: 16, io threads: 32, compute threads: 16, buffered batches: 4
2024-06-04 03:58:54,446 [TAO Toolkit] [INFO] nvidia_tao_tf1.blocks.multi_source_loader.data_loader 387: total dataset size 258, number of sources: 1, batch size per gpu: 1, steps: 258
2024-06-04 03:58:54,464 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataloader.default_dataloader 546: Bounding box coordinates were detected in the input specification! Bboxes will be automatically converted to polygon coordinates.
2024-06-04 03:58:54,598 [TAO Toolkit] [INFO] nvidia_tao_tf1.blocks.multi_source_loader.data_loader 409: shuffle: False - shard 0 of 1
2024-06-04 03:58:54,601 [TAO Toolkit] [INFO] nvidia_tao_tf1.blocks.multi_source_loader.data_loader 479: sampling 1 datasets with weights:
2024-06-04 03:58:54,601 [TAO Toolkit] [INFO] nvidia_tao_tf1.blocks.multi_source_loader.data_loader 481: source: 0 weight: 1.000000
2024-06-04 03:58:54,737 [TAO Toolkit] [INFO] __main__ 591: Found 258 samples in validation set
2024-06-04 03:58:54,737 [TAO Toolkit] [INFO] root 2102: Rasterizing tensors.
2024-06-04 03:58:54,854 [TAO Toolkit] [INFO] root 2102: Tensors rasterized.
2024-06-04 03:58:55,071 [TAO Toolkit] [INFO] root 2102: Validation graph built.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/tfhooks/validation_hook.py:58: The name tf.summary.FileWriterCache is deprecated. Please use tf.compat.v1.summary.FileWriterCache instead.

2024-06-04 03:58:55,072 [TAO Toolkit] [WARNING] tensorflow 137: From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/tfhooks/validation_hook.py:58: The name tf.summary.FileWriterCache is deprecated. Please use tf.compat.v1.summary.FileWriterCache instead.

2024-06-04 03:58:56,296 [TAO Toolkit] [INFO] root 2102: Running training loop.
2024-06-04 03:58:56,296 [TAO Toolkit] [INFO] __main__ 135: Checkpoint interval: 10
2024-06-04 03:58:56,297 [TAO Toolkit] [INFO] __main__ 175: Scalars logged at every 20 steps
2024-06-04 03:58:56,297 [TAO Toolkit] [INFO] __main__ 180: Images logged at every 5175 steps
INFO:tensorflow:Create CheckpointSaverHook.
2024-06-04 03:58:56,299 [TAO Toolkit] [INFO] tensorflow 541: Create CheckpointSaverHook.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/training/utilities.py:154: The name tf.train.SingularMonitoredSession is deprecated. Please use tf.compat.v1.train.SingularMonitoredSession instead.

2024-06-04 03:58:56,587 [TAO Toolkit] [WARNING] tensorflow 137: From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/training/utilities.py:154: The name tf.train.SingularMonitoredSession is deprecated. Please use tf.compat.v1.train.SingularMonitoredSession instead.

INFO:tensorflow:Graph was finalized.
2024-06-04 03:58:58,001 [TAO Toolkit] [INFO] tensorflow 240: Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpuuxekwqa/model.ckpt-0
2024-06-04 03:58:58,003 [TAO Toolkit] [INFO] tensorflow 1284: Restoring parameters from /tmp/tmpuuxekwqa/model.ckpt-0
INFO:tensorflow:Running local_init_op.
2024-06-04 03:58:59,364 [TAO Toolkit] [INFO] tensorflow 500: Running local_init_op.
INFO:tensorflow:Done running local_init_op.
2024-06-04 03:58:59,850 [TAO Toolkit] [INFO] tensorflow 502: Done running local_init_op.
INFO:tensorflow:Saving checkpoints for step-0.
2024-06-04 03:59:03,521 [TAO Toolkit] [INFO] tensorflow 81: Saving checkpoints for step-0.
2024-06-04 03:59:17,147 [TAO Toolkit] [INFO] nvidia_tao_tf1.core.hooks.hooks 224: Overwritten Keras model: /workspace/tao-experiments/detectnet_v2_test/experiment_dir_unpruned/model.epoch-0.hdf5.
2024-06-04 04:12:15,938 [TAO Toolkit] [INFO] root 2102: Saving trained model.
2024-06-04 04:12:16,101 [TAO Toolkit] [INFO] root 2102: Model saved.
2024-06-04 04:12:16,284 [TAO Toolkit] [ERROR] __main__ 1050: Ran out of GPU memory, please lower the batch size, use a smaller input resolution, or use a smaller backbone.
2024-06-04 04:12:16,284 [TAO Toolkit] [INFO] root 2102: Ran out of GPU memory, please lower the batch size, use a smaller input resolution, or use a smaller backbone.
Execution status: FAIL
2024-06-04 13:12:20,790 [TAO Toolkit] [INFO] nvidia_tao_cli.components.docker_handler.docker_handler 363: Stopping container.

I’ll try this again on another GPU machine, and then I can judge whether the problem is solved or not.

Thanks.
I was able to train successfully with tao model detectnet_v2 train.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.