Executing tao model detectnet_v2 train fails

uma7 · June 3, 2024, 1:34pm

I am sorry to ask yet again, but I am still stuck with TAO detectnet_v2.

I have a large amount of annotation data in YOLO format, together with the corresponding image data.

3 0.498760 0.490741 0.859623 0.768519
0 0.728671 0.213955 0.252976 0.082672
1 0.478423 0.380622 0.730655 0.085317
2 0.544643 0.530093 0.416667 0.184524

However, I was told that TAO Toolkit cannot read YOLO format data, so I converted the YOLO format data to KITTI format data.

card 0 0 0 99.28583999999995 204.44448 1337.14296 1680.0009599999998 0 0 0 0 0 0 0 
barcode 0 0 0 867.1435199999999 331.42848000000004 1231.42896 490.15871999999996 0 0 0 0 0 0 0
name 0 0 0 162.85751999999994 648.8899200000001 1215.00072 812.69856 0 0 0 0 0 0 0
price 0 0 0 484.28567999999996 840.63552 1084.2861599999999 1194.9216 0 0 0 0 0 0 0

Then I used the TAO dataset converter tool to convert the KITTI data to TFRecords files.

# Spec file for the dataset converter (KITTI -> TFRecords)
!cat $LOCAL_SPECS_DIR/spec_tfrecords_kitti.txt
kitti_config {
  root_directory_path: "/workspace/tao-experiments/data/"
  image_dir_name: "training/images_aikata"
  label_dir_name: "training/labels_aikata"
  image_extension: ".jpg"
  partition_mode: "random"
  num_partitions: 2
  val_split: 20
  num_shards: 10
}
image_directory_path: "/workspace/tao-experiments/data/"
target_class_mapping {
  key: "barcode"
  value: "barcode"
}
target_class_mapping {
  key: "name"
  value: "name"
}
target_class_mapping {
  key: "price"
  value: "price"
}
target_class_mapping {
  key: "card"
  value: "card"
}
target_class_mapping {
  key: "card7p"
  value: "card7p"
}
target_class_mapping {
  key: "unknown"
  value: "unknown"
}

# Spec file for training
!cat $LOCAL_SPECS_DIR/spec_train_kitti.txt
random_seed: 42
dataset_config {
  data_sources {
    tfrecords_path: "/workspace/tao-experiments/data/tfrecords_aikata/*"
    image_directory_path: "/workspace/tao-experiments/data/training"
  }
  image_extension: "jpg"
  target_class_mapping {
    key: "barcode"
    value: "barcode"
  }
  target_class_mapping {
    key: "name"
    value: "name"
  }
  target_class_mapping {
    key: "price"
    value: "price"
  }
  target_class_mapping {
    key: "card"
    value: "card"
  }
  target_class_mapping {
    key: "card7p"
    value: "card7p"
  }
  target_class_mapping {
    key: "unknown"
    value: "unknown"
  }
  validation_fold: 0
}
model_config {
  pretrained_model_file: "/workspace/tao-experiments/detectnet_v2_test/pretrained_resnet18/pretrained_detectnet_v2_vresnet18/resnet18.hdf5"
  num_layers: 18
  use_batch_norm: true
  objective_set {
    bbox {
      scale: 35.0
      offset: 0.5
    }
    cov {
    }
  }
  arch: "resnet"
}
augmentation_config {
  preprocessing {
    output_image_width: 1440
    output_image_height: 1920
    min_bbox_width: 1.0
    min_bbox_height: 1.0
    output_image_channel: 3
  }
  spatial_augmentation {
    hflip_probability: 0.5
    zoom_min: 1.0
    zoom_max: 1.0
    translate_max_x: 8.0
    translate_max_y: 8.0
  }
  color_augmentation {
    hue_rotation_max: 25.0
    saturation_shift_max: 0.20000000298
    contrast_scale_max: 0.10000000149
    contrast_center: 0.5
  }
}
postprocessing_config {
  target_class_config {
    key: "barcode"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00499999988824
        dbscan_eps: 0.20000000298
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "name"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00499999988824
        dbscan_eps: 0.15000000596
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "price"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "card"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "card7p"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "unknown"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
}
evaluation_config {
  validation_period_during_training: 10
  first_validation_epoch: 30
  minimum_detection_ground_truth_overlap {
    key: "barcode"
    value: 0.699999988079
  }
  minimum_detection_ground_truth_overlap {
    key: "name"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "price"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "card"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "card7p"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "unknown"
    value: 0.5
  }
  evaluation_box_config {
    key: "barcode"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "name"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "price"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "card"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "card7p"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "unknown"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  average_precision_mode: INTEGRATE
}
cost_function_config {
  target_classes {
    name: "barcode"
    class_weight: 1.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "name"
    class_weight: 8.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 1.0
    }
  }
  target_classes {
    name: "price"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "card"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "card7p"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "unknown"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  enable_autoweighting: false
  max_objective_weight: 0.999899983406
  min_objective_weight: 9.99999974738e-05
}
training_config {
  batch_size_per_gpu: 4
  num_epochs: 120
  learning_rate {
    soft_start_annealing_schedule {
      min_learning_rate: 5e-07
      max_learning_rate: 5e-05
      soft_start: 0.10000000149
      annealing: 0.699999988079
    }
  }
  regularizer {
    type: L1
    weight: 3.00000002618e-09
  }
  optimizer {
    adam {
      epsilon: 9.99999993923e-09
      beta1: 0.899999976158
      beta2: 0.999000012875
    }
  }
  cost_scaling {
    initial_exponent: 20.0
    increment: 0.005
    decrement: 1.0
  }
  visualizer{
    enabled: true
    num_images: 3
    scalar_logging_frequency: 50
    infrequent_logging_frequency: 5
    target_class_config {
      key: "barcode"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "name"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "price"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "card"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "card7p"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "unknown"
      value: {
        coverage_threshold: 0.005
      }
    }
    clearml_config{
      project: "TAO Toolkit ClearML Demo"
      task: "detectnet_v2_resnet18_clearml"
      tags: "detectnet_v2"
      tags: "training"
      tags: "resnet18"
      tags: "unpruned"
    }
    wandb_config{
      project: "TAO Toolkit Wandb Demo"
      name: "detectnet_v2_resnet18_wandb"
      tags: "detectnet_v2"
      tags: "training"
      tags: "resnet18"
      tags: "unpruned"
    }
  }
  checkpoint_interval: 10
}
bbox_rasterizer_config {
  target_class_config {
    key: "barcode"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 0.40000000596
      cov_radius_y: 0.40000000596
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "name"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "price"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "card"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "card7p"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "unknown"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  deadzone_radius: 0.400000154972
}

# TAO Dataset Converter
!tao model detectnet_v2 dataset_convert -d /workspace/tao-experiments/detectnet_v2/specs/spec_tfrecords_kitti.txt \
                                        -o /workspace/tao-experiments/data/tfrecords_aikata
...                              
2024-06-03 09:04:18,874 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 105: Class map. 
Label in GT: Label in tfrecords file 
b'card': b'card'
b'name': b'name'
b'price': b'price'
b'barcode': b'barcode'
b'card7p': b'card7p'
b'unknown': b'unknown'
For the dataset_config in the experiment_spec, please use labels in the tfrecords file, while writing the classmap.

2024-06-03 09:04:18,874 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 114: Tfrecords generation complete.
Execution status: PASS
2024-06-03 18:04:23,922 [TAO Toolkit] [INFO] nvidia_tao_cli.components.docker_handler.docker_handler 363: Stopping container.

So I thought the dataset was ready for training with TAO and ran the following command, but it FAILED.

!tao model detectnet_v2 train -e $SPECS_DIR/spec_train_kitti.txt \
                        -r $USER_EXPERIMENT_DIR/experiment_dir_unpruned \
                        -k aikata \
                        -n resnet18_detector \
                        --gpus $NUM_GPUS
2024-06-03 18:10:45,022 [TAO Toolkit] [INFO] root 160: Registry: ['nvcr.io']
2024-06-03 18:10:45,086 [TAO Toolkit] [INFO] nvidia_tao_cli.components.instance_handler.local_instance 360: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:5.0.0-tf1.15.5
2024-06-03 18:10:45,321 [TAO Toolkit] [INFO] nvidia_tao_cli.components.docker_handler.docker_handler 301: Printing tty value True
2024-06-03 09:10:46.078291: I tensorflow/stream_executor/platform/default/dso_loader.cc:50] Successfully opened dynamic library libcudart.so.12
2024-06-03 09:10:46,128 [TAO Toolkit] [WARNING] tensorflow 40: Deprecation warnings have been disabled. Set TF_ENABLE_DEPRECATION_WARNINGS=1 to re-enable them.
Using TensorFlow backend.
2024-06-03 09:10:47,178 [TAO Toolkit] [WARNING] tensorflow 43: TensorFlow will not use sklearn by default. This improves performance in some cases. To enable sklearn export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:10:47,205 [TAO Toolkit] [WARNING] tensorflow 42: TensorFlow will not use Dask by default. This improves performance in some cases. To enable Dask export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:10:47,209 [TAO Toolkit] [WARNING] tensorflow 43: TensorFlow will not use Pandas by default. This improves performance in some cases. To enable Pandas export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:10:48,220 [TAO Toolkit] [WARNING] matplotlib 500: Matplotlib created a temporary config/cache directory at /tmp/matplotlib-9thmcfat because the default path (/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.
2024-06-03 09:10:48,390 [TAO Toolkit] [INFO] matplotlib.font_manager 1633: generated new fontManager
WARNING:tensorflow:Deprecation warnings have been disabled. Set TF_ENABLE_DEPRECATION_WARNINGS=1 to re-enable them.
Using TensorFlow backend.
WARNING:tensorflow:TensorFlow will not use sklearn by default. This improves performance in some cases. To enable sklearn export the environment variable  TF_ALLOW_IOLIBS=1.
...
2024-06-03 09:10:53,326 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 133: Loading weights from pretrained model file. /workspace/tao-experiments/detectnet_v2_test/pretrained_resnet18/pretrained_detectnet_v2_vresnet18/resnet18.hdf5
2024-06-03 09:10:53,326 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer input_1 weights set from pre-trained model.
2024-06-03 09:10:53,446 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer conv1 weights set from pre-trained model.
2024-06-03 09:10:53,560 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer bn_conv1 weights set from pre-trained model.
2024-06-03 09:10:53,560 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer activation_1 weights set from pre-trained model.
2024-06-03 09:10:53,675 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1a_conv_1 weights set from pre-trained model.
2024-06-03 09:10:53,815 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1a_bn_1 weights set from pre-trained model.
2024-06-03 09:10:53,962 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1a_conv_2 weights set from pre-trained model.
2024-06-03 09:10:54,099 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1a_conv_shortcut weights set from pre-trained model.
2024-06-03 09:10:54,225 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1a_bn_2 weights set from pre-trained model.
2024-06-03 09:10:54,342 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1a_bn_shortcut weights set from pre-trained model.
2024-06-03 09:10:54,342 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer add_1 weights set from pre-trained model.
2024-06-03 09:10:54,455 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1b_conv_1 weights set from pre-trained model.
2024-06-03 09:10:54,632 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1b_bn_1 weights set from pre-trained model.
2024-06-03 09:10:54,770 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1b_conv_2 weights set from pre-trained model.
2024-06-03 09:10:54,885 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_1b_bn_2 weights set from pre-trained model.
2024-06-03 09:10:54,885 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer add_2 weights set from pre-trained model.
2024-06-03 09:10:54,996 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2a_conv_1 weights set from pre-trained model.
2024-06-03 09:10:55,111 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2a_bn_1 weights set from pre-trained model.
2024-06-03 09:10:55,225 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2a_conv_2 weights set from pre-trained model.
2024-06-03 09:10:55,337 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2a_conv_shortcut weights set from pre-trained model.
2024-06-03 09:10:55,454 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2a_bn_2 weights set from pre-trained model.
2024-06-03 09:10:55,572 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2a_bn_shortcut weights set from pre-trained model.
2024-06-03 09:10:55,572 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer add_3 weights set from pre-trained model.
2024-06-03 09:10:55,685 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2b_conv_1 weights set from pre-trained model.
2024-06-03 09:10:55,808 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2b_bn_1 weights set from pre-trained model.
2024-06-03 09:10:55,931 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2b_conv_2 weights set from pre-trained model.
2024-06-03 09:10:56,051 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_2b_bn_2 weights set from pre-trained model.
2024-06-03 09:10:56,051 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer add_4 weights set from pre-trained model.
2024-06-03 09:10:56,168 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3a_conv_1 weights set from pre-trained model.
2024-06-03 09:10:56,360 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3a_bn_1 weights set from pre-trained model.
2024-06-03 09:10:56,570 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3a_conv_2 weights set from pre-trained model.
2024-06-03 09:10:56,770 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3a_conv_shortcut weights set from pre-trained model.
2024-06-03 09:10:56,969 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3a_bn_2 weights set from pre-trained model.
2024-06-03 09:10:57,139 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3a_bn_shortcut weights set from pre-trained model.
2024-06-03 09:10:57,139 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer add_5 weights set from pre-trained model.
2024-06-03 09:10:57,278 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3b_conv_1 weights set from pre-trained model.
2024-06-03 09:10:57,399 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3b_bn_1 weights set from pre-trained model.
2024-06-03 09:10:57,534 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3b_conv_2 weights set from pre-trained model.
2024-06-03 09:10:57,749 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_3b_bn_2 weights set from pre-trained model.
2024-06-03 09:10:57,749 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer add_6 weights set from pre-trained model.
2024-06-03 09:10:57,887 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4a_conv_1 weights set from pre-trained model.
2024-06-03 09:10:58,022 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4a_bn_1 weights set from pre-trained model.
2024-06-03 09:10:58,152 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4a_conv_2 weights set from pre-trained model.
2024-06-03 09:10:58,271 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4a_conv_shortcut weights set from pre-trained model.
2024-06-03 09:10:58,402 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4a_bn_2 weights set from pre-trained model.
2024-06-03 09:10:58,537 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4a_bn_shortcut weights set from pre-trained model.
2024-06-03 09:10:58,537 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer add_7 weights set from pre-trained model.
2024-06-03 09:10:58,689 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4b_conv_1 weights set from pre-trained model.
2024-06-03 09:10:58,903 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4b_bn_1 weights set from pre-trained model.
2024-06-03 09:10:59,074 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4b_conv_2 weights set from pre-trained model.
2024-06-03 09:10:59,198 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer block_4b_bn_2 weights set from pre-trained model.
2024-06-03 09:10:59,198 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.model.detectnet_model 142: Layer add_8 weights set from pre-trained model.
2024-06-03 09:10:59,274 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.objectives.bbox_objective 78: Default L1 loss function will be used.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_1 (InputLayer)            (None, 3, 1920, 1440 0                                            
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 64, 960, 720) 9472        input_1[0][0]                    
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, 64, 960, 720) 256         conv1[0][0]                      
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 64, 960, 720) 0           bn_conv1[0][0]                   
__________________________________________________________________________________________________
block_1a_conv_1 (Conv2D)        (None, 64, 480, 360) 36928       activation_1[0][0]               
__________________________________________________________________________________________________
block_1a_bn_1 (BatchNormalizati (None, 64, 480, 360) 256         block_1a_conv_1[0][0]            
__________________________________________________________________________________________________
block_1a_relu_1 (Activation)    (None, 64, 480, 360) 0           block_1a_bn_1[0][0]              
__________________________________________________________________________________________________
block_1a_conv_2 (Conv2D)        (None, 64, 480, 360) 36928       block_1a_relu_1[0][0]            
__________________________________________________________________________________________________
block_1a_conv_shortcut (Conv2D) (None, 64, 480, 360) 4160        activation_1[0][0]               
__________________________________________________________________________________________________
block_1a_bn_2 (BatchNormalizati (None, 64, 480, 360) 256         block_1a_conv_2[0][0]            
__________________________________________________________________________________________________
block_1a_bn_shortcut (BatchNorm (None, 64, 480, 360) 256         block_1a_conv_shortcut[0][0]     
__________________________________________________________________________________________________
add_1 (Add)                     (None, 64, 480, 360) 0           block_1a_bn_2[0][0]              
                                                                 block_1a_bn_shortcut[0][0]       
__________________________________________________________________________________________________
block_1a_relu (Activation)      (None, 64, 480, 360) 0           add_1[0][0]                      
__________________________________________________________________________________________________
block_1b_conv_1 (Conv2D)        (None, 64, 480, 360) 36928       block_1a_relu[0][0]              
__________________________________________________________________________________________________
block_1b_bn_1 (BatchNormalizati (None, 64, 480, 360) 256         block_1b_conv_1[0][0]            
__________________________________________________________________________________________________
block_1b_relu_1 (Activation)    (None, 64, 480, 360) 0           block_1b_bn_1[0][0]              
__________________________________________________________________________________________________
block_1b_conv_2 (Conv2D)        (None, 64, 480, 360) 36928       block_1b_relu_1[0][0]            
__________________________________________________________________________________________________
block_1b_bn_2 (BatchNormalizati (None, 64, 480, 360) 256         block_1b_conv_2[0][0]            
__________________________________________________________________________________________________
add_2 (Add)                     (None, 64, 480, 360) 0           block_1b_bn_2[0][0]              
                                                                 block_1a_relu[0][0]              
__________________________________________________________________________________________________
block_1b_relu (Activation)      (None, 64, 480, 360) 0           add_2[0][0]                      
__________________________________________________________________________________________________
block_2a_conv_1 (Conv2D)        (None, 128, 240, 180 73856       block_1b_relu[0][0]              
__________________________________________________________________________________________________
block_2a_bn_1 (BatchNormalizati (None, 128, 240, 180 512         block_2a_conv_1[0][0]            
__________________________________________________________________________________________________
block_2a_relu_1 (Activation)    (None, 128, 240, 180 0           block_2a_bn_1[0][0]              
__________________________________________________________________________________________________
block_2a_conv_2 (Conv2D)        (None, 128, 240, 180 147584      block_2a_relu_1[0][0]            
__________________________________________________________________________________________________
block_2a_conv_shortcut (Conv2D) (None, 128, 240, 180 8320        block_1b_relu[0][0]              
__________________________________________________________________________________________________
block_2a_bn_2 (BatchNormalizati (None, 128, 240, 180 512         block_2a_conv_2[0][0]            
__________________________________________________________________________________________________
block_2a_bn_shortcut (BatchNorm (None, 128, 240, 180 512         block_2a_conv_shortcut[0][0]     
__________________________________________________________________________________________________
add_3 (Add)                     (None, 128, 240, 180 0           block_2a_bn_2[0][0]              
                                                                 block_2a_bn_shortcut[0][0]       
__________________________________________________________________________________________________
block_2a_relu (Activation)      (None, 128, 240, 180 0           add_3[0][0]                      
__________________________________________________________________________________________________
block_2b_conv_1 (Conv2D)        (None, 128, 240, 180 147584      block_2a_relu[0][0]              
__________________________________________________________________________________________________
block_2b_bn_1 (BatchNormalizati (None, 128, 240, 180 512         block_2b_conv_1[0][0]            
__________________________________________________________________________________________________
block_2b_relu_1 (Activation)    (None, 128, 240, 180 0           block_2b_bn_1[0][0]              
__________________________________________________________________________________________________
block_2b_conv_2 (Conv2D)        (None, 128, 240, 180 147584      block_2b_relu_1[0][0]            
__________________________________________________________________________________________________
block_2b_bn_2 (BatchNormalizati (None, 128, 240, 180 512         block_2b_conv_2[0][0]            
__________________________________________________________________________________________________
add_4 (Add)                     (None, 128, 240, 180 0           block_2b_bn_2[0][0]              
                                                                 block_2a_relu[0][0]              
__________________________________________________________________________________________________
block_2b_relu (Activation)      (None, 128, 240, 180 0           add_4[0][0]                      
__________________________________________________________________________________________________
block_3a_conv_1 (Conv2D)        (None, 256, 120, 90) 295168      block_2b_relu[0][0]              
__________________________________________________________________________________________________
block_3a_bn_1 (BatchNormalizati (None, 256, 120, 90) 1024        block_3a_conv_1[0][0]            
__________________________________________________________________________________________________
block_3a_relu_1 (Activation)    (None, 256, 120, 90) 0           block_3a_bn_1[0][0]              
__________________________________________________________________________________________________
block_3a_conv_2 (Conv2D)        (None, 256, 120, 90) 590080      block_3a_relu_1[0][0]            
__________________________________________________________________________________________________
block_3a_conv_shortcut (Conv2D) (None, 256, 120, 90) 33024       block_2b_relu[0][0]              
__________________________________________________________________________________________________
block_3a_bn_2 (BatchNormalizati (None, 256, 120, 90) 1024        block_3a_conv_2[0][0]            
__________________________________________________________________________________________________
block_3a_bn_shortcut (BatchNorm (None, 256, 120, 90) 1024        block_3a_conv_shortcut[0][0]     
__________________________________________________________________________________________________
add_5 (Add)                     (None, 256, 120, 90) 0           block_3a_bn_2[0][0]              
                                                                 block_3a_bn_shortcut[0][0]       
__________________________________________________________________________________________________
block_3a_relu (Activation)      (None, 256, 120, 90) 0           add_5[0][0]                      
__________________________________________________________________________________________________
block_3b_conv_1 (Conv2D)        (None, 256, 120, 90) 590080      block_3a_relu[0][0]              
__________________________________________________________________________________________________
block_3b_bn_1 (BatchNormalizati (None, 256, 120, 90) 1024        block_3b_conv_1[0][0]            
__________________________________________________________________________________________________
block_3b_relu_1 (Activation)    (None, 256, 120, 90) 0           block_3b_bn_1[0][0]              
__________________________________________________________________________________________________
block_3b_conv_2 (Conv2D)        (None, 256, 120, 90) 590080      block_3b_relu_1[0][0]            
__________________________________________________________________________________________________
block_3b_bn_2 (BatchNormalizati (None, 256, 120, 90) 1024        block_3b_conv_2[0][0]            
__________________________________________________________________________________________________
add_6 (Add)                     (None, 256, 120, 90) 0           block_3b_bn_2[0][0]              
                                                                 block_3a_relu[0][0]              
__________________________________________________________________________________________________
block_3b_relu (Activation)      (None, 256, 120, 90) 0           add_6[0][0]                      
__________________________________________________________________________________________________
block_4a_conv_1 (Conv2D)        (None, 512, 120, 90) 1180160     block_3b_relu[0][0]              
__________________________________________________________________________________________________
block_4a_bn_1 (BatchNormalizati (None, 512, 120, 90) 2048        block_4a_conv_1[0][0]            
__________________________________________________________________________________________________
block_4a_relu_1 (Activation)    (None, 512, 120, 90) 0           block_4a_bn_1[0][0]              
__________________________________________________________________________________________________
block_4a_conv_2 (Conv2D)        (None, 512, 120, 90) 2359808     block_4a_relu_1[0][0]            
__________________________________________________________________________________________________
block_4a_conv_shortcut (Conv2D) (None, 512, 120, 90) 131584      block_3b_relu[0][0]              
__________________________________________________________________________________________________
block_4a_bn_2 (BatchNormalizati (None, 512, 120, 90) 2048        block_4a_conv_2[0][0]            
__________________________________________________________________________________________________
block_4a_bn_shortcut (BatchNorm (None, 512, 120, 90) 2048        block_4a_conv_shortcut[0][0]     
__________________________________________________________________________________________________
add_7 (Add)                     (None, 512, 120, 90) 0           block_4a_bn_2[0][0]              
                                                                 block_4a_bn_shortcut[0][0]       
__________________________________________________________________________________________________
block_4a_relu (Activation)      (None, 512, 120, 90) 0           add_7[0][0]                      
__________________________________________________________________________________________________
block_4b_conv_1 (Conv2D)        (None, 512, 120, 90) 2359808     block_4a_relu[0][0]              
__________________________________________________________________________________________________
block_4b_bn_1 (BatchNormalizati (None, 512, 120, 90) 2048        block_4b_conv_1[0][0]            
__________________________________________________________________________________________________
block_4b_relu_1 (Activation)    (None, 512, 120, 90) 0           block_4b_bn_1[0][0]              
__________________________________________________________________________________________________
block_4b_conv_2 (Conv2D)        (None, 512, 120, 90) 2359808     block_4b_relu_1[0][0]            
__________________________________________________________________________________________________
block_4b_bn_2 (BatchNormalizati (None, 512, 120, 90) 2048        block_4b_conv_2[0][0]            
__________________________________________________________________________________________________
add_8 (Add)                     (None, 512, 120, 90) 0           block_4b_bn_2[0][0]              
                                                                 block_4a_relu[0][0]              
__________________________________________________________________________________________________
block_4b_relu (Activation)      (None, 512, 120, 90) 0           add_8[0][0]                      
__________________________________________________________________________________________________
output_bbox (Conv2D)            (None, 24, 120, 90)  12312       block_4b_relu[0][0]              
__________________________________________________________________________________________________
output_cov (Conv2D)             (None, 6, 120, 90)   3078        block_4b_relu[0][0]              
==================================================================================================
Total params: 11,210,718
Trainable params: 11,200,990
Non-trainable params: 9,728
__________________________________________________________________________________________________
2024-06-03 09:10:59,297 [TAO Toolkit] [INFO] root 2102: DetectNet V2 model built.
2024-06-03 09:10:59,298 [TAO Toolkit] [INFO] root 2102: Building rasterizer.
2024-06-03 09:10:59,298 [TAO Toolkit] [INFO] root 2102: Rasterizers built.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/training/training_proto_utilities.py:102: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.
...
INFO:tensorflow:Graph was finalized.
2024-06-03 09:11:10,956 [TAO Toolkit] [INFO] tensorflow 240: Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpjaunkjin/model.ckpt-120
2024-06-03 09:11:10,958 [TAO Toolkit] [INFO] tensorflow 1284: Restoring parameters from /tmp/tmpjaunkjin/model.ckpt-120
2024-06-03 09:11:11,739 [TAO Toolkit] [INFO] root 2102: Restoring from checkpoint failed. This is most likely due to a Variable name or other graph key that is missing from the checkpoint. Please ensure that you have not altered the graph expected based on the checkpoint. Original error:

2 root error(s) found.
  (0) Not found: Key cost_sums/barcode-bbox not found in checkpoint
	 [[node save/RestoreV2 (defined at /tensorflow_core/python/framework/ops.py:1748) ]]
  (1) Not found: Key cost_sums/barcode-bbox not found in checkpoint
	 [[node save/RestoreV2 (defined at /tensorflow_core/python/framework/ops.py:1748) ]]
	 [[save/RestoreV2/_793]]
0 successful operations.
0 derived errors ignored.
...
During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/saver.py", line 1300, in restore
    names_to_keys = object_graph_key_mapping(save_path)
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/saver.py", line 1618, in object_graph_key_mapping
    object_graph_string = reader.get_tensor(trackable.OBJECT_GRAPH_PROTO_KEY)
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/pywrap_tensorflow_internal.py", line 915, in get_tensor
    return CheckpointReader_GetTensor(self, compat.as_bytes(tensor_str))
tensorflow.python.framework.errors_impl.NotFoundError: Key _CHECKPOINTABLE_OBJECT_GRAPH not found in checkpoint
...
2024-06-03 09:11:12,086 [TAO Toolkit] [ERROR] tensorflow 70: ==================================
Object was never used (type <class 'tensorflow.python.framework.ops.Tensor'>):
<tf.Tensor 'IsVariableInitialized_302:0' shape=() dtype=bool>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "/usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/training/utilities.py", line 154, in get_singular_monitored_session
    return tf.train.SingularMonitoredSession(hooks=hooks,  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1100, in __init__
    super(SingularMonitoredSession, self).__init__(  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py", line 727, in __init__
    self._sess = self._coordinated_creator.create_session()  File "/usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/core/hooks/hooks.py", line 286, in begin
    self._variables_initialized.append(  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/util/tf_should_use.py", line 198, in wrapped
    return _add_should_use_warning(fn(*args, **kwargs))
==================================
Execution status: FAIL
2024-06-03 18:11:16,861 [TAO Toolkit] [INFO] nvidia_tao_cli.components.docker_handler.docker_handler 363: Stopping container.

The run ends with "Execution status: FAIL", but I cannot find any clue in the stack trace.
What might be the reason for the failure?
Is it caused by my spec files or by my KITTI files?

Morganh · June 3, 2024, 4:47pm

Is conversion successful? Can you share the log?

Can you share the command line? And also the training spec file?

uma7 · June 4, 2024, 1:37am

@Morganh
Thank you for your reply.

Is conversion successful? Can you share the log?

This is the dataset_convert log.

!tao model detectnet_v2 dataset_convert -d /workspace/tao-experiments/detectnet_v2/specs/spec_tfrecords_kitti.txt \
                                        -o /workspace/tao-experiments/data/tfrecords_aikata
2024-06-03 18:04:13,384 [TAO Toolkit] [INFO] root 160: Registry: ['nvcr.io']
2024-06-03 18:04:13,447 [TAO Toolkit] [INFO] nvidia_tao_cli.components.instance_handler.local_instance 360: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:5.0.0-tf1.15.5
2024-06-03 18:04:13,675 [TAO Toolkit] [INFO] nvidia_tao_cli.components.docker_handler.docker_handler 301: Printing tty value True
2024-06-03 09:04:14.419146: I tensorflow/stream_executor/platform/default/dso_loader.cc:50] Successfully opened dynamic library libcudart.so.12
2024-06-03 09:04:14,453 [TAO Toolkit] [WARNING] tensorflow 40: Deprecation warnings have been disabled. Set TF_ENABLE_DEPRECATION_WARNINGS=1 to re-enable them.
Using TensorFlow backend.
2024-06-03 09:04:15,574 [TAO Toolkit] [WARNING] tensorflow 43: TensorFlow will not use sklearn by default. This improves performance in some cases. To enable sklearn export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:04:15,602 [TAO Toolkit] [WARNING] tensorflow 42: TensorFlow will not use Dask by default. This improves performance in some cases. To enable Dask export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:04:15,605 [TAO Toolkit] [WARNING] tensorflow 43: TensorFlow will not use Pandas by default. This improves performance in some cases. To enable Pandas export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:04:16,602 [TAO Toolkit] [WARNING] matplotlib 500: Matplotlib created a temporary config/cache directory at /tmp/matplotlib-pn4f6fy2 because the default path (/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.
2024-06-03 09:04:16,771 [TAO Toolkit] [INFO] matplotlib.font_manager 1633: generated new fontManager
WARNING:tensorflow:Deprecation warnings have been disabled. Set TF_ENABLE_DEPRECATION_WARNINGS=1 to re-enable them.
Using TensorFlow backend.
WARNING:tensorflow:TensorFlow will not use sklearn by default. This improves performance in some cases. To enable sklearn export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:04:18,053 [TAO Toolkit] [WARNING] tensorflow 43: TensorFlow will not use sklearn by default. This improves performance in some cases. To enable sklearn export the environment variable  TF_ALLOW_IOLIBS=1.
WARNING:tensorflow:TensorFlow will not use Dask by default. This improves performance in some cases. To enable Dask export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:04:18,080 [TAO Toolkit] [WARNING] tensorflow 42: TensorFlow will not use Dask by default. This improves performance in some cases. To enable Dask export the environment variable  TF_ALLOW_IOLIBS=1.
WARNING:tensorflow:TensorFlow will not use Pandas by default. This improves performance in some cases. To enable Pandas export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:04:18,083 [TAO Toolkit] [WARNING] tensorflow 43: TensorFlow will not use Pandas by default. This improves performance in some cases. To enable Pandas export the environment variable  TF_ALLOW_IOLIBS=1.
2024-06-03 09:04:18,443 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.build_converter 87: Instantiating a kitti converter
2024-06-03 09:04:18,445 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.kitti_converter_lib 176: Num images in
Train: 1035	Val: 258
2024-06-03 09:04:18,445 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.kitti_converter_lib 197: Validation data in partition 0. Hence, while choosing the validationset during training choose validation_fold 0.
2024-06-03 09:04:18,446 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 0
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/dataio/dataset_converter_lib.py:181: The name tf.python_io.TFRecordWriter is deprecated. Please use tf.io.TFRecordWriter instead.

2024-06-03 09:04:18,446 [TAO Toolkit] [WARNING] tensorflow 137: From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/dataio/dataset_converter_lib.py:181: The name tf.python_io.TFRecordWriter is deprecated. Please use tf.io.TFRecordWriter instead.

2024-06-03 09:04:18,457 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 1
2024-06-03 09:04:18,465 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 2
2024-06-03 09:04:18,473 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 3
2024-06-03 09:04:18,481 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 4
2024-06-03 09:04:18,489 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 5
2024-06-03 09:04:18,497 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 6
2024-06-03 09:04:18,505 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 7
2024-06-03 09:04:18,513 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 8
2024-06-03 09:04:18,521 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 0, shard 9
2024-06-03 09:04:18,531 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 250: 
Wrote the following numbers of objects:
b'card': 126
b'name': 155
b'price': 157
b'barcode': 162
b'card7p': 33
b'unknown': 394

2024-06-03 09:04:18,531 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 0
2024-06-03 09:04:18,564 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 1
2024-06-03 09:04:18,599 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 2
2024-06-03 09:04:18,634 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 3
2024-06-03 09:04:18,669 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 4
2024-06-03 09:04:18,703 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 5
2024-06-03 09:04:18,736 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 6
2024-06-03 09:04:18,770 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 7
2024-06-03 09:04:18,804 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 8
2024-06-03 09:04:18,838 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 166: Writing partition 1, shard 9
2024-06-03 09:04:18,873 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 250: 
Wrote the following numbers of objects:
b'card': 1316
b'name': 1477
b'barcode': 1494
b'price': 1465
b'card7p': 194
b'unknown': 176

2024-06-03 09:04:18,874 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 89: Cumulative object statistics
2024-06-03 09:04:18,874 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 250: 
Wrote the following numbers of objects:
b'card': 1442
b'name': 1632
b'price': 1622
b'barcode': 1656
b'card7p': 227
b'unknown': 570

2024-06-03 09:04:18,874 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 105: Class map. 
Label in GT: Label in tfrecords file 
b'card': b'card'
b'name': b'name'
b'price': b'price'
b'barcode': b'barcode'
b'card7p': b'card7p'
b'unknown': b'unknown'
For the dataset_config in the experiment_spec, please use labels in the tfrecords file, while writing the classmap.

2024-06-03 09:04:18,874 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataio.dataset_converter_lib 114: Tfrecords generation complete.
Execution status: PASS
2024-06-03 18:04:23,922 [TAO Toolkit] [INFO] nvidia_tao_cli.components.docker_handler.docker_handler 363: Stopping container.

Can you share the command line?

I’m sorry, I wrote several command lines earlier, but the one I actually ran was this command line.

!tao model detectnet_v2 train -e $SPECS_DIR/spec_train_kitti.txt \
                        -r $USER_EXPERIMENT_DIR/experiment_dir_unpruned \
                        -k aikata \
                        -n resnet18_detector \
                        --gpus $NUM_GPUS

2024-06-04 03:07:12,293 [TAO Toolkit] [WARNING] tensorflow 137: From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/training/utilities.py:154: The name tf.train.SingularMonitoredSession is deprecated. Please use tf.compat.v1.train.SingularMonitoredSession instead.

INFO:tensorflow:Graph was finalized.
2024-06-04 03:07:13,725 [TAO Toolkit] [INFO] tensorflow 240: Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpkvz33fwr/model.ckpt-120
2024-06-04 03:07:13,728 [TAO Toolkit] [INFO] tensorflow 1284: Restoring parameters from /tmp/tmpkvz33fwr/model.ckpt-120
2024-06-04 03:07:14,562 [TAO Toolkit] [INFO] root 2102: Restoring from checkpoint failed. This is most likely due to a Variable name or other graph key that is missing from the checkpoint. Please ensure that you have not altered the graph expected based on the checkpoint. Original error:

2 root error(s) found.
  (0) Not found: Key cost_sums/barcode-bbox not found in checkpoint
	 [[node save/RestoreV2 (defined at /tensorflow_core/python/framework/ops.py:1748) ]]
  (1) Not found: Key cost_sums/barcode-bbox not found in checkpoint
	 [[node save/RestoreV2 (defined at /tensorflow_core/python/framework/ops.py:1748) ]]
	 [[save/RestoreV2/_301]]
0 successful operations.
0 derived errors ignored.

Original stack trace for 'save/RestoreV2':
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 1046, in <module>
    main()
  File "/decorator.py", line 232, in fun
    return caller(func, *(extras + args), **kw)
  File "/nvidia_tao_tf1/cv/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
    return_args = fn(*args, **kwargs)
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 1024, in main
    run_experiment(
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 887, in run_experiment
    train_gridbox(results_dir, experiment_spec, output_model_file_name, input_model_file_name,
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 760, in train_gridbox
    run_training_loop(experiment_spec, results_dir, gridbox_model, hooks, steps_per_epoch,
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 227, in run_training_loop
    with get_singular_monitored_session(keras_models=gridbox_model.get_keras_training_model(),
  File "/nvidia_tao_tf1/cv/detectnet_v2/training/utilities.py", line 154, in get_singular_monitored_session
    return tf.train.SingularMonitoredSession(hooks=hooks,
  File "/tensorflow_core/python/training/monitored_session.py", line 1100, in __init__
    super(SingularMonitoredSession, self).__init__(
  File "/tensorflow_core/python/training/monitored_session.py", line 727, in __init__
    self._sess = self._coordinated_creator.create_session()
  File "/tensorflow_core/python/training/monitored_session.py", line 878, in create_session
    self.tf_sess = self._session_creator.create_session()
  File "/tensorflow_core/python/training/monitored_session.py", line 638, in create_session
    self._scaffold.finalize()
  File "/tensorflow_core/python/training/monitored_session.py", line 229, in finalize
    self._saver = training_saver._get_saver_or_default()  # pylint: disable=protected-access
  File "/tensorflow_core/python/training/saver.py", line 599, in _get_saver_or_default
    saver = Saver(sharded=True, allow_empty=True)
  File "/tensorflow_core/python/training/saver.py", line 828, in __init__
    self.build()
  File "/tensorflow_core/python/training/saver.py", line 840, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "/tensorflow_core/python/training/saver.py", line 868, in _build
    self.saver_def = self._builder._build_internal(  # pylint: disable=protected-access
  File "/tensorflow_core/python/training/saver.py", line 501, in _build_internal
    restore_op = self._AddShardedRestoreOps(filename_tensor, per_device,
  File "/tensorflow_core/python/training/saver.py", line 375, in _AddShardedRestoreOps
    self._AddRestoreOps(
  File "/tensorflow_core/python/training/saver.py", line 327, in _AddRestoreOps
    all_tensors = self.bulk_restore(filename_tensor, saveables, preferred_shard,
  File "/tensorflow_core/python/training/saver.py", line 575, in bulk_restore
    return io_ops.restore_v2(filename_tensor, names, slices, dtypes)
  File "/tensorflow_core/python/ops/gen_io_ops.py", line 1693, in restore_v2
    _, _, _op = _op_def_lib._apply_op_helper(
  File "/tensorflow_core/python/framework/op_def_library.py", line 792, in _apply_op_helper
    op = g.create_op(op_type_name, inputs, dtypes=None, name=scope,
  File "/tensorflow_core/python/util/deprecation.py", line 513, in new_func
    return func(*args, **kwargs)
  File "/tensorflow_core/python/framework/ops.py", line 3356, in create_op
    return self._create_op_internal(op_type, inputs, dtypes, input_types, name,
  File "/tensorflow_core/python/framework/ops.py", line 3418, in _create_op_internal
    ret = Operation(
  File "/tensorflow_core/python/framework/ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py", line 1365, in _do_call
    return fn(*args)
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py", line 1349, in _run_fn
    return self._call_tf_sessionrun(options, feed_dict, fetch_list,
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py", line 1441, in _call_tf_sessionrun
    return tf_session.TF_SessionRun_wrapper(self._session, options, feed_dict,
tensorflow.python.framework.errors_impl.NotFoundError: 2 root error(s) found.
  (0) Not found: Key cost_sums/barcode-bbox not found in checkpoint
	 [[{{node save/RestoreV2}}]]
  (1) Not found: Key cost_sums/barcode-bbox not found in checkpoint
	 [[{{node save/RestoreV2}}]]
	 [[save/RestoreV2/_301]]
0 successful operations.
0 derived errors ignored.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/saver.py", line 1289, in restore
    sess.run(self.saver_def.restore_op_name,
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py", line 955, in run
    result = self._run(None, fetches, feed_dict, options_ptr,
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py", line 1179, in _run
    results = self._do_run(handle, final_targets, final_fetches,
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py", line 1358, in _do_run
    return self._do_call(_run_fn, feeds, fetches, targets, options,
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py", line 1384, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.NotFoundError: 2 root error(s) found.
  (0) Not found: Key cost_sums/barcode-bbox not found in checkpoint
	 [[node save/RestoreV2 (defined at /tensorflow_core/python/framework/ops.py:1748) ]]
  (1) Not found: Key cost_sums/barcode-bbox not found in checkpoint
	 [[node save/RestoreV2 (defined at /tensorflow_core/python/framework/ops.py:1748) ]]
	 [[save/RestoreV2/_301]]
0 successful operations.
0 derived errors ignored.

Original stack trace for 'save/RestoreV2':
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 1046, in <module>
    main()
  File "/decorator.py", line 232, in fun
    return caller(func, *(extras + args), **kw)
  File "/nvidia_tao_tf1/cv/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
    return_args = fn(*args, **kwargs)
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 1024, in main
    run_experiment(
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 887, in run_experiment
    train_gridbox(results_dir, experiment_spec, output_model_file_name, input_model_file_name,
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 760, in train_gridbox
    run_training_loop(experiment_spec, results_dir, gridbox_model, hooks, steps_per_epoch,
  File "/nvidia_tao_tf1/cv/detectnet_v2/scripts/train.py", line 227, in run_training_loop
    with get_singular_monitored_session(keras_models=gridbox_model.get_keras_training_model(),
  File "/nvidia_tao_tf1/cv/detectnet_v2/training/utilities.py", line 154, in get_singular_monitored_session
    return tf.train.SingularMonitoredSession(hooks=hooks,
  File "/tensorflow_core/python/training/monitored_session.py", line 1100, in __init__
    super(SingularMonitoredSession, self).__init__(
  File "/tensorflow_core/python/training/monitored_session.py", line 727, in __init__
    self._sess = self._coordinated_creator.create_session()
  File "/tensorflow_core/python/training/monitored_session.py", line 878, in create_session
    self.tf_sess = self._session_creator.create_session()
  File "/tensorflow_core/python/training/monitored_session.py", line 638, in create_session
    self._scaffold.finalize()
  File "/tensorflow_core/python/training/monitored_session.py", line 229, in finalize
    self._saver = training_saver._get_saver_or_default()  # pylint: disable=protected-access
  File "/tensorflow_core/python/training/saver.py", line 599, in _get_saver_or_default
    saver = Saver(sharded=True, allow_empty=True)
  File "/tensorflow_core/python/training/saver.py", line 828, in __init__
    self.build()
  File "/tensorflow_core/python/training/saver.py", line 840, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "/tensorflow_core/python/training/saver.py", line 868, in _build
    self.saver_def = self._builder._build_internal(  # pylint: disable=protected-access
  File "/tensorflow_core/python/training/saver.py", line 501, in _build_internal
    restore_op = self._AddShardedRestoreOps(filename_tensor, per_device,
  File "/tensorflow_core/python/training/saver.py", line 375, in _AddShardedRestoreOps
    self._AddRestoreOps(
  File "/tensorflow_core/python/training/saver.py", line 327, in _AddRestoreOps
    all_tensors = self.bulk_restore(filename_tensor, saveables, preferred_shard,
  File "/tensorflow_core/python/training/saver.py", line 575, in bulk_restore
    return io_ops.restore_v2(filename_tensor, names, slices, dtypes)
  File "/tensorflow_core/python/ops/gen_io_ops.py", line 1693, in restore_v2
    _, _, _op = _op_def_lib._apply_op_helper(
  File "/tensorflow_core/python/framework/op_def_library.py", line 792, in _apply_op_helper
    op = g.create_op(op_type_name, inputs, dtypes=None, name=scope,
  File "/tensorflow_core/python/util/deprecation.py", line 513, in new_func
    return func(*args, **kwargs)
  File "/tensorflow_core/python/framework/ops.py", line 3356, in create_op
    return self._create_op_internal(op_type, inputs, dtypes, input_types, name,
  File "/tensorflow_core/python/framework/ops.py", line 3418, in _create_op_internal
    ret = Operation(
  File "/tensorflow_core/python/framework/ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()


2024-06-04 03:07:15,155 [TAO Toolkit] [ERROR] tensorflow 70: ==================================
Object was never used (type <class 'tensorflow.python.framework.ops.Tensor'>):
<tf.Tensor 'IsVariableInitialized_302:0' shape=() dtype=bool>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "/usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/training/utilities.py", line 154, in get_singular_monitored_session
    return tf.train.SingularMonitoredSession(hooks=hooks,  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1100, in __init__
    super(SingularMonitoredSession, self).__init__(  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py", line 727, in __init__
    self._sess = self._coordinated_creator.create_session()  File "/usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/core/hooks/hooks.py", line 286, in begin
    self._variables_initialized.append(  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/util/tf_should_use.py", line 198, in wrapped
    return _add_should_use_warning(fn(*args, **kwargs))
==================================
Execution status: FAIL
2024-06-04 12:07:18,760 [TAO Toolkit] [INFO] nvidia_tao_cli.components.docker_handler.docker_handler 363: Stopping container.

And also the training spec file?

Here’s the training spec file.

!cat $LOCAL_SPECS_DIR/spec_train_kitti.txt
random_seed: 42
dataset_config {
  data_sources {
    tfrecords_path: "/workspace/tao-experiments/data/tfrecords_aikata/*"
    image_directory_path: "/workspace/tao-experiments/data/training"
  }
  image_extension: "jpg"
  target_class_mapping {
    key: "barcode"
    value: "barcode"
  }
  target_class_mapping {
    key: "name"
    value: "name"
  }
  target_class_mapping {
    key: "price"
    value: "price"
  }
  target_class_mapping {
    key: "card"
    value: "card"
  }
  target_class_mapping {
    key: "card7p"
    value: "card7p"
  }
  target_class_mapping {
    key: "unknown"
    value: "unknown"
  }
  validation_fold: 0
}
model_config {
  pretrained_model_file: "/workspace/tao-experiments/detectnet_v2_test/pretrained_resnet18/pretrained_detectnet_v2_vresnet18/resnet18.hdf5"
  num_layers: 18
  use_batch_norm: true
  objective_set {
    bbox {
      scale: 35.0
      offset: 0.5
    }
    cov {
    }
  }
  arch: "resnet"
}
augmentation_config {
  preprocessing {
    output_image_width: 1440
    output_image_height: 1920
    min_bbox_width: 1.0
    min_bbox_height: 1.0
    output_image_channel: 3
  }
  spatial_augmentation {
    hflip_probability: 0.5
    zoom_min: 1.0
    zoom_max: 1.0
    translate_max_x: 8.0
    translate_max_y: 8.0
  }
  color_augmentation {
    hue_rotation_max: 25.0
    saturation_shift_max: 0.20000000298
    contrast_scale_max: 0.10000000149
    contrast_center: 0.5
  }
}
postprocessing_config {
  target_class_config {
    key: "barcode"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00499999988824
        dbscan_eps: 0.20000000298
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "name"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00499999988824
        dbscan_eps: 0.15000000596
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "price"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "card"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "card7p"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "unknown"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 1
        minimum_bounding_box_height: 20
      }
    }
  }
}
evaluation_config {
  validation_period_during_training: 10
  first_validation_epoch: 30
  minimum_detection_ground_truth_overlap {
    key: "barcode"
    value: 0.699999988079
  }
  minimum_detection_ground_truth_overlap {
    key: "name"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "price"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "card"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "card7p"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "unknown"
    value: 0.5
  }
  evaluation_box_config {
    key: "barcode"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "name"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "price"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "card"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "card7p"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "unknown"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  average_precision_mode: INTEGRATE
}
cost_function_config {
  target_classes {
    name: "barcode"
    class_weight: 1.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "name"
    class_weight: 8.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 1.0
    }
  }
  target_classes {
    name: "price"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "card"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "card7p"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "unknown"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  enable_autoweighting: false
  max_objective_weight: 0.999899983406
  min_objective_weight: 9.99999974738e-05
}
training_config {
  batch_size_per_gpu: 4
  num_epochs: 120
  learning_rate {
    soft_start_annealing_schedule {
      min_learning_rate: 5e-07
      max_learning_rate: 5e-05
      soft_start: 0.10000000149
      annealing: 0.699999988079
    }
  }
  regularizer {
    type: L1
    weight: 3.00000002618e-09
  }
  optimizer {
    adam {
      epsilon: 9.99999993923e-09
      beta1: 0.899999976158
      beta2: 0.999000012875
    }
  }
  cost_scaling {
    initial_exponent: 20.0
    increment: 0.005
    decrement: 1.0
  }
  visualizer{
    enabled: true
    num_images: 3
    scalar_logging_frequency: 50
    infrequent_logging_frequency: 5
    target_class_config {
      key: "barcode"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "name"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "price"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "card"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "card7p"
      value: {
        coverage_threshold: 0.005
      }
    }
    target_class_config {
      key: "unknown"
      value: {
        coverage_threshold: 0.005
      }
    }
    clearml_config{
      project: "TAO Toolkit ClearML Demo"
      task: "detectnet_v2_resnet18_clearml"
      tags: "detectnet_v2"
      tags: "training"
      tags: "resnet18"
      tags: "unpruned"
    }
    wandb_config{
      project: "TAO Toolkit Wandb Demo"
      name: "detectnet_v2_resnet18_wandb"
      tags: "detectnet_v2"
      tags: "training"
      tags: "resnet18"
      tags: "unpruned"
    }
  }
  checkpoint_interval: 10
}
bbox_rasterizer_config {
  target_class_config {
    key: "barcode"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 0.40000000596
      cov_radius_y: 0.40000000596
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "name"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "price"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "card"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "card7p"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "unknown"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  deadzone_radius: 0.400000154972
}

INFO

env: NUM_GPUS=1
env: USER_EXPERIMENT_DIR=/workspace/tao-experiments/detectnet_v2
env: DATA_DOWNLOAD_DIR=/workspace/tao-experiments/data
env: SPECS_DIR=/workspace/tao-experiments/detectnet_v2/specs
{
    "Mounts": [
        {
            "source": "/home/ym7/tao-jupyter/getting_started_v5.3.0/notebooks/tao_launcher_starter_kit/detectnet_v2_test",
            "destination": "/workspace/tao-experiments"
        },
        {
            "source": "/home/ym7/tao-jupyter/getting_started_v5.3.0/notebooks/tao_launcher_starter_kit/detectnet_v2_test/specs",
            "destination": "/workspace/tao-experiments/detectnet_v2/specs"
        }
    ],
    "DockerOptions": {
        "user": "1000:1000"
    }
}

!ls -rlt $LOCAL_EXPERIMENT_DIR/pretrained_resnet18/pretrained_detectnet_v2_vresnet18
-rw-r--r-- 1 ym7 ym7 93345248 May 19 12:48 resnet18.hdf5

$ ll detectnet_v2_test/pretrained_resnet18/pretrained_detectnet_v2_vresnet18/
-rw-r--r-- 1 ym7 ym7 93345248 May 19 12:48 resnet18.hdf5

$ ll data/tfrecords_aikata
-rw-r--r-- 1 ym7 ym7  21611 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00000-of-00010
-rw-r--r-- 1 ym7 ym7  19896 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00001-of-00010
-rw-r--r-- 1 ym7 ym7  19934 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00002-of-00010
-rw-r--r-- 1 ym7 ym7  19034 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00003-of-00010
-rw-r--r-- 1 ym7 ym7  20116 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00004-of-00010
-rw-r--r-- 1 ym7 ym7  19864 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00005-of-00010
-rw-r--r-- 1 ym7 ym7  20102 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00006-of-00010
-rw-r--r-- 1 ym7 ym7  21005 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00007-of-00010
-rw-r--r-- 1 ym7 ym7  20952 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00008-of-00010
-rw-r--r-- 1 ym7 ym7  25980 Jun  3 18:04 tfrecords_aikata-fold-000-of-002-shard-00009-of-00010
-rw-r--r-- 1 ym7 ym7  91719 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00000-of-00010
-rw-r--r-- 1 ym7 ym7  94916 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00001-of-00010
-rw-r--r-- 1 ym7 ym7  94321 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00002-of-00010
-rw-r--r-- 1 ym7 ym7  98650 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00003-of-00010
-rw-r--r-- 1 ym7 ym7  92281 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00004-of-00010
-rw-r--r-- 1 ym7 ym7  92103 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00005-of-00010
-rw-r--r-- 1 ym7 ym7  96474 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00006-of-00010
-rw-r--r-- 1 ym7 ym7  95012 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00007-of-00010
-rw-r--r-- 1 ym7 ym7  97688 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00008-of-00010
-rw-r--r-- 1 ym7 ym7 100730 Jun  3 18:04 tfrecords_aikata-fold-001-of-002-shard-00009-of-00010

$ ll data/training/
drwxrwxrwx 2 ym7 ym7 237568 Jun  1  2012 image_2/
drwxrwxrwx 2 ym7 ym7  61440 Jun  3 18:03 images_aikata/
drwxrwxrwx 2 ym7 ym7  32768 May 19 18:48 images_val/
drwxrwxrwx 2 ym7 ym7 245760 May 21  2015 label_2/
drwxrwxrwx 2 ym7 ym7  73728 Jun  3 18:01 labels_aikata/
drwxrwxrwx 2 ym7 ym7  36864 May 19 18:48 labels_val/

$ ll data
drwxrwxrwx 4 ym7 ym7        4096 Jun  3 15:42 testing/
drwxrwxrwx 2 ym7 ym7        4096 May 17 07:06 test_samples/
drwxrwxrwx 2 ym7 ym7        4096 Jun  3 18:05 tfrecords_aikata/
drwxrwxrwx 8 ym7 ym7        4096 May 31 11:09 training/

uma7 · June 4, 2024, 5:45am

@Morganh
I am sorry; after I set up the environment again step by step, the original error seems to be solved.

It seems tao model detectnet_v2 train is now running, although it eventually runs out of GPU memory.

2024-06-04 03:58:54,445 [TAO Toolkit] [INFO] root 2102: Training graph built.
2024-06-04 03:58:54,445 [TAO Toolkit] [INFO] root 2102: Building validation graph.
2024-06-04 03:58:54,446 [TAO Toolkit] [INFO] nvidia_tao_tf1.blocks.multi_source_loader.data_loader 175: Serial augmentation enabled = False
2024-06-04 03:58:54,446 [TAO Toolkit] [INFO] nvidia_tao_tf1.blocks.multi_source_loader.data_loader 177: Pseudo sharding enabled = False
2024-06-04 03:58:54,446 [TAO Toolkit] [INFO] nvidia_tao_tf1.blocks.multi_source_loader.data_loader 269: Max Image Dimensions (all sources): (0, 0)
2024-06-04 03:58:54,446 [TAO Toolkit] [INFO] nvidia_tao_tf1.blocks.multi_source_loader.data_loader 380: number of cpus: 16, io threads: 32, compute threads: 16, buffered batches: 4
2024-06-04 03:58:54,446 [TAO Toolkit] [INFO] nvidia_tao_tf1.blocks.multi_source_loader.data_loader 387: total dataset size 258, number of sources: 1, batch size per gpu: 1, steps: 258
2024-06-04 03:58:54,464 [TAO Toolkit] [INFO] nvidia_tao_tf1.cv.detectnet_v2.dataloader.default_dataloader 546: Bounding box coordinates were detected in the input specification! Bboxes will be automatically converted to polygon coordinates.
2024-06-04 03:58:54,598 [TAO Toolkit] [INFO] nvidia_tao_tf1.blocks.multi_source_loader.data_loader 409: shuffle: False - shard 0 of 1
2024-06-04 03:58:54,601 [TAO Toolkit] [INFO] nvidia_tao_tf1.blocks.multi_source_loader.data_loader 479: sampling 1 datasets with weights:
2024-06-04 03:58:54,601 [TAO Toolkit] [INFO] nvidia_tao_tf1.blocks.multi_source_loader.data_loader 481: source: 0 weight: 1.000000
2024-06-04 03:58:54,737 [TAO Toolkit] [INFO] __main__ 591: Found 258 samples in validation set
2024-06-04 03:58:54,737 [TAO Toolkit] [INFO] root 2102: Rasterizing tensors.
2024-06-04 03:58:54,854 [TAO Toolkit] [INFO] root 2102: Tensors rasterized.
2024-06-04 03:58:55,071 [TAO Toolkit] [INFO] root 2102: Validation graph built.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/tfhooks/validation_hook.py:58: The name tf.summary.FileWriterCache is deprecated. Please use tf.compat.v1.summary.FileWriterCache instead.

2024-06-04 03:58:55,072 [TAO Toolkit] [WARNING] tensorflow 137: From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/tfhooks/validation_hook.py:58: The name tf.summary.FileWriterCache is deprecated. Please use tf.compat.v1.summary.FileWriterCache instead.

2024-06-04 03:58:56,296 [TAO Toolkit] [INFO] root 2102: Running training loop.
2024-06-04 03:58:56,296 [TAO Toolkit] [INFO] __main__ 135: Checkpoint interval: 10
2024-06-04 03:58:56,297 [TAO Toolkit] [INFO] __main__ 175: Scalars logged at every 20 steps
2024-06-04 03:58:56,297 [TAO Toolkit] [INFO] __main__ 180: Images logged at every 5175 steps
INFO:tensorflow:Create CheckpointSaverHook.
2024-06-04 03:58:56,299 [TAO Toolkit] [INFO] tensorflow 541: Create CheckpointSaverHook.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/training/utilities.py:154: The name tf.train.SingularMonitoredSession is deprecated. Please use tf.compat.v1.train.SingularMonitoredSession instead.

2024-06-04 03:58:56,587 [TAO Toolkit] [WARNING] tensorflow 137: From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/detectnet_v2/training/utilities.py:154: The name tf.train.SingularMonitoredSession is deprecated. Please use tf.compat.v1.train.SingularMonitoredSession instead.

INFO:tensorflow:Graph was finalized.
2024-06-04 03:58:58,001 [TAO Toolkit] [INFO] tensorflow 240: Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpuuxekwqa/model.ckpt-0
2024-06-04 03:58:58,003 [TAO Toolkit] [INFO] tensorflow 1284: Restoring parameters from /tmp/tmpuuxekwqa/model.ckpt-0
INFO:tensorflow:Running local_init_op.
2024-06-04 03:58:59,364 [TAO Toolkit] [INFO] tensorflow 500: Running local_init_op.
INFO:tensorflow:Done running local_init_op.
2024-06-04 03:58:59,850 [TAO Toolkit] [INFO] tensorflow 502: Done running local_init_op.
INFO:tensorflow:Saving checkpoints for step-0.
2024-06-04 03:59:03,521 [TAO Toolkit] [INFO] tensorflow 81: Saving checkpoints for step-0.
2024-06-04 03:59:17,147 [TAO Toolkit] [INFO] nvidia_tao_tf1.core.hooks.hooks 224: Overwritten Keras model: /workspace/tao-experiments/detectnet_v2_test/experiment_dir_unpruned/model.epoch-0.hdf5.
2024-06-04 04:12:15,938 [TAO Toolkit] [INFO] root 2102: Saving trained model.
2024-06-04 04:12:16,101 [TAO Toolkit] [INFO] root 2102: Model saved.
2024-06-04 04:12:16,284 [TAO Toolkit] [ERROR] __main__ 1050: Ran out of GPU memory, please lower the batch size, use a smaller input resolution, or use a smaller backbone.
2024-06-04 04:12:16,284 [TAO Toolkit] [INFO] root 2102: Ran out of GPU memory, please lower the batch size, use a smaller input resolution, or use a smaller backbone.
Execution status: FAIL
2024-06-04 13:12:20,790 [TAO Toolkit] [INFO] nvidia_tao_cli.components.docker_handler.docker_handler 363: Stopping container.

I’ll try this again on another GPU machine, and then I can judge whether the problem is solved or not.

uma7 · June 4, 2024, 1:26pm

Thanks.
I was able to train with tao model detectnet_v2 train.

system · June 18, 2024, 1:27pm

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
ValueError: Number of logging points 50 must be <= than the number of steps per epoch 16 TAO Toolkit	2	467	August 29, 2023
Tao toolkit observations TAO Toolkit	56	786	May 29, 2024
The mAP value starting from 0.05% although the freeze block is set. Shouldn't it preserve the features of the base peoplenet model? TAO Toolkit	4	278	March 21, 2024
TAO 5.0 failed to train TAO Toolkit	8	533	August 1, 2023
How to create an AI model with annotation and image data? TAO Toolkit tao	8	229	June 3, 2024
Detectnet_v2.ipynb issue with custom data TAO Toolkit tao	3	272	May 17, 2024
Detectnetv2 tfrecords error TAO Toolkit	4	415	January 13, 2024
Tao toolkit detectnet training kitty format error TAO Toolkit	10	412	December 8, 2023
Detectnet2 TAO Toolkit model training fail on formating dataset on kitti format TAO Toolkit	69	948	January 22, 2024
Tao Training Model Error TAO Toolkit	7	484	January 15, 2024

Execute tao model detectnet_v2 train but Failed

Related topics