Original error: could not get source code

I get the following error.
I am using a “p3.16xlarge” instance of AWS.
There are 8 GPUs.

The TAO training fails.
How can I resolve this?

!tao detectnet_v2 train -e $SPECS_DIR/detectnet_v2_train_resnet18_kitti.txt \
                        -r $USER_EXPERIMENT_DIR/experiment_dir_unpruned \
                        -k $KEY \
                        -n resnet18_detector \
                        --gpus 1

■detectnet_v2_tfrecords_kitti_trainval.txt

kitti_config {
  root_directory_path: "/home/ubuntu/development/workspace/aquarium/dataDownloadDir/training"
  image_dir_name: "image_2"
  label_dir_name: "label_2"
  image_extension: ".jpg"
  partition_mode: "random"
  num_partitions: 2
  val_split: 14
  num_shards: 10
}
image_directory_path: "/home/ubuntu/development/workspace/aquarium/dataDownloadDir/training"

■detectnet_v2_train_resnet18_kitti.txt

random_seed: 42
dataset_config {
  data_sources {
    tfrecords_path: "/home/ubuntu/development/workspace/aquarium/dataDownloadDir/tfrecords/kitti_trainval/*"
    image_directory_path: "/home/ubuntu/development/workspace/aquarium/dataDownloadDir/training"
  }
  image_extension: "jpg"
  target_class_mapping {
    key: "fish"
    value: "fish"
  }
  validation_fold: 0
}
augmentation_config {
  preprocessing {
    output_image_width: 1248
    output_image_height: 384
    min_bbox_width: 1.0
    min_bbox_height: 1.0
    output_image_channel: 3
  }
  spatial_augmentation {
    hflip_probability: 0.5
    zoom_min: 1.0
    zoom_max: 1.0
    translate_max_x: 8.0
    translate_max_y: 8.0
  }
  color_augmentation {
    hue_rotation_max: 25.0
    saturation_shift_max: 0.20000000298
    contrast_scale_max: 0.10000000149
    contrast_center: 0.5
  }
}
postprocessing_config {
  target_class_config {
    key: "fish"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00499999988824
        dbscan_eps: 0.20000000298
        dbscan_min_samples: 0.0500000007451
        minimum_bounding_box_height: 20
      }
    }
  }
}
model_config {
  pretrained_model_file: "/home/ubuntu/development/workspace/aquarium/localProjectDir/detectnet_v2/pretrained_resnet18/pretrained_detectnet_v2_vresnet18/resnet18.hdf5"
  num_layers: 18
  use_batch_norm: true
  objective_set {
    bbox {
      scale: 35.0
      offset: 0.5
    }
    cov {
    }
  }
  arch: "resnet"
}
evaluation_config {
  validation_period_during_training: 10
  first_validation_epoch: 10
  minimum_detection_ground_truth_overlap {
    key: "fish"
    value: 0.699999988079
  }
  evaluation_box_config {
    key: "fish"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  average_precision_mode: INTEGRATE
}
cost_function_config {
  target_classes {
    name: "fish"
    class_weight: 1.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  enable_autoweighting: true
  max_objective_weight: 0.999899983406
  min_objective_weight: 9.99999974738e-05
}
training_config {
  batch_size_per_gpu: 1
  num_epochs: 120
  learning_rate {
    soft_start_annealing_schedule {
      min_learning_rate: 5e-06
      max_learning_rate: 5e-04
      soft_start: 0.10000000149
      annealing: 0.699999988079
    }
  }
  regularizer {
    type: L1
    weight: 3.00000002618e-09
  }
  optimizer {
    adam {
      epsilon: 9.99999993923e-09
      beta1: 0.899999976158
      beta2: 0.999000012875
    }
  }
  cost_scaling {
    initial_exponent: 20.0
    increment: 0.005
    decrement: 1.0
  }
  checkpoint_interval: 10
}
bbox_rasterizer_config {
  target_class_config {
    key: "fish"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 0.40000000596
      cov_radius_y: 0.40000000596
      bbox_min_radius: 1.0
    }
  }
  deadzone_radius: 0.400000154972
}

■error log

2022-06-11 06:37:44,412 [INFO] root: Tensors rasterized.
WARNING:tensorflow:From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/training_proto_utilities.py:89: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.

2022-06-11 06:37:44,412 [WARNING] tensorflow: From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/training_proto_utilities.py:89: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.

WARNING:tensorflow:From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py:515: The name tf.summary.scalar is deprecated. Please use tf.compat.v1.summary.scalar instead.

2022-06-11 06:37:44,434 [WARNING] tensorflow: From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py:515: The name tf.summary.scalar is deprecated. Please use tf.compat.v1.summary.scalar instead.

WARNING:tensorflow:From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/training_proto_utilities.py:36: The name tf.train.AdamOptimizer is deprecated. Please use tf.compat.v1.train.AdamOptimizer instead.

2022-06-11 06:37:44,436 [WARNING] tensorflow: From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/training_proto_utilities.py:36: The name tf.train.AdamOptimizer is deprecated. Please use tf.compat.v1.train.AdamOptimizer instead.

WARNING:tensorflow:From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/cost_function/cost_functions.py:17: The name tf.log is deprecated. Please use tf.math.log instead.

2022-06-11 06:37:44,659 [WARNING] tensorflow: From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/cost_function/cost_functions.py:17: The name tf.log is deprecated. Please use tf.math.log instead.

WARNING:tensorflow:From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/cost_function/cost_auto_weight_hook.py:235: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.

2022-06-11 06:37:44,673 [WARNING] tensorflow: From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/cost_function/cost_auto_weight_hook.py:235: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.

2022-06-11 06:37:47,233 [INFO] root: Training graph built.
2022-06-11 06:37:47,234 [INFO] root: Building validation graph.
2022-06-11 06:37:47,234 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: Serial augmentation enabled = False
2022-06-11 06:37:47,234 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: Pseudo sharding enabled = False
2022-06-11 06:37:47,235 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: Max Image Dimensions (all sources): (0, 0)
2022-06-11 06:37:47,235 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: number of cpus: 64, io threads: 128, compute threads: 64, buffered batches: 4
2022-06-11 06:37:47,235 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: total dataset size 3, number of sources: 1, batch size per gpu: 1, steps: 3
WARNING:tensorflow:Entity <bound method DriveNetTFRecordsParser.__call__ of <iva.detectnet_v2.dataloader.drivenet_dataloader.DriveNetTFRecordsParser object at 0x7fce64da77f0>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Unable to locate the source code of <bound method DriveNetTFRecordsParser.__call__ of <iva.detectnet_v2.dataloader.drivenet_dataloader.DriveNetTFRecordsParser object at 0x7fce64da77f0>>. Note that functions defined in certain environments, like the interactive Python shell do not expose their source code. If that is the case, you should to define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.do_not_convert. Original error: could not get source code
2022-06-11 06:37:47,248 [WARNING] tensorflow: Entity <bound method DriveNetTFRecordsParser.__call__ of <iva.detectnet_v2.dataloader.drivenet_dataloader.DriveNetTFRecordsParser object at 0x7fce64da77f0>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Unable to locate the source code of <bound method DriveNetTFRecordsParser.__call__ of <iva.detectnet_v2.dataloader.drivenet_dataloader.DriveNetTFRecordsParser object at 0x7fce64da77f0>>. Note that functions defined in certain environments, like the interactive Python shell do not expose their source code. If that is the case, you should to define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.do_not_convert. Original error: could not get source code
Traceback (most recent call last):
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 917, in <module>
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 906, in <module>
  File "<decorator-gen-2>", line 2, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 893, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 757, in run_experiment
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 682, in train_gridbox
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 556, in build_validation_graph
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/dataloader/drivenet_dataloader.py", line 708, in get_dataset_tensors
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/core/build_wheel.runfiles/ai_infra/moduluspy/modulus/blocks/trainers/multi_task_trainer/data_loader_interface.py", line 77, in __call__
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/core/build_wheel.runfiles/ai_infra/moduluspy/modulus/blocks/data_loaders/multi_source_loader/data_loader.py", line 396, in call
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 2081, in apply
    return DatasetV1Adapter(super(DatasetV1, self).apply(transformation_func))
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 1422, in apply
    dataset = transformation_func(self)
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/dataloader/drivenet_dataloader.py", line 408, in <lambda>
  File "/opt/nvidia/third_party/keras/tensorflow_backend.py", line 356, in new_map
    self, _map_func_set_random_wrapper, num_parallel_calls=num_parallel_calls
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 2000, in map
    MapDataset(self, map_func, preserve_cardinality=False))
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 3531, in __init__
    use_legacy_function=use_legacy_function)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 2810, in __init__
    self._function = wrapper_fn._get_concrete_function_internal()
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py", line 1853, in _get_concrete_function_internal
    *args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py", line 1847, in _get_concrete_function_internal_garbage_collected
    graph_function, _, _ = self._maybe_define_function(args, kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py", line 2147, in _maybe_define_function
    graph_function = self._create_graph_function(args, kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py", line 2038, in _create_graph_function
    capture_by_value=self._capture_by_value),
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/func_graph.py", line 915, in func_graph_from_py_func
    func_outputs = python_func(*func_args, **func_kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 2804, in wrapper_fn
    ret = _wrapper_helper(*args)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 2749, in _wrapper_helper
    ret = autograph.tf_convert(func, ag_ctx)(*nested_args)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/autograph/impl/api.py", line 237, in wrapper
    raise e.ag_error_metadata.to_exception(e)
StopIteration: in converted code:

    /opt/nvidia/third_party/keras/tensorflow_backend.py:353 _map_func_set_random_wrapper  *
        return map_func(*args, **kwargs)
    /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/dataloader/drivenet_dataloader.py:147 __call__
        
    /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/dataloader/drivenet_dataloader.py:115 _get_parse_example
        
    /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/dataloader/utilities.py:217 extract_tfrecords_features
        

    StopIteration: 

2022-06-11 06:37:48,750 [INFO] tlt.components.docker_handler.docker_handler: Stopping container.

■error command

!tao detectnet_v2 train -e $SPECS_DIR/detectnet_v2_train_resnet18_kitti.txt \
                        -r $USER_EXPERIMENT_DIR/experiment_dir_unpruned \
                        -k $KEY \
                        -n resnet18_detector \
                        --gpus 1

Trying it with kitti car data works.
It seems that the number of epochs and batch_size_per_gpu are related.

It did not work with the original dataset.
Is it also related to the image ratio?

I converted the original dataset to kitti format.

This is what the image of my original data set looks like.
It is smaller than the KITTI image.

IMG_2427_jpeg_jpg.rf.a1f5cac56335f054c3c52f3dfe4e6cc6

■label data like this

fish 0.00 0 0.0 62.00 12.00 256.00 156.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0
fish 0.00 0 0.0 199.00 192.00 293.00 289.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0
fish 0.00 0 0.0 300.00 272.00 399.00 352.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0
fish 0.00 0 0.0 266.00 125.00 320.00 204.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0
fish 0.00 0 0.0 30.00 260.00 49.00 291.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0
fish 0.00 0 0.0 153.00 251.00 176.00 278.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0
fish 0.00 0 0.0 101.00 245.00 130.00 271.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0
fish 0.00 0 0.0 1.00 295.00 28.00 318.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0
fish 0.00 0 0.0 182.00 266.00 211.00 288.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0
fish 0.00 0 0.0 109.00 274.00 146.00 295.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0
fish 0.00 0 0.0 53.00 272.00 88.00 287.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0
fish 0.00 0 0.0 124.00 248.00 147.00 263.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0
fish 0.00 0 0.0 19.00 277.00 60.00 292.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0
fish 0.00 0 0.0 1.00 300.00 36.00 315.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0

■detectnet_v2_train_resnet18_kitti.txt

random_seed: 42
dataset_config {
  data_sources {
    tfrecords_path: "/home/ubuntu/development/workspace/aquarium/dataDownloadDir/tfrecords/kitti_trainval/*"
    image_directory_path: "/home/ubuntu/development/workspace/aquarium/dataDownloadDir/training"
  }
  image_extension: "jpg"
  target_class_mapping {
    key: "fish"
    value: "fish"
  }
  target_class_mapping {
    key: "jellyfish"
    value: "jellyfish"
  }
  target_class_mapping {
    key: "penguin"
    value: "penguin"
  }
  target_class_mapping {
    key: "puffin"
    value: "puffin"
  }
  target_class_mapping {
    key: "shark"
    value: "shark"
  }
  validation_fold: 0
}
augmentation_config {
  preprocessing {
    output_image_width: 1248
    output_image_height: 384
    min_bbox_width: 1.0
    min_bbox_height: 1.0
    output_image_channel: 3
  }
  spatial_augmentation {
    hflip_probability: 0.5
    zoom_min: 1.0
    zoom_max: 1.0
    translate_max_x: 8.0
    translate_max_y: 8.0
  }
  color_augmentation {
    hue_rotation_max: 25.0
    saturation_shift_max: 0.20000000298
    contrast_scale_max: 0.10000000149
    contrast_center: 0.5
  }
}
postprocessing_config {
  target_class_config {
    key: "fish"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00499999988824
        dbscan_eps: 0.20000000298
        dbscan_min_samples: 0.0500000007451
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "jellyfish"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00499999988824
        dbscan_eps: 0.15000000596
        dbscan_min_samples: 0.0500000007451
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "penguin"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 0.0500000007451
        minimum_bounding_box_height: 20
      }
    }
  }
}
model_config {
  pretrained_model_file: "/home/ubuntu/development/workspace/aquarium/localProjectDir/detectnet_v2/pretrained_resnet18/pretrained_detectnet_v2_vresnet18/resnet18.hdf5"
  num_layers: 18
  use_batch_norm: true
  objective_set {
    bbox {
      scale: 35.0
      offset: 0.5
    }
    cov {
    }
  }
  arch: "resnet"
}
evaluation_config {
  validation_period_during_training: 10
  first_validation_epoch: 30
  minimum_detection_ground_truth_overlap {
    key: "fish"
    value: 0.699999988079
  }
  minimum_detection_ground_truth_overlap {
    key: "jellyfish"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "penguin"
    value: 0.5
  }
  evaluation_box_config {
    key: "fish"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "jellyfish"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "penguin"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  average_precision_mode: INTEGRATE
}
cost_function_config {
  target_classes {
    name: "fish"
    class_weight: 1.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "jellyfish"
    class_weight: 8.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 1.0
    }
  }
  target_classes {
    name: "penguin"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  enable_autoweighting: true
  max_objective_weight: 0.999899983406
  min_objective_weight: 9.99999974738e-05
}
training_config {
  batch_size_per_gpu: 1
  num_epochs: 120
  learning_rate {
    soft_start_annealing_schedule {
      min_learning_rate: 5e-06
      max_learning_rate: 5e-04
      soft_start: 0.10000000149
      annealing: 0.699999988079
    }
  }
  regularizer {
    type: L1
    weight: 3.00000002618e-09
  }
  optimizer {
    adam {
      epsilon: 9.99999993923e-09
      beta1: 0.899999976158
      beta2: 0.999000012875
    }
  }
  cost_scaling {
    initial_exponent: 20.0
    increment: 0.005
    decrement: 1.0
  }
  checkpoint_interval: 10
}
bbox_rasterizer_config {
  target_class_config {
    key: "fish"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 0.40000000596
      cov_radius_y: 0.40000000596
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "jellyfish"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "penguin"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  deadzone_radius: 0.400000154972
}

now, this error occur.

■error log

2022-06-11 09:42:51,199 [WARNING] tensorflow: From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/common/graph/initializers.py:16: The name tf.get_collection is deprecated. Please use tf.compat.v1.get_collection instead.

WARNING:tensorflow:From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/tfhooks/utils.py:61: The name tf.train.LoggingTensorHook is deprecated. Please use tf.estimator.LoggingTensorHook instead.

2022-06-11 09:42:51,201 [WARNING] tensorflow: From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/tfhooks/utils.py:61: The name tf.train.LoggingTensorHook is deprecated. Please use tf.estimator.LoggingTensorHook instead.

WARNING:tensorflow:From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/tfhooks/utils.py:62: The name tf.train.StopAtStepHook is deprecated. Please use tf.estimator.StopAtStepHook instead.

2022-06-11 09:42:51,202 [WARNING] tensorflow: From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/tfhooks/utils.py:62: The name tf.train.StopAtStepHook is deprecated. Please use tf.estimator.StopAtStepHook instead.

WARNING:tensorflow:From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/tfhooks/utils.py:75: The name tf.train.StepCounterHook is deprecated. Please use tf.estimator.StepCounterHook instead.

2022-06-11 09:42:51,202 [WARNING] tensorflow: From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/tfhooks/utils.py:75: The name tf.train.StepCounterHook is deprecated. Please use tf.estimator.StepCounterHook instead.

INFO:tensorflow:Create CheckpointSaverHook.
2022-06-11 09:42:51,202 [INFO] tensorflow: Create CheckpointSaverHook.
WARNING:tensorflow:From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/tfhooks/utils.py:104: The name tf.train.SummarySaverHook is deprecated. Please use tf.estimator.SummarySaverHook instead.

2022-06-11 09:42:51,202 [WARNING] tensorflow: From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/tfhooks/utils.py:104: The name tf.train.SummarySaverHook is deprecated. Please use tf.estimator.SummarySaverHook instead.

WARNING:tensorflow:From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/utilities.py:140: The name tf.train.SingularMonitoredSession is deprecated. Please use tf.compat.v1.train.SingularMonitoredSession instead.

2022-06-11 09:42:51,203 [WARNING] tensorflow: From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/utilities.py:140: The name tf.train.SingularMonitoredSession is deprecated. Please use tf.compat.v1.train.SingularMonitoredSession instead.

INFO:tensorflow:Graph was finalized.
2022-06-11 09:42:54,583 [INFO] tensorflow: Graph was finalized.
INFO:tensorflow:Running local_init_op.
2022-06-11 09:42:56,232 [INFO] tensorflow: Running local_init_op.
INFO:tensorflow:Done running local_init_op.
2022-06-11 09:42:57,035 [INFO] tensorflow: Done running local_init_op.
2022-06-11 09:43:06,360 [INFO] root: Saving trained model.
2022-06-11 09:43:06,395 [INFO] root: Saving trained model.
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1365, in _do_call
    return fn(*args)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1350, in _run_fn
    target_list, run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1443, in _call_tf_sessionrun
    run_metadata)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument: bboxes class ID out of range [0, 3[, got-1
	 [[{{node BboxRasterizer/RasterizeBbox}}]]
	 [[resnet18_nopool_bn_detectnet_v2/block_2a_bn_1/AssignMovingAvg/_4451]]
  (1) Invalid argument: bboxes class ID out of range [0, 3[, got-1
	 [[{{node BboxRasterizer/RasterizeBbox}}]]
0 successful operations.
0 derived errors ignored.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 917, in <module>
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 906, in <module>
  File "<decorator-gen-2>", line 2, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 893, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 757, in run_experiment
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 693, in train_gridbox
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 195, in run_training_loop
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 754, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1360, in run
    raise six.reraise(*original_exc_info)
  File "/usr/local/lib/python3.6/dist-packages/six.py", line 696, in reraise
    raise value
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1345, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1418, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1176, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 956, in run
    run_metadata_ptr)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1180, in _run
    feed_dict_tensor, options, run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1359, in _do_run
    run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1384, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument: bboxes class ID out of range [0, 3[, got-1
	 [[node BboxRasterizer/RasterizeBbox (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]
	 [[resnet18_nopool_bn_detectnet_v2/block_2a_bn_1/AssignMovingAvg/_4451]]
  (1) Invalid argument: bboxes class ID out of range [0, 3[, got-1
	 [[node BboxRasterizer/RasterizeBbox (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]
0 successful operations.
0 derived errors ignored.

Original stack trace for 'BboxRasterizer/RasterizeBbox':
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 906, in <module>
  File "<decorator-gen-2>", line 2, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 893, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 757, in run_experiment
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 668, in train_gridbox
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 507, in build_training_graph
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 397, in rasterize_tensors
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/model/detectnet_model.py", line 665, in generate_ground_truth_tensors
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/objectives/objective_set.py", line 278, in generate_ground_truth_tensors
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/rasterizers/bbox_rasterizer.py", line 473, in rasterize_labels
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/core/build_wheel.runfiles/ai_infra/moduluspy/modulus/processors/processors.py", line 235, in __call__
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/core/build_wheel.runfiles/ai_infra/moduluspy/modulus/processors/bbox_rasterizer.py", line 234, in call
  File "<string>", line 161, in rasterize_bbox
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py", line 513, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
    attrs, op_def, compute_device)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()
2022-06-11 09:43:06,561 [INFO] root: Saving trained model.

2022-06-11 09:43:06,578 [INFO] root: Saving trained model.
2022-06-11 09:43:06,648 [INFO] root: Saving trained model.
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1365, in _do_call
    return fn(*args)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1350, in _run_fn
    target_list, run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1443, in _call_tf_sessionrun
    run_metadata)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument: bboxes class ID out of range [0, 3[, got-1
	 [[{{node BboxRasterizer/RasterizeBbox}}]]
	 [[resnet18_nopool_bn_detectnet_v2/block_2a_bn_1/AssignMovingAvg/_4451]]
  (1) Invalid argument: bboxes class ID out of range [0, 3[, got-1
	 [[{{node BboxRasterizer/RasterizeBbox}}]]
0 successful operations.
0 derived errors ignored.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 917, in <module>
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 906, in <module>
  File "<decorator-gen-2>", line 2, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 893, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 757, in run_experiment
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 693, in train_gridbox
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 195, in run_training_loop
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 754, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1360, in run
    raise six.reraise(*original_exc_info)
  File "/usr/local/lib/python3.6/dist-packages/six.py", line 696, in reraise
    raise value
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1345, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1418, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1176, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 956, in run
    run_metadata_ptr)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1180, in _run
    feed_dict_tensor, options, run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1359, in _do_run
    run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1384, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument: bboxes class ID out of range [0, 3[, got-1
	 [[node BboxRasterizer/RasterizeBbox (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]
	 [[resnet18_nopool_bn_detectnet_v2/block_2a_bn_1/AssignMovingAvg/_4451]]
  (1) Invalid argument: bboxes class ID out of range [0, 3[, got-1
	 [[node BboxRasterizer/RasterizeBbox (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]
0 successful operations.
0 derived errors ignored.

Original stack trace for 'BboxRasterizer/RasterizeBbox':
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 906, in <module>
  File "<decorator-gen-2>", line 2, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 893, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 757, in run_experiment
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 668, in train_gridbox
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 507, in build_training_graph
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 397, in rasterize_tensors
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/model/detectnet_model.py", line 665, in generate_ground_truth_tensors
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/objectives/objective_set.py", line 278, in generate_ground_truth_tensors
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/rasterizers/bbox_rasterizer.py", line 473, in rasterize_labels
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/core/build_wheel.runfiles/ai_infra/moduluspy/modulus/processors/processors.py", line 235, in __call__
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/core/build_wheel.runfiles/ai_infra/moduluspy/modulus/processors/bbox_rasterizer.py", line 234, in call
  File "<string>", line 161, in rasterize_bbox
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py", line 513, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
    attrs, op_def, compute_device)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()

2022-06-11 09:43:06,655 [INFO] root: Saving trained model.
INFO:tensorflow:Saving checkpoints for step-0.
2022-06-11 09:43:06,656 [INFO] tensorflow: Saving checkpoints for step-0.
2022-06-11 09:43:06,738 [INFO] root: Saving trained model.
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1365, in _do_call
    return fn(*args)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1350, in _run_fn
    target_list, run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1443, in _call_tf_sessionrun
    run_metadata)
tensorflow.python.framework.errors_impl.UnknownError: Horovod has been shut down. This was caused by an exception on one of the ranks or an attempt to allreduce, allgather or broadcast a tensor after one of the ranks finished execution. If the shutdown was caused by an exception, you should see the exception in the log before the first shutdown message.
	 [[{{node DistributedAdamOptimizer_Allreduce/cond_1/HorovodAllreduce_gradients_AddN_58_0}}]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 917, in <module>
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 906, in <module>
  File "<decorator-gen-2>", line 2, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 893, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 757, in run_experiment
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 693, in train_gridbox
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 195, in run_training_loop
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 754, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1360, in run
    raise six.reraise(*original_exc_info)
  File "/usr/local/lib/python3.6/dist-packages/six.py", line 696, in reraise
    raise value
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1345, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1418, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1176, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 956, in run
    run_metadata_ptr)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1180, in _run
    feed_dict_tensor, options, run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1359, in _do_run
    run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1384, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.UnknownError: Horovod has been shut down. This was caused by an exception on one of the ranks or an attempt to allreduce, allgather or broadcast a tensor after one of the ranks finished execution. If the shutdown was caused by an exception, you should see the exception in the log before the first shutdown message.
	 [[node DistributedAdamOptimizer_Allreduce/cond_1/HorovodAllreduce_gradients_AddN_58_0 (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]

Original stack trace for 'DistributedAdamOptimizer_Allreduce/cond_1/HorovodAllreduce_gradients_AddN_58_0':
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 906, in <module>
  File "<decorator-gen-2>", line 2, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 893, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 757, in run_experiment
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 668, in train_gridbox
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 523, in build_training_graph
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/model/detectnet_model.py", line 602, in build_training_graph
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/train_op_generator.py", line 59, in get_train_op
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/train_op_generator.py", line 74, in _get_train_op_without_cost_scaling
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/optimizer.py", line 419, in minimize
    grad_loss=grad_loss)
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 501, in compute_gradients
    avg_grads = self._allreduce_grads(grads, vars)
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 422, in allreduce_grads
    for grad in grads]
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 422, in <listcomp>
    for grad in grads]
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 258, in _allreduce_cond
    allreduce_fn, id_fn)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py", line 513, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/control_flow_ops.py", line 1224, in cond
    orig_res_t, res_t = context_t.BuildCondBranch(true_fn)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/control_flow_ops.py", line 1061, in BuildCondBranch
    original_result = fn()
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 248, in allreduce_fn
    return allreduce(tensor, *args, process_set=process_set, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 128, in allreduce
    name=name, process_set=process_set)
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/mpi_ops.py", line 131, in _allreduce
    process_set_id=process_set.process_set_id)
  File "<string>", line 108, in horovod_allreduce
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py", line 513, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
    attrs, op_def, compute_device)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1365, in _do_call
    return fn(*args)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1350, in _run_fn
    target_list, run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1443, in _call_tf_sessionrun
    run_metadata)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument: bboxes class ID out of range [0, 3[, got-1
	 [[{{node BboxRasterizer/RasterizeBbox}}]]
	 [[resnet18_nopool_bn_detectnet_v2/block_2a_bn_1/AssignMovingAvg/_4451]]
  (1) Invalid argument: bboxes class ID out of range [0, 3[, got-1
	 [[{{node BboxRasterizer/RasterizeBbox}}]]
0 successful operations.
0 derived errors ignored.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 917, in <module>
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 906, in <module>
  File "<decorator-gen-2>", line 2, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 893, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 757, in run_experiment
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 693, in train_gridbox
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 195, in run_training_loop
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 754, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1360, in run
    raise six.reraise(*original_exc_info)
  File "/usr/local/lib/python3.6/dist-packages/six.py", line 696, in reraise
    raise value
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1345, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1418, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1176, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 956, in run
    run_metadata_ptr)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1180, in _run
    feed_dict_tensor, options, run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1359, in _do_run
    run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1384, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument: bboxes class ID out of range [0, 3[, got-1
	 [[node BboxRasterizer/RasterizeBbox (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]
	 [[resnet18_nopool_bn_detectnet_v2/block_2a_bn_1/AssignMovingAvg/_4451]]
  (1) Invalid argument: bboxes class ID out of range [0, 3[, got-1
	 [[node BboxRasterizer/RasterizeBbox (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]
0 successful operations.
0 derived errors ignored.

Original stack trace for 'BboxRasterizer/RasterizeBbox':
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 906, in <module>
  File "<decorator-gen-2>", line 2, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 893, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 757, in run_experiment
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 668, in train_gridbox
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 507, in build_training_graph
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 397, in rasterize_tensors
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/model/detectnet_model.py", line 665, in generate_ground_truth_tensors
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/objectives/objective_set.py", line 278, in generate_ground_truth_tensors
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/rasterizers/bbox_rasterizer.py", line 473, in rasterize_labels
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/core/build_wheel.runfiles/ai_infra/moduluspy/modulus/processors/processors.py", line 235, in __call__
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/core/build_wheel.runfiles/ai_infra/moduluspy/modulus/processors/bbox_rasterizer.py", line 234, in call
  File "<string>", line 161, in rasterize_bbox
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py", line 513, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
    attrs, op_def, compute_device)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1365, in _do_call
    return fn(*args)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1350, in _run_fn
    target_list, run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1443, in _call_tf_sessionrun
    run_metadata)
tensorflow.python.framework.errors_impl.UnknownError: Horovod has been shut down. This was caused by an exception on one of the ranks or an attempt to allreduce, allgather or broadcast a tensor after one of the ranks finished execution. If the shutdown was caused by an exception, you should see the exception in the log before the first shutdown message.
	 [[{{node DistributedAdamOptimizer_Allreduce/cond_44/HorovodAllreduce_gradients_AddN_32_0}}]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 917, in <module>
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 906, in <module>
  File "<decorator-gen-2>", line 2, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 893, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 757, in run_experiment
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 693, in train_gridbox
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 195, in run_training_loop
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 754, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1360, in run
    raise six.reraise(*original_exc_info)
  File "/usr/local/lib/python3.6/dist-packages/six.py", line 696, in reraise
    raise value
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1345, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1418, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1176, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 956, in run
    run_metadata_ptr)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1180, in _run
    feed_dict_tensor, options, run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1359, in _do_run
    run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1384, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.UnknownError: Horovod has been shut down. This was caused by an exception on one of the ranks or an attempt to allreduce, allgather or broadcast a tensor after one of the ranks finished execution. If the shutdown was caused by an exception, you should see the exception in the log before the first shutdown message.
	 [[node DistributedAdamOptimizer_Allreduce/cond_44/HorovodAllreduce_gradients_AddN_32_0 (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]

Original stack trace for 'DistributedAdamOptimizer_Allreduce/cond_44/HorovodAllreduce_gradients_AddN_32_0':
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 906, in <module>
  File "<decorator-gen-2>", line 2, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 893, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 757, in run_experiment
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 668, in train_gridbox
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 523, in build_training_graph
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/model/detectnet_model.py", line 602, in build_training_graph
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/train_op_generator.py", line 59, in get_train_op
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/train_op_generator.py", line 74, in _get_train_op_without_cost_scaling
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/optimizer.py", line 419, in minimize
    grad_loss=grad_loss)
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 501, in compute_gradients
    avg_grads = self._allreduce_grads(grads, vars)
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 422, in allreduce_grads
    for grad in grads]
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 422, in <listcomp>
    for grad in grads]
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 258, in _allreduce_cond
    allreduce_fn, id_fn)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py", line 513, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/control_flow_ops.py", line 1224, in cond
    orig_res_t, res_t = context_t.BuildCondBranch(true_fn)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/control_flow_ops.py", line 1061, in BuildCondBranch
    original_result = fn()
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 248, in allreduce_fn
    return allreduce(tensor, *args, process_set=process_set, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 128, in allreduce
    name=name, process_set=process_set)
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/mpi_ops.py", line 131, in _allreduce
    process_set_id=process_set.process_set_id)
  File "<string>", line 108, in horovod_allreduce
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py", line 513, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
    attrs, op_def, compute_device)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1365, in _do_call
    return fn(*args)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1350, in _run_fn
    target_list, run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1443, in _call_tf_sessionrun
    run_metadata)
tensorflow.python.framework.errors_impl.UnknownError: Horovod has been shut down. This was caused by an exception on one of the ranks or an attempt to allreduce, allgather or broadcast a tensor after one of the ranks finished execution. If the shutdown was caused by an exception, you should see the exception in the log before the first shutdown message.
	 [[{{node DistributedAdamOptimizer_Allreduce/cond_44/HorovodAllreduce_gradients_AddN_32_0}}]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 917, in <module>
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 906, in <module>
  File "<decorator-gen-2>", line 2, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 893, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 757, in run_experiment
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 693, in train_gridbox
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 195, in run_training_loop
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 754, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1360, in run
    raise six.reraise(*original_exc_info)
  File "/usr/local/lib/python3.6/dist-packages/six.py", line 696, in reraise
    raise value
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1345, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1418, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1176, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 956, in run
    run_metadata_ptr)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1180, in _run
    feed_dict_tensor, options, run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1359, in _do_run
    run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1384, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.UnknownError: Horovod has been shut down. This was caused by an exception on one of the ranks or an attempt to allreduce, allgather or broadcast a tensor after one of the ranks finished execution. If the shutdown was caused by an exception, you should see the exception in the log before the first shutdown message.
	 [[node DistributedAdamOptimizer_Allreduce/cond_44/HorovodAllreduce_gradients_AddN_32_0 (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]

Original stack trace for 'DistributedAdamOptimizer_Allreduce/cond_44/HorovodAllreduce_gradients_AddN_32_0':
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 906, in <module>
  File "<decorator-gen-2>", line 2, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 893, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 757, in run_experiment
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 668, in train_gridbox
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 523, in build_training_graph
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/model/detectnet_model.py", line 602, in build_training_graph
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/train_op_generator.py", line 59, in get_train_op
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/train_op_generator.py", line 74, in _get_train_op_without_cost_scaling
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/optimizer.py", line 419, in minimize
    grad_loss=grad_loss)
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 501, in compute_gradients
    avg_grads = self._allreduce_grads(grads, vars)
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 422, in allreduce_grads
    for grad in grads]
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 422, in <listcomp>
    for grad in grads]
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 258, in _allreduce_cond
    allreduce_fn, id_fn)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py", line 513, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/control_flow_ops.py", line 1224, in cond
    orig_res_t, res_t = context_t.BuildCondBranch(true_fn)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/control_flow_ops.py", line 1061, in BuildCondBranch
    original_result = fn()
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 248, in allreduce_fn
    return allreduce(tensor, *args, process_set=process_set, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 128, in allreduce
    name=name, process_set=process_set)
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/mpi_ops.py", line 131, in _allreduce
    process_set_id=process_set.process_set_id)
  File "<string>", line 108, in horovod_allreduce
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py", line 513, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
    attrs, op_def, compute_device)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1365, in _do_call
    return fn(*args)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1350, in _run_fn
    target_list, run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1443, in _call_tf_sessionrun
    run_metadata)
tensorflow.python.framework.errors_impl.UnknownError: Horovod has been shut down. This was caused by an exception on one of the ranks or an attempt to allreduce, allgather or broadcast a tensor after one of the ranks finished execution. If the shutdown was caused by an exception, you should see the exception in the log before the first shutdown message.
	 [[{{node DistributedAdamOptimizer_Allreduce/cond_86/HorovodAllreduce_gradients_AddN_8_0}}]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 917, in <module>
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 906, in <module>
  File "<decorator-gen-2>", line 2, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 893, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 757, in run_experiment
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 693, in train_gridbox
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 195, in run_training_loop
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 754, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1360, in run
    raise six.reraise(*original_exc_info)
  File "/usr/local/lib/python3.6/dist-packages/six.py", line 696, in reraise
    raise value
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1345, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1418, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1176, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 956, in run
    run_metadata_ptr)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1180, in _run
    feed_dict_tensor, options, run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1359, in _do_run
    run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1384, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.UnknownError: Horovod has been shut down. This was caused by an exception on one of the ranks or an attempt to allreduce, allgather or broadcast a tensor after one of the ranks finished execution. If the shutdown was caused by an exception, you should see the exception in the log before the first shutdown message.
	 [[node DistributedAdamOptimizer_Allreduce/cond_86/HorovodAllreduce_gradients_AddN_8_0 (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]

Original stack trace for 'DistributedAdamOptimizer_Allreduce/cond_86/HorovodAllreduce_gradients_AddN_8_0':
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 906, in <module>
  File "<decorator-gen-2>", line 2, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 893, in main
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 757, in run_experiment
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 668, in train_gridbox
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 523, in build_training_graph
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/model/detectnet_model.py", line 602, in build_training_graph
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/train_op_generator.py", line 59, in get_train_op
  File "/root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/train_op_generator.py", line 74, in _get_train_op_without_cost_scaling
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/optimizer.py", line 419, in minimize
    grad_loss=grad_loss)
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 501, in compute_gradients
    avg_grads = self._allreduce_grads(grads, vars)
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 422, in allreduce_grads
    for grad in grads]
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 422, in <listcomp>
    for grad in grads]
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 258, in _allreduce_cond
    allreduce_fn, id_fn)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py", line 513, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/control_flow_ops.py", line 1224, in cond
    orig_res_t, res_t = context_t.BuildCondBranch(true_fn)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/control_flow_ops.py", line 1061, in BuildCondBranch
    original_result = fn()
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 248, in allreduce_fn
    return allreduce(tensor, *args, process_set=process_set, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/__init__.py", line 128, in allreduce
    name=name, process_set=process_set)
  File "/usr/local/lib/python3.6/dist-packages/horovod/tensorflow/mpi_ops.py", line 131, in _allreduce
    process_set_id=process_set.process_set_id)
  File "<string>", line 108, in horovod_allreduce
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py", line 513, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
    attrs, op_def, compute_device)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()

--------------------------------------------------------------------------
Primary job  terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun.real detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:

  Process name: [[9212,1],1]
  Exit code:    1
--------------------------------------------------------------------------
2022-06-11 09:43:10,361 [INFO] tlt.components.docker_handler.docker_handler: Stopping container.

This error is often due to your tfrecords. Can you share the full command and log how did you generate tfrecords?

Thanks for the reply.
below, I executed to run to generate tfrecord…

■command

!sudo rm -fr $DATA_DOWNLOAD_DIR/tfrecords/kitti_trainval/*
!tao detectnet_v2 dataset_convert \
                  -d $SPECS_DIR/detectnet_v2_tfrecords_kitti_trainval.txt \
                  -o $DATA_DOWNLOAD_DIR/tfrecords/kitti_trainval/kitti_trainval

■success log

Using TensorFlow backend.
2022-06-11 10:06:12,106 [INFO] iva.detectnet_v2.dataio.build_converter: Instantiating a kitti converter
2022-06-11 10:06:12,107 [INFO] iva.detectnet_v2.dataio.kitti_converter_lib: Num images in
Train: 129	Val: 21
2022-06-11 10:06:12,107 [INFO] iva.detectnet_v2.dataio.kitti_converter_lib: Validation data in partition 0. Hence, while choosing the validationset during training choose validation_fold 0.
2022-06-11 10:06:12,108 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 0, shard 0
WARNING:tensorflow:From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/dataio/dataset_converter_lib.py:169: The name tf.python_io.TFRecordWriter is deprecated. Please use tf.io.TFRecordWriter instead.

2022-06-11 10:06:12,108 [WARNING] tensorflow: From /root/.cache/bazel/_bazel_root/b770f990bb7b9e2db5771981fb3a38b4/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/dataio/dataset_converter_lib.py:169: The name tf.python_io.TFRecordWriter is deprecated. Please use tf.io.TFRecordWriter instead.

/usr/local/lib/python3.6/dist-packages/iva/detectnet_v2/dataio/kitti_converter_lib.py:315: VisibleDeprecationWarning: Reading unicode strings without specifying the encoding argument is deprecated. Set the encoding, use None for the system default.
2022-06-11 10:06:12,116 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 0, shard 1
2022-06-11 10:06:12,118 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 0, shard 2
2022-06-11 10:06:12,122 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 0, shard 3
2022-06-11 10:06:12,125 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 0, shard 4
2022-06-11 10:06:12,127 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 0, shard 5
2022-06-11 10:06:12,130 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 0, shard 6
2022-06-11 10:06:12,132 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 0, shard 7
2022-06-11 10:06:12,135 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 0, shard 8
2022-06-11 10:06:12,138 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 0, shard 9
2022-06-11 10:06:12,142 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: 
Wrote the following numbers of objects:
b'fish': 81
b'jellyfish': 44
b'penguin': 22
b'shark': 12
b'starfish': 3

2022-06-11 10:06:12,142 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 1, shard 0
2022-06-11 10:06:12,159 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 1, shard 1
2022-06-11 10:06:12,173 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 1, shard 2
2022-06-11 10:06:12,188 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 1, shard 3
2022-06-11 10:06:12,205 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 1, shard 4
2022-06-11 10:06:12,220 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 1, shard 5
2022-06-11 10:06:12,235 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 1, shard 6
2022-06-11 10:06:12,251 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 1, shard 7
2022-06-11 10:06:12,266 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 1, shard 8
2022-06-11 10:06:12,282 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 1, shard 9
2022-06-11 10:06:12,308 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: 
Wrote the following numbers of objects:
b'jellyfish': 128
b'fish': 572
b'shark': 86
b'penguin': 202
b'stingray': 3
b'puffin': 44
b'starfish': 11

2022-06-11 10:06:12,308 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Cumulative object statistics
2022-06-11 10:06:12,308 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: 
Wrote the following numbers of objects:
b'fish': 653
b'jellyfish': 172
b'penguin': 224
b'shark': 98
b'starfish': 14
b'stingray': 3
b'puffin': 44

2022-06-11 10:06:12,308 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Class map. 
Label in GT: Label in tfrecords file 
b'fish': b'fish'
b'jellyfish': b'jellyfish'
b'penguin': b'penguin'
b'shark': b'shark'
b'starfish': b'starfish'
b'stingray': b'stingray'
b'puffin': b'puffin'
For the dataset_config in the experiment_spec, please use labels in the tfrecords file, while writing the classmap.

2022-06-11 10:06:12,308 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Tfrecords generation complete.
2022-06-11 10:06:13,539 [INFO] tlt.components.docker_handler.docker_handler: Stopping container.

now, training is success.

I excluded all classes except fish from “detectnet_v2_train_resnet18_kitti.txt”.

Then training succeeded.
The excluded part is as follows.

■dataset_config

target_class_mapping {
    key: "jellyfish"
    value: "jellyfish"
  }
  target_class_mapping {
    key: "penguin"
    value: "penguin"
  }
  target_class_mapping {
    key: "puffin"
    value: "puffin"
  }
  target_class_mapping {
    key: "shark"
    value: "shark"
  }

■postprocessing_config

target_class_config {
    key: "jellyfish"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00499999988824
        dbscan_eps: 0.15000000596
        dbscan_min_samples: 0.0500000007451
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "penguin"
    value {
      clustering_config {
        clustering_algorithm: DBSCAN
        dbscan_confidence_threshold: 0.9
        coverage_threshold: 0.00749999983236
        dbscan_eps: 0.230000004172
        dbscan_min_samples: 0.0500000007451
        minimum_bounding_box_height: 20
      }
    }
  }

■evaluation_config

evaluation_box_config {
    key: "jellyfish"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "penguin"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }

■cost_function_config

target_classes {
    name: "jellyfish"
    class_weight: 8.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 1.0
    }
  }
  target_classes {
    name: "penguin"
    class_weight: 4.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }

■bbox_rasterizer_config

target_class_config {
    key: "jellyfish"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "penguin"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }

According to the tfrecord generation log, the label files contain 7 classes. It should be successful to train these 7 classes. Please double check the training spec.