Please provide the following information when requesting support.
• Hardware (T4/V100/Xavier/Nano/etc)
Ubuntu 20.04.3 LTS, Intel x64, RTX3090.
• Network Type (Detectnet_v2/Faster_rcnn/Yolo_v4/LPRnet/Mask_rcnn/Classification/etc)
• TLT Version (Please run “tlt info --verbose” and share “docker_tag” here)
tao info
:
Configuration of the TAO Toolkit Instance
dockers: [‘nvidia/tao/tao-toolkit-tf’, ‘nvidia/tao/tao-toolkit-pyt’, ‘nvidia/tao/tao-toolkit-lm’]
format_version: 2.0
toolkit_version: 3.21.11
published_date: 11/08/2021
• Training spec file(If have, please share here)
• How to reproduce the issue ? (This is for errors. Please share the command line and the detailed log here.)
I’ve installed the TAO
, and followed the embedded notebook to trying to re-train an object detection model from my own dataset for practice purpose, I just simply labeled 58 jpg files with only 1 class of closedsign
(a paper stick on the elevator door).
this is the kitti to tfrecords
conversion spec:
TFrecords conversion spec file for kitti training
kitti_config {
root_directory_path: “/workspace/tao-experiments/data/training”
image_dir_name: “image_2”
label_dir_name: “label_2”
image_extension: “.jpg”
partition_mode: “random”
num_partitions: 2
val_split: 10
num_shards: 10
}
after conversion, the folder structure is like:
one sample of the label is like:
closedsign 0.0 0 -1 248.12 87.19 537.23 657.49 -1 -1 -1 -1 -1 -1 -1
I modified a little bit from the default trainning script of detectnet_v2_train_resnet18_kitti.txt
to (basically replaced the keyword car
to closedsign
):
random_seed: 42
dataset_config {
data_sources {
tfrecords_path: “/workspace/tao-experiments/data/tfrecords/kitti_trainval/*”
image_directory_path: “/workspace/tao-experiments/data/training”
}
image_extension: “jpg”
target_class_mapping {
key: “closedsign”
value: “closedsign”
}
target_class_mapping {
key: “cyclist”
value: “cyclist”
}
target_class_mapping {
key: “pedestrian”
value: “pedestrian”
}
target_class_mapping {
key: “person_sitting”
value: “pedestrian”
}
target_class_mapping {
key: “van”
value: “car”
}
validation_fold: 0
}
augmentation_config {
preprocessing {
output_image_width: 1248
output_image_height: 384
min_bbox_width: 1.0
min_bbox_height: 1.0
output_image_channel: 3
}
spatial_augmentation {
hflip_probability: 0.5
zoom_min: 1.0
zoom_max: 1.0
translate_max_x: 8.0
translate_max_y: 8.0
}
color_augmentation {
hue_rotation_max: 25.0
saturation_shift_max: 0.20000000298
contrast_scale_max: 0.10000000149
contrast_center: 0.5
}
}
postprocessing_config {
target_class_config {
key: “closedsign”
value {
clustering_config {
clustering_algorithm: DBSCAN
dbscan_confidence_threshold: 0.9
coverage_threshold: 0.00499999988824
dbscan_eps: 0.20000000298
dbscan_min_samples: 0.0500000007451
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “cyclist”
value {
clustering_config {
clustering_algorithm: DBSCAN
dbscan_confidence_threshold: 0.9
coverage_threshold: 0.00499999988824
dbscan_eps: 0.15000000596
dbscan_min_samples: 0.0500000007451
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “pedestrian”
value {
clustering_config {
clustering_algorithm: DBSCAN
dbscan_confidence_threshold: 0.9
coverage_threshold: 0.00749999983236
dbscan_eps: 0.230000004172
dbscan_min_samples: 0.0500000007451
minimum_bounding_box_height: 20
}
}
}
}
model_config {
pretrained_model_file: “/workspace/tao-experiments/detectnet_v2/pretrained_resnet18/pretrained_detectnet_v2_vresnet18/resnet18.hdf5”
num_layers: 18
use_batch_norm: true
objective_set {
bbox {
scale: 35.0
offset: 0.5
}
cov {
}
}
arch: “resnet”
}
evaluation_config {
validation_period_during_training: 10
first_validation_epoch: 30
minimum_detection_ground_truth_overlap {
key: “closedsign”
value: 0.699999988079
}
minimum_detection_ground_truth_overlap {
key: “cyclist”
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: “pedestrian”
value: 0.5
}
evaluation_box_config {
key: “closedsign”
value {
minimum_height: 20
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
evaluation_box_config {
key: “cyclist”
value {
minimum_height: 20
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
evaluation_box_config {
key: “pedestrian”
value {
minimum_height: 20
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
average_precision_mode: INTEGRATE
}
cost_function_config {
target_classes {
name: “closedsign”
class_weight: 1.0
coverage_foreground_weight: 0.0500000007451
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: “cyclist”
class_weight: 8.0
coverage_foreground_weight: 0.0500000007451
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 1.0
}
}
target_classes {
name: “pedestrian”
class_weight: 4.0
coverage_foreground_weight: 0.0500000007451
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 10.0
}
}
enable_autoweighting: true
max_objective_weight: 0.999899983406
min_objective_weight: 9.99999974738e-05
}
training_config {
batch_size_per_gpu: 4
num_epochs: 120
learning_rate {
soft_start_annealing_schedule {
min_learning_rate: 5e-06
max_learning_rate: 5e-04
soft_start: 0.10000000149
annealing: 0.699999988079
}
}
regularizer {
type: L1
weight: 3.00000002618e-09
}
optimizer {
adam {
epsilon: 9.99999993923e-09
beta1: 0.899999976158
beta2: 0.999000012875
}
}
cost_scaling {
initial_exponent: 20.0
increment: 0.005
decrement: 1.0
}
checkpoint_interval: 10
}
bbox_rasterizer_config {
target_class_config {
key: “closedsign”
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.40000000596
cov_radius_y: 0.40000000596
bbox_min_radius: 1.0
}
}
target_class_config {
key: “cyclist”
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 1.0
cov_radius_y: 1.0
bbox_min_radius: 1.0
}
}
target_class_config {
key: “pedestrian”
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 1.0
cov_radius_y: 1.0
bbox_min_radius: 1.0
}
}
deadzone_radius: 0.400000154972
}
then, running the training script:
!tao detectnet_v2 train -e $SPECS_DIR/detectnet_v2_train_resnet18_kitti.txt \
-r $USER_EXPERIMENT_DIR/experiment_dir_unpruned \
-k $KEY \
-n resnet18_detector \
--gpus $NUM_GPUS
the logging with errors is like below:
2022-01-03 15:21:51,211 [INFO] root: Registry: [‘nvcr.io’]
2022-01-03 15:21:51,240 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit-tf:v3.21.11-tf1.15.4-py3
2022-01-03 15:21:51,260 [WARNING] tlt.components.docker_handler.docker_handler:
Docker will run the commands as root. If you would like to retain your
local host permissions, please add the “user”:“UID:GID” in the
DockerOptions portion of the “/home/shao/.tao_mounts.json” file. You can obtain your
users UID and GID by using the “id -u” and “id -g” commands on the
terminal.
Using TensorFlow backend.
WARNING:tensorflow:Deprecation warnings have been disabled. Set TF_ENABLE_DEPRECATION_WARNINGS=1 to re-enable them.
Using TensorFlow backend.
WARNING:tensorflow:From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/cost_function/cost_auto_weight_hook.py:43: The name tf.train.SessionRunHook is deprecated. Please use tf.estimator.SessionRunHook instead.2022-01-03 07:21:55,161 [WARNING] tensorflow: From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/cost_function/cost_auto_weight_hook.py:43: The name tf.train.SessionRunHook is deprecated. Please use tf.estimator.SessionRunHook instead.
WARNING:tensorflow:From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/tfhooks/checkpoint_saver_hook.py:25: The name tf.train.CheckpointSaverHook is deprecated. Please use tf.estimator.CheckpointSaverHook instead.
2022-01-03 07:21:55,261 [WARNING] tensorflow: From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/tfhooks/checkpoint_saver_hook.py:25: The name tf.train.CheckpointSaverHook is deprecated. Please use tf.estimator.CheckpointSaverHook instead.
WARNING:tensorflow:From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py:69: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.
2022-01-03 07:21:55,262 [WARNING] tensorflow: From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py:69: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.
WARNING:tensorflow:From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py:69: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.
2022-01-03 07:21:55,262 [WARNING] tensorflow: From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py:69: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/horovod/tensorflow/init.py:117: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.
2022-01-03 07:21:55,268 [WARNING] tensorflow: From /usr/local/lib/python3.6/dist-packages/horovod/tensorflow/init.py:117: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/horovod/tensorflow/init.py:143: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.
2022-01-03 07:21:55,268 [WARNING] tensorflow: From /usr/local/lib/python3.6/dist-packages/horovod/tensorflow/init.py:143: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.
2022-01-03 07:21:55,720 [INFO] iva.common.logging.logging: Log file already exists at /workspace/tao-experiments/detectnet_v2/experiment_dir_unpruned/status.json
2022-01-03 07:21:55,721 [INFO] main: Loading experiment spec at /workspace/tao-experiments/detectnet_v2/specs/detectnet_v2_train_resnet18_kitti.txt.
2022-01-03 07:21:55,722 [INFO] iva.detectnet_v2.spec_handler.spec_loader: Merging specification from /workspace/tao-experiments/detectnet_v2/specs/detectnet_v2_train_resnet18_kitti.txt
2022-01-03 07:21:55,804 [INFO] main: Cannot iterate over exactly 53 samples with a batch size of 4; each epoch will therefore take one extra step.
WARNING:tensorflow:From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/cost_function/cost_auto_weight_hook.py:107: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.2022-01-03 07:21:55,806 [WARNING] tensorflow: From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/cost_function/cost_auto_weight_hook.py:107: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.
WARNING:tensorflow:From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/cost_function/cost_auto_weight_hook.py:110: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.
2022-01-03 07:21:55,806 [WARNING] tensorflow: From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/cost_function/cost_auto_weight_hook.py:110: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.
WARNING:tensorflow:From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/cost_function/cost_auto_weight_hook.py:113: The name tf.assign is deprecated. Please use tf.compat.v1.assign instead.
2022-01-03 07:21:55,808 [WARNING] tensorflow: From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/cost_function/cost_auto_weight_hook.py:113: The name tf.assign is deprecated. Please use tf.compat.v1.assign instead.
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.
2022-01-03 07:21:55,875 [WARNING] tensorflow: From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.
2022-01-03 07:21:55,876 [WARNING] tensorflow: From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:1834: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.
2022-01-03 07:21:55,895 [WARNING] tensorflow: From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:1834: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.
WARNING:tensorflow:From /opt/nvidia/third_party/keras/tensorflow_backend.py:187: The name tf.nn.avg_pool is deprecated. Please use tf.nn.avg_pool2d instead.
2022-01-03 07:21:56,895 [WARNING] tensorflow: From /opt/nvidia/third_party/keras/tensorflow_backend.py:187: The name tf.nn.avg_pool is deprecated. Please use tf.nn.avg_pool2d instead.
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.
2022-01-03 07:21:57,088 [WARNING] tensorflow: From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:199: The name tf.is_variable_initialized is deprecated. Please use tf.compat.v1.is_variable_initialized instead.
2022-01-03 07:21:57,088 [WARNING] tensorflow: From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:199: The name tf.is_variable_initialized is deprecated. Please use tf.compat.v1.is_variable_initialized instead.
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:206: The name tf.variables_initializer is deprecated. Please use tf.compat.v1.variables_initializer instead.
2022-01-03 07:21:57,390 [WARNING] tensorflow: From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:206: The name tf.variables_initializer is deprecated. Please use tf.compat.v1.variables_initializer instead.
2022-01-03 07:22:03,742 [INFO] iva.detectnet_v2.objectives.bbox_objective: Default L1 loss function will be used.
Layer (type) Output Shape Param # Connected to
input_1 (InputLayer) (None, 3, 384, 1248) 0
conv1 (Conv2D) (None, 64, 192, 624) 9472 input_1[0][0]
bn_conv1 (BatchNormalization) (None, 64, 192, 624) 256 conv1[0][0]
activation_1 (Activation) (None, 64, 192, 624) 0 bn_conv1[0][0]
block_1a_conv_1 (Conv2D) (None, 64, 96, 312) 36928 activation_1[0][0]
block_1a_bn_1 (BatchNormalizati (None, 64, 96, 312) 256 block_1a_conv_1[0][0]
block_1a_relu_1 (Activation) (None, 64, 96, 312) 0 block_1a_bn_1[0][0]
block_1a_conv_2 (Conv2D) (None, 64, 96, 312) 36928 block_1a_relu_1[0][0]
block_1a_conv_shortcut (Conv2D) (None, 64, 96, 312) 4160 activation_1[0][0]
block_1a_bn_2 (BatchNormalizati (None, 64, 96, 312) 256 block_1a_conv_2[0][0]
block_1a_bn_shortcut (BatchNorm (None, 64, 96, 312) 256 block_1a_conv_shortcut[0][0]
add_1 (Add) (None, 64, 96, 312) 0 block_1a_bn_2[0][0]
block_1a_bn_shortcut[0][0]
block_1a_relu (Activation) (None, 64, 96, 312) 0 add_1[0][0]
block_1b_conv_1 (Conv2D) (None, 64, 96, 312) 36928 block_1a_relu[0][0]
block_1b_bn_1 (BatchNormalizati (None, 64, 96, 312) 256 block_1b_conv_1[0][0]
block_1b_relu_1 (Activation) (None, 64, 96, 312) 0 block_1b_bn_1[0][0]
block_1b_conv_2 (Conv2D) (None, 64, 96, 312) 36928 block_1b_relu_1[0][0]
block_1b_bn_2 (BatchNormalizati (None, 64, 96, 312) 256 block_1b_conv_2[0][0]
add_2 (Add) (None, 64, 96, 312) 0 block_1b_bn_2[0][0]
block_1a_relu[0][0]
block_1b_relu (Activation) (None, 64, 96, 312) 0 add_2[0][0]
block_2a_conv_1 (Conv2D) (None, 128, 48, 156) 73856 block_1b_relu[0][0]
block_2a_bn_1 (BatchNormalizati (None, 128, 48, 156) 512 block_2a_conv_1[0][0]
block_2a_relu_1 (Activation) (None, 128, 48, 156) 0 block_2a_bn_1[0][0]
block_2a_conv_2 (Conv2D) (None, 128, 48, 156) 147584 block_2a_relu_1[0][0]
block_2a_conv_shortcut (Conv2D) (None, 128, 48, 156) 8320 block_1b_relu[0][0]
block_2a_bn_2 (BatchNormalizati (None, 128, 48, 156) 512 block_2a_conv_2[0][0]
block_2a_bn_shortcut (BatchNorm (None, 128, 48, 156) 512 block_2a_conv_shortcut[0][0]
add_3 (Add) (None, 128, 48, 156) 0 block_2a_bn_2[0][0]
block_2a_bn_shortcut[0][0]
block_2a_relu (Activation) (None, 128, 48, 156) 0 add_3[0][0]
block_2b_conv_1 (Conv2D) (None, 128, 48, 156) 147584 block_2a_relu[0][0]
block_2b_bn_1 (BatchNormalizati (None, 128, 48, 156) 512 block_2b_conv_1[0][0]
block_2b_relu_1 (Activation) (None, 128, 48, 156) 0 block_2b_bn_1[0][0]
block_2b_conv_2 (Conv2D) (None, 128, 48, 156) 147584 block_2b_relu_1[0][0]
block_2b_bn_2 (BatchNormalizati (None, 128, 48, 156) 512 block_2b_conv_2[0][0]
add_4 (Add) (None, 128, 48, 156) 0 block_2b_bn_2[0][0]
block_2a_relu[0][0]
block_2b_relu (Activation) (None, 128, 48, 156) 0 add_4[0][0]
block_3a_conv_1 (Conv2D) (None, 256, 24, 78) 295168 block_2b_relu[0][0]
block_3a_bn_1 (BatchNormalizati (None, 256, 24, 78) 1024 block_3a_conv_1[0][0]
block_3a_relu_1 (Activation) (None, 256, 24, 78) 0 block_3a_bn_1[0][0]
block_3a_conv_2 (Conv2D) (None, 256, 24, 78) 590080 block_3a_relu_1[0][0]
block_3a_conv_shortcut (Conv2D) (None, 256, 24, 78) 33024 block_2b_relu[0][0]
block_3a_bn_2 (BatchNormalizati (None, 256, 24, 78) 1024 block_3a_conv_2[0][0]
block_3a_bn_shortcut (BatchNorm (None, 256, 24, 78) 1024 block_3a_conv_shortcut[0][0]
add_5 (Add) (None, 256, 24, 78) 0 block_3a_bn_2[0][0]
block_3a_bn_shortcut[0][0]
block_3a_relu (Activation) (None, 256, 24, 78) 0 add_5[0][0]
block_3b_conv_1 (Conv2D) (None, 256, 24, 78) 590080 block_3a_relu[0][0]
block_3b_bn_1 (BatchNormalizati (None, 256, 24, 78) 1024 block_3b_conv_1[0][0]
block_3b_relu_1 (Activation) (None, 256, 24, 78) 0 block_3b_bn_1[0][0]
block_3b_conv_2 (Conv2D) (None, 256, 24, 78) 590080 block_3b_relu_1[0][0]
block_3b_bn_2 (BatchNormalizati (None, 256, 24, 78) 1024 block_3b_conv_2[0][0]
add_6 (Add) (None, 256, 24, 78) 0 block_3b_bn_2[0][0]
block_3a_relu[0][0]
block_3b_relu (Activation) (None, 256, 24, 78) 0 add_6[0][0]
block_4a_conv_1 (Conv2D) (None, 512, 24, 78) 1180160 block_3b_relu[0][0]
block_4a_bn_1 (BatchNormalizati (None, 512, 24, 78) 2048 block_4a_conv_1[0][0]
block_4a_relu_1 (Activation) (None, 512, 24, 78) 0 block_4a_bn_1[0][0]
block_4a_conv_2 (Conv2D) (None, 512, 24, 78) 2359808 block_4a_relu_1[0][0]
block_4a_conv_shortcut (Conv2D) (None, 512, 24, 78) 131584 block_3b_relu[0][0]
block_4a_bn_2 (BatchNormalizati (None, 512, 24, 78) 2048 block_4a_conv_2[0][0]
block_4a_bn_shortcut (BatchNorm (None, 512, 24, 78) 2048 block_4a_conv_shortcut[0][0]
add_7 (Add) (None, 512, 24, 78) 0 block_4a_bn_2[0][0]
block_4a_bn_shortcut[0][0]
block_4a_relu (Activation) (None, 512, 24, 78) 0 add_7[0][0]
block_4b_conv_1 (Conv2D) (None, 512, 24, 78) 2359808 block_4a_relu[0][0]
block_4b_bn_1 (BatchNormalizati (None, 512, 24, 78) 2048 block_4b_conv_1[0][0]
block_4b_relu_1 (Activation) (None, 512, 24, 78) 0 block_4b_bn_1[0][0]
block_4b_conv_2 (Conv2D) (None, 512, 24, 78) 2359808 block_4b_relu_1[0][0]
block_4b_bn_2 (BatchNormalizati (None, 512, 24, 78) 2048 block_4b_conv_2[0][0]
add_8 (Add) (None, 512, 24, 78) 0 block_4b_bn_2[0][0]
block_4a_relu[0][0]
block_4b_relu (Activation) (None, 512, 24, 78) 0 add_8[0][0]
output_bbox (Conv2D) (None, 12, 24, 78) 6156 block_4b_relu[0][0]
output_cov (Conv2D) (None, 3, 24, 78) 1539 block_4b_relu[0][0]
Total params: 11,203,023
Trainable params: 11,193,295
Non-trainable params: 9,728
2022-01-03 07:22:03,774 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: Serial augmentation enabled = False
2022-01-03 07:22:03,774 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: Pseudo sharding enabled = False
2022-01-03 07:22:03,774 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: Max Image Dimensions (all sources): (0, 0)
2022-01-03 07:22:03,775 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: number of cpus: 24, io threads: 48, compute threads: 24, buffered batches: 4
2022-01-03 07:22:03,775 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: total dataset size 53, number of sources: 1, batch size per gpu: 4, steps: 14
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.set_random_seed is deprecated. Please use tf.compat.v1.set_random_seed instead.2022-01-03 07:22:03,811 [WARNING] tensorflow: From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.set_random_seed is deprecated. Please use tf.compat.v1.set_random_seed instead.
WARNING:tensorflow:Entity <bound method DriveNetTFRecordsParser.call of <iva.detectnet_v2.dataloader.drivenet_dataloader.DriveNetTFRecordsParser object at 0x7f17301f14a8>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux,
export AUTOGRAPH_VERBOSITY=10
) and attach the full output. Cause: Unable to locate the source code of <bound method DriveNetTFRecordsParser.call of <iva.detectnet_v2.dataloader.drivenet_dataloader.DriveNetTFRecordsParser object at 0x7f17301f14a8>>. Note that functions defined in certain environments, like the interactive Python shell do not expose their source code. If that is the case, you should to define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.do_not_convert. Original error: could not get source code
2022-01-03 07:22:03,846 [WARNING] tensorflow: Entity <bound method DriveNetTFRecordsParser.call of <iva.detectnet_v2.dataloader.drivenet_dataloader.DriveNetTFRecordsParser object at 0x7f17301f14a8>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux,export AUTOGRAPH_VERBOSITY=10
) and attach the full output. Cause: Unable to locate the source code of <bound method DriveNetTFRecordsParser.call of <iva.detectnet_v2.dataloader.drivenet_dataloader.DriveNetTFRecordsParser object at 0x7f17301f14a8>>. Note that functions defined in certain environments, like the interactive Python shell do not expose their source code. If that is the case, you should to define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.do_not_convert. Original error: could not get source code
2022-01-03 07:22:03,864 [INFO] iva.detectnet_v2.dataloader.default_dataloader: Bounding box coordinates were detected in the input specification! Bboxes will be automatically converted to polygon coordinates.2022-01-03 07:22:04,101 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: shuffle: True - shard 0 of 1
2022-01-03 07:22:04,107 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: sampling 1 datasets with weights:
2022-01-03 07:22:04,107 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: source: 0 weight: 1.000000
WARNING:tensorflow:Entity <bound method Processor.call of <modulus.blocks.data_loaders.multi_source_loader.processors.asset_loader.AssetLoader object at 0x7f178015a400>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux,export AUTOGRAPH_VERBOSITY=10
) and attach the full output. Cause: Unable to locate the source code of <bound method Processor.call of <modulus.blocks.data_loaders.multi_source_loader.processors.asset_loader.AssetLoader object at 0x7f178015a400>>. Note that functions defined in certain environments, like the interactive Python shell do not expose their source code. If that is the case, you should to define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.do_not_convert. Original error: could not get source code
2022-01-03 07:22:04,119 [WARNING] tensorflow: Entity <bound method Processor.call of <modulus.blocks.data_loaders.multi_source_loader.processors.asset_loader.AssetLoader object at 0x7f178015a400>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux,export AUTOGRAPH_VERBOSITY=10
) and attach the full output. Cause: Unable to locate the source code of <bound method Processor.call of <modulus.blocks.data_loaders.multi_source_loader.processors.asset_loader.AssetLoader object at 0x7f178015a400>>. Note that functions defined in certain environments, like the interactive Python shell do not expose their source code. If that is the case, you should to define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.do_not_convert. Original error: could not get source code
2022-01-03 07:22:04,466 [INFO] main: Found 53 samples in training set
WARNING:tensorflow:From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/rasterizers/bbox_rasterizer.py:347: The name tf.bincount is deprecated. Please use tf.math.bincount instead.2022-01-03 07:22:04,565 [WARNING] tensorflow: From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/rasterizers/bbox_rasterizer.py:347: The name tf.bincount is deprecated. Please use tf.math.bincount instead.
WARNING:tensorflow:From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/training_proto_utilities.py:89: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.
2022-01-03 07:22:04,683 [WARNING] tensorflow: From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/training_proto_utilities.py:89: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.
WARNING:tensorflow:From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/training_proto_utilities.py:36: The name tf.train.AdamOptimizer is deprecated. Please use tf.compat.v1.train.AdamOptimizer instead.
2022-01-03 07:22:04,698 [WARNING] tensorflow: From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/training_proto_utilities.py:36: The name tf.train.AdamOptimizer is deprecated. Please use tf.compat.v1.train.AdamOptimizer instead.
WARNING:tensorflow:From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/cost_function/cost_functions.py:17: The name tf.log is deprecated. Please use tf.math.log instead.
2022-01-03 07:22:04,849 [WARNING] tensorflow: From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/cost_function/cost_functions.py:17: The name tf.log is deprecated. Please use tf.math.log instead.
WARNING:tensorflow:From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/cost_function/cost_auto_weight_hook.py:235: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.
2022-01-03 07:22:04,887 [WARNING] tensorflow: From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/cost_function/cost_auto_weight_hook.py:235: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.
WARNING:tensorflow:From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/model/detectnet_model.py:591: The name tf.summary.scalar is deprecated. Please use tf.compat.v1.summary.scalar instead.
2022-01-03 07:22:04,895 [WARNING] tensorflow: From /opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/model/detectnet_model.py:591: The name tf.summary.scalar is deprecated. Please use tf.compat.v1.summary.scalar instead.
2022-01-03 07:22:06,358 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: Serial augmentation enabled = False
2022-01-03 07:22:06,358 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: Pseudo sharding enabled = False
2022-01-03 07:22:06,358 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: Max Image Dimensions (all sources): (0, 0)
2022-01-03 07:22:06,358 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: number of cpus: 24, io threads: 48, compute threads: 24, buffered batches: 4
2022-01-03 07:22:06,358 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: total dataset size 5, number of sources: 1, batch size per gpu: 4, steps: 2
WARNING:tensorflow:Entity <bound method DriveNetTFRecordsParser.call of <iva.detectnet_v2.dataloader.drivenet_dataloader.DriveNetTFRecordsParser object at 0x7f17301f1470>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux,export AUTOGRAPH_VERBOSITY=10
) and attach the full output. Cause: Unable to locate the source code of <bound method DriveNetTFRecordsParser.call of <iva.detectnet_v2.dataloader.drivenet_dataloader.DriveNetTFRecordsParser object at 0x7f17301f1470>>. Note that functions defined in certain environments, like the interactive Python shell do not expose their source code. If that is the case, you should to define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.do_not_convert. Original error: could not get source code
2022-01-03 07:22:06,367 [WARNING] tensorflow: Entity <bound method DriveNetTFRecordsParser.call of <iva.detectnet_v2.dataloader.drivenet_dataloader.DriveNetTFRecordsParser object at 0x7f17301f1470>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux,export AUTOGRAPH_VERBOSITY=10
) and attach the full output. Cause: Unable to locate the source code of <bound method DriveNetTFRecordsParser.call of <iva.detectnet_v2.dataloader.drivenet_dataloader.DriveNetTFRecordsParser object at 0x7f17301f1470>>. Note that functions defined in certain environments, like the interactive Python shell do not expose their source code. If that is the case, you should to define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.do_not_convert. Original error: could not get source code
Traceback (most recent call last):
File “/opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py”, line 849, in
File “/opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py”, line 838, in
File “”, line 2, in main
File “/opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py”, line 46, in wrapped_fn
File “/opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py”, line 827, in main
File “/opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py”, line 708, in run_experiment
File “/opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py”, line 633, in train_gridbox
File “/opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py”, line 507, in build_validation_graph
File “/opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/dataloader/drivenet_dataloader.py”, line 694, in get_dataset_tensors
File “/opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/core/build_wheel.runfiles/ai_infra/moduluspy/modulus/blocks/trainers/multi_task_trainer/data_loader_interface.py”, line 77, in call
File “/opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/core/build_wheel.runfiles/ai_infra/moduluspy/modulus/blocks/data_loaders/multi_source_loader/data_loader.py”, line 396, in call
File “/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py”, line 2081, in apply
return DatasetV1Adapter(super(DatasetV1, self).apply(transformation_func))
File “/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py”, line 1422, in apply
dataset = transformation_func(self)
File “/opt/tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/dataloader/drivenet_dataloader.py”, line 405, in
File “/opt/nvidia/third_party/keras/tensorflow_backend.py”, line 356, in new_map
self, _map_func_set_random_wrapper, num_parallel_calls=num_parallel_calls
File “/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py”, line 2000, in map
MapDataset(self, map_func, preserve_cardinality=False))
File “/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py”, line 3531, in init
use_legacy_function=use_legacy_function)
File “/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py”, line 2810, in init
self._function = wrapper_fn._get_concrete_function_internal()
File “/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py”, line 1853, in _get_concrete_function_internal
*args, **kwargs)
File “/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py”, line 1847, in _get_concrete_function_internal_garbage_collected
graph_function, _, _ = self._maybe_define_function(args, kwargs)
File “/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py”, line 2147, in _maybe_define_function
graph_function = self._create_graph_function(args, kwargs)
File “/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py”, line 2038, in _create_graph_function
capture_by_value=self._capture_by_value),
File “/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/func_graph.py”, line 915, in func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
File “/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py”, line 2804, in wrapper_fn
ret = _wrapper_helper(*args)
File “/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py”, line 2749, in _wrapper_helper
ret = autograph.tf_convert(func, ag_ctx)(*nested_args)
File “/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/autograph/impl/api.py”, line 237, in wrapper
raise e.ag_error_metadata.to_exception(e)
StopIteration: in converted code:
relative to /opt:nvidia/third_party/keras/tensorflow_backend.py:353 _map_func_set_random_wrapper * return map_func(*args, **kwargs) tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/dataloader/drivenet_dataloader.py:144 __call__ tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/dataloader/drivenet_dataloader.py:112 _get_parse_example tlt/.cache/dazel/_dazel_tlt/75913d2aee35770fa76c4a63d877f3aa/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/dataloader/utilities.py:217 extract_tfrecords_features StopIteration:
2022-01-03 15:22:07,038 [INFO] tlt.components.docker_handler.docker_handler: Stopping container.
could you please help to take a look?