Please provide the following information when requesting support.
• Hardware (RTX2070)
• Network Type (Yolo_v4_tiny)
• TLT Version (4.0.1)
I am trying to train a YOLOv4-tiny model using TAO. My training images are 1920x1080 in resolution, so I set output_width: 1920 and output_height: 1088 in the augmentation_config. My training config file is shown below:
random_seed: 42
yolov4_config {
big_anchor_shape: "[(260.69, 172.35), (125.91, 81.47), (72.27, 42.42)]"
mid_anchor_shape: "[(30.80, 71.40), (38.97, 26.86), (18.88, 17.11)]"
box_matching_iou: 0.25
matching_neutral_box_iou: 0.5
arch: "cspdarknet_tiny"
loss_loc_weight: 1.0
loss_neg_obj_weights: 1.0
loss_class_weights: 1.0
label_smoothing: 0.0
big_grid_xy_extend: 0.05
mid_grid_xy_extend: 0.05
freeze_bn: false
#freeze_blocks: 0
force_relu: false
}
training_config {
visualizer {
enabled: False
num_images: 3
}
batch_size_per_gpu: 4
num_epochs: 80
enable_qat: false
checkpoint_interval: 2
learning_rate {
soft_start_cosine_annealing_schedule {
min_learning_rate: 1e-7
max_learning_rate: 1e-4
soft_start: 0.3
}
}
regularizer {
type: L1
weight: 3e-5
}
optimizer {
adam {
epsilon: 1e-7
beta1: 0.9
beta2: 0.999
amsgrad: false
}
}
pretrain_model_path: "/workspace/tao-experiments/yolo_v4_tiny/pretrained_cspdarknet_tiny/pretrained_object_detection_vcspdarknet_tiny/cspdarknet_tiny.hdf5"
}
eval_config {
average_precision_mode: SAMPLE
batch_size: 4
matching_iou_threshold: 0.3
}
nms_config {
confidence_threshold: 0.001
clustering_iou_threshold: 0.3
top_k: 1
}
augmentation_config {
hue: 0.1
saturation: 1.5
exposure:1.5
vertical_flip:0
horizontal_flip: 0.5
jitter: 0.3
output_width: 1920
output_height: 1088
output_channel: 3
randomize_input_shape_period: 10
mosaic_prob: 0.5
mosaic_min_ratio:0.2
}
dataset_config {
data_sources: {
label_directory_path: "/workspace/tao-experiments/data/train/labels"
image_directory_path: "/workspace/tao-experiments/data/train/images"
}
include_difficult_in_training: true
target_class_mapping {
key: "sedan"
value: "sedan"
}
target_class_mapping {
key: "midtruck"
value: "midtruck"
}
target_class_mapping {
key: "motorbike"
value: "motorbike"
}
target_class_mapping {
key: "threewheeler"
value: "threewheeler"
}
target_class_mapping {
key: "bicycle"
value: "bicycle"
}
target_class_mapping {
key: "minibus"
value: "minibus"
}
target_class_mapping {
key: "lighttruck"
value: "lighttruck"
}
target_class_mapping {
key: "microbus"
value: "microbus"
}
target_class_mapping {
key: "bigbus"
value: "bigbus"
}
target_class_mapping {
key: "heavytruck"
value: "heavytruck"
}
target_class_mapping {
key: "utility"
value: "utility"
}
target_class_mapping {
key: "nmt"
value: "nmt"
}
validation_data_sources: {
label_directory_path: "/workspace/tao-experiments/data/val/labels"
image_directory_path: "/workspace/tao-experiments/data/val/images"
}
}
However, training fails during the first epoch with the following error:
INFO: Starting Training Loop.
Epoch 1/80
470/186491 [..............................] - ETA: 214:20:58 - loss: 30052.7804INFO: 2 root error(s) found.
(0) Invalid argument: Input to reshape is a tensor with 2108 values, but the requested shape has 2074
[[{{node bg_anchor_1/Reshape_1}}]]
[[loss_1/add_20/_3009]]
(1) Invalid argument: Input to reshape is a tensor with 2108 values, but the requested shape has 2074
[[{{node bg_anchor_1/Reshape_1}}]]
0 successful operations.
0 derived errors ignored.
Traceback (most recent call last):
File "</usr/local/lib/python3.6/dist-packages/iva/yolo_v4/scripts/train.py>", line 3, in <module>
File "<frozen iva.yolo_v4.scripts.train>", line 152, in <module>
File "<frozen iva.common.utils>", line 707, in return_func
File "<frozen iva.common.utils>", line 695, in return_func
File "<frozen iva.yolo_v4.scripts.train>", line 148, in main
File "<frozen iva.yolo_v4.scripts.train>", line 133, in main
File "<frozen iva.yolo_v4.scripts.train>", line 83, in run_experiment
File "<frozen iva.yolo_v4.models.yolov4_model>", line 731, in train
File "<frozen iva.yolo_v4.utils.fit_generator>", line 222, in fit_generator
File "/usr/local/lib/python3.6/dist-packages/keras/engine/training.py", line 1217, in train_on_batch
outputs = self.train_function(ins)
File "/usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py", line 2715, in __call__
return self._call(inputs)
File "/usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py", line 2675, in _call
fetched = self._callable_fn(*array_vals)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1472, in __call__
run_metadata_ptr)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: Input to reshape is a tensor with 2108 values, but the requested shape has 2074
[[{{node bg_anchor_1/Reshape_1}}]]
[[loss_1/add_20/_3009]]
(1) Invalid argument: Input to reshape is a tensor with 2108 values, but the requested shape has 2074
[[{{node bg_anchor_1/Reshape_1}}]]
0 successful operations.
0 derived errors ignored.
Telemetry data couldn't be sent, but the command ran successfully.
[WARNING]: <urlopen error [Errno -2] Name or service not known>
Execution status: FAIL
2023-04-17 12:51:02,417 [INFO] tlt.components.docker_handler.docker_handler: Stopping container.
I tried setting enable_auto_resize: true, but it does not appear to be supported. What else can I do?