Model performance is terrible when using 8 gpus

I am experiencing weird results when going from 1 or 2 gpus to 8 gpus when training a FasterRCNN model in TLT 3.0.

When I train the model with just 1 or 2 gpus, I get expected validation scores (e.g. mAP). However, when I train a model on the same dataset & same specs - but with 8 gpus - I get 0.0s in all the validation metrics.

My immediate thought is that something is going wrong during the training process, however, the loss scores provided during each epoch seem to be fine. It’s just the validation scores that get messed up.

Additionally, the 8gpu-trained model fails to make any inferences when running tlt faster_rcnn inference on a dataset on which the 1gpu- and 2gpu-trained models do fine. That is, the inference command runs to completion with no problem. It just doesn’t make any predictions.

Could you help resolve why our model fails when trained with 8 gpus?

As one last note, can you clarify whether any of the loss metrics printed during training are for validation loss, instead of training loss? We are having trouble finding any documentation on loss vs rpn_out_class_loss vs rpn_out_regress_loss etc. Are they all with respect to the training data? Can you point me to any documentation on those metrics? And obviously the mAP scores I am referring to above are a form of validation metric, but the documentation doesn’t explain whether other metrics can be used instead — like “validation_loss”, for example.

Thanks

Can you share the training spec with 8 gpus?
I think it is necessary to finetune the batch_size_per_gpu and lr.

The loss is averaged over an epoch on the training dataset, not the validation set.

Yes, the metrics are for validation dataset. Currently, no other metrics can be used.

Yes, here is the training spec:

Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
random_seed: 42
enc_key: 'tlt'
verbose: True
model_config {
input_image_config {
image_type: RGB
image_channel_order: 'bgr'
size_height_width {
height: 1024
width: 1024
}
image_channel_mean {
key: 'b'
value: 103.939
}
image_channel_mean {
key: 'g'
value: 116.779
}
image_channel_mean {
key: 'r'
value: 123.68
}
image_scaling_factor: 1.0
max_objects_num_per_image: 20
}
arch: "resnet:18"
anchor_box_config {
scale: 64.0
scale: 128.0
scale: 256.0
ratio: 1.0
ratio: 0.5
ratio: 2.0
}
freeze_bn: True
freeze_blocks: 0
freeze_blocks: 1
roi_mini_batch: 256
rpn_stride: 16
use_bias: False
roi_pooling_config {
pool_size: 7
pool_size_2x: False
}
all_projections: True
use_pooling: False
}
dataset_config {
data_sources: {
tfrecords_path: "/workspace/tlt-experiments/faster_rcnn_3_29/data/tfrecords/kitti_trainval/kitti_trainval*"
image_directory_path: "/workspace/tlt-experiments"
}
image_extension: 'png'
target_class_mapping {
key: "cheerios"
value: "cheerios"
}
target_class_mapping {
key: "cornflakes"
value: "cornflakes"
}
target_class_mapping {
key: "fruityluckycharms"
value: "fruityluckycharms"
}
target_class_mapping {
key: "honeybunchesalmonds"
value: "honeybunchesalmonds"
}
target_class_mapping {
key: "honeybunchesstrawberry"
value: "honeybunchesstrawberry"
}
target_class_mapping {
key: "honeynutcheerios"
value: "honeynutcheerios"
}
target_class_mapping {
key: "raisinbran"
value: "raisinbran"
}
target_class_mapping {
key: "ricekrispies"
value: "ricekrispies"
}
validation_data_source: {
tfrecords_path: "/workspace/tlt-experiments/faster_rcnn_3_29/data/tfrecords/kitti_test/kitti_test*"
image_directory_path: "/workspace/tlt-experiments"
}
}
augmentation_config {
preprocessing {
output_image_width: 1024
output_image_height: 1024
output_image_channel: 3
min_bbox_width: 10.0
min_bbox_height: 10.0
}
spatial_augmentation {
hflip_probability: 0.0
vflip_probability: 0.0
zoom_min: 0.9
zoom_max: 1.1
translate_max_x: 8.0
translate_max_y: 8.0
rotate_rad_max: 0.69
}
color_augmentation {
color_shift_stddev: 0.025
hue_rotation_max: 360.0
saturation_shift_max: 0.025
contrast_scale_max: 0.025
contrast_center: 0.5
}
}
training_config {
enable_augmentation: True
enable_qat: True
batch_size_per_gpu: 8
num_epochs: 15
pretrained_weights: "/workspace/tlt-experiments/faster_rcnn_3_29/faster_rcnn/tlt_pretrained_object_detection_vresnet18/resnet_18.hdf5"
#resume_from_model: "/workspace/tlt-experiments/faster_rcnn_3_29/faster_rcnn/trained_models/210318_frcnn_resnet18_epch8/frcnn_kitti_resnet18.epoch8.tlt"
output_model: "/workspace/tlt-experiments/faster_rcnn_3_29/faster_rcnn/frcnn_kitti_resnet18.tlt"
rpn_min_overlap: 0.3
rpn_max_overlap: 0.7
classifier_min_overlap: 0.0
classifier_max_overlap: 0.5
gt_as_roi: False
std_scaling: 1.0
classifier_regr_std {
key: 'x'
value: 10.0
}
classifier_regr_std {
key: 'y'
value: 10.0
}
classifier_regr_std {
key: 'w'
value: 5.0
}
classifier_regr_std {
key: 'h'
value: 5.0
}
rpn_mini_batch: 256
rpn_pre_nms_top_N: 12000
rpn_nms_max_boxes: 2000
rpn_nms_overlap_threshold: 0.7
regularizer {
type: L2
weight: 1e-4
}
optimizer {
sgd {
lr: 0.02
momentum: 0.9
decay: 0.0
nesterov: False
}
}
learning_rate {
soft_start {
base_lr: 0.02
start_lr: 0.002
soft_start: 0.1
annealing_points: 0.8
annealing_points: 0.9
annealing_divider: 10.0
}
}
lambda_rpn_regr: 1.0
lambda_rpn_class: 1.0
lambda_cls_regr: 1.0
lambda_cls_class: 1.0
}

inference_config {
images_dir: '/workspace/tlt-experiments/images/inference/unlabeled_composite_frames_04'
model: '/workspace/tlt-experiments/faster_rcnn_3_29/faster_rcnn/models/8gpus/frcnn_kitti_resnet18.epoch15.tlt'
batch_size: 1
detection_image_output_dir: '/workspace/tlt-experiments/faster_rcnn_3_29/faster_rcnn/inferences/unlabeled_composite_frames_04/210326_8gpus_epoch8_inference_results_imgs'
labels_dump_dir: '/workspace/tlt-experiments/faster_rcnn_3_29/faster_rcnn/inferences/unlabeled_composite_frames_04/210326_8gpus_epoch8_inference_dump_labels'
rpn_pre_nms_top_N: 6000
rpn_nms_max_boxes: 300
rpn_nms_overlap_threshold: 0.7
object_confidence_thres: 0.0001
bbox_visualize_threshold: 0.6
classifier_nms_max_boxes: 100
classifier_nms_overlap_threshold: 0.3
}
evaluation_config {
model: '/workspace/tlt-experiments/faster_rcnn_3_29/faster_rcnn/models/8gpus/frcnn_kitti_resnet18.epoch15.tlt'
batch_size: 1
validation_period_during_training: 1
rpn_pre_nms_top_N: 6000
rpn_nms_max_boxes: 300
rpn_nms_overlap_threshold: 0.7
classifier_nms_max_boxes: 100
classifier_nms_overlap_threshold: 0.3
object_confidence_thres: 0.0001
use_voc07_11point_metric: False
gt_matching_iou_threshold: 0.5
}

Thanks for confirming. Do any of the object detection models (e.g. DetectNet_v2, YOLO) support other validation metrics that you know of?

Could you trigger experiments to finetune the lr? For example, try a lower lr.

sgd {
lr: 0.01