OOM error when resuming FasterRCNN training from a .tlt model

Hi,

I want to resume training a FasterRCNN model from a .tlt checkpoint, so I set resume_from_model: "/workspace/tlt-experiments/data/faster_rcnn/frcnn_kitti_darknet53.epoch111.tlt".

However, an OOM error happened. I don't think GPU memory should be the bottleneck: I set batch_size_per_gpu: 1 and exported TF_FORCE_GPU_ALLOW_GROWTH=true. I also run with --gpus 4 on four TITAN V cards, each with 12 GB of memory free.
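For reference, exporting TF_FORCE_GPU_ALLOW_GROWTH=true should be equivalent to the TF1 session option below. This is only an illustrative sketch (TLT builds its session internally, so I can only set the environment variable from outside the container process):

import tensorflow as tf

# TF1 equivalent of TF_FORCE_GPU_ALLOW_GROWTH=true: let the allocator
# grow on demand instead of reserving all GPU memory up front.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=config)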

I noticed that if I set pretrained_weights to darknet53.hdf5 instead, training succeeds.


Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[256,1024,13,13] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node training_1/Adam/gradients/time_distributed_19_1/AvgPool_grad/AvgPoolGrad}}]]
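Following the hint in the log, report_tensor_allocations_upon_oom can be set on a TF1 RunOptions as below. This is only a sketch with a toy graph, since I have no way to pass RunOptions into TLT's internal sess.run calls:

import tensorflow as tf

# Ask TF to report live tensor allocations if this run hits OOM.
opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom=True)
with tf.compat.v1.Session() as sess:
    x = tf.compat.v1.random_normal([1024, 1024])
    sess.run(x, options=opts)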


So I think it may be caused by my spec. Here is my spec file:

random_seed: 42
enc_key: 'YjlxOTRkaHRjYWI2Z2N***'
verbose: True
network_config {
  input_image_config {
    image_type: RGB
    image_channel_order: 'bgr'
    size_height_width {
      height: 1080
      width: 1920
    }
    image_channel_mean {
      key: 'b'
      value: 103.939
    }
    image_channel_mean {
      key: 'g'
      value: 116.779
    }
    image_channel_mean {
      key: 'r'
      value: 123.68
    }
    image_scaling_factor: 1.0
    max_objects_num_per_image: 100
  }
  feature_extractor: "darknet:53"
  anchor_box_config {
    scale: 64.0
    scale: 128.0
    scale: 256.0
    ratio: 1.0
    ratio: 0.5
    ratio: 2.0
  }
  freeze_bn: True
  freeze_blocks: 0
  freeze_blocks: 1
  roi_mini_batch: 256
  rpn_stride: 16
  conv_bn_share_bias: True
  roi_pooling_config {
    pool_size: 7
    pool_size_2x: False
  }
  all_projections: True
  use_pooling: False
}
training_config {
  kitti_data_config {
    data_sources: {
      tfrecords_path: "/workspace/tlt-experiments/tfrecords/kitti_trainval/kitti_trainval*"
      image_directory_path: "/workspace/tlt-experiments/data/training"
    }
    image_extension: 'jpg'
    target_class_mapping {
      key: 'person'
      value: 'person'
    }
    validation_fold: 0
  }
  data_augmentation {
    preprocessing {
      output_image_width: 1920
      output_image_height: 1080
      output_image_channel: 3
      min_bbox_width: 1.0
      min_bbox_height: 1.0
    }
    spatial_augmentation {
      hflip_probability: 0.5
      vflip_probability: 0.0
      zoom_min: 1.0
      zoom_max: 1.0
      translate_max_x: 0
      translate_max_y: 0
    }
    color_augmentation {
      hue_rotation_max: 0.0
      saturation_shift_max: 0.0
      contrast_scale_max: 0.0
      contrast_center: 0.5
    }
  }
  enable_augmentation: True
  batch_size_per_gpu: 1
  num_epochs: 300
  #pretrained_weights: "/workspace/tlt-experiments/data/faster_rcnn/darknet53.hdf5"
  resume_from_model: "/workspace/tlt-experiments/data/faster_rcnn/frcnn_kitti_darknet53.epoch111.tlt"
  output_model: "/workspace/tlt-experiments/data/faster_rcnn/frcnn_kitti_darknet53.tlt"
  rpn_min_overlap: 0.3
  rpn_max_overlap: 0.7
  classifier_min_overlap: 0.0
  classifier_max_overlap: 0.5
  gt_as_roi: False
  std_scaling: 1.0
  classifier_regr_std {
    key: 'x'
    value: 10.0
  }
  classifier_regr_std {
    key: 'y'
    value: 10.0
  }
  classifier_regr_std {
    key: 'w'
    value: 5.0
  }
  classifier_regr_std {
    key: 'h'
    value: 5.0
  }

  rpn_mini_batch: 256
  rpn_pre_nms_top_N: 12000
  rpn_nms_max_boxes: 2000
  rpn_nms_overlap_threshold: 0.7

  reg_config {
    reg_type: 'L2'
    weight_decay: 1e-4
  }

  optimizer {
    adam {
      lr: 0.00001
      beta_1: 0.9
      beta_2: 0.999
      decay: 0.0
    }
  }

  lr_scheduler {
    step {
      base_lr: 0.00001
      gamma: 1.0
      step_size: 30
    }
  }

  lambda_rpn_regr: 1.0
  lambda_rpn_class: 1.0
  lambda_cls_regr: 1.0
  lambda_cls_class: 1.0

  inference_config {
    #images_dir: '/workspace/tlt-experiments/data/testing/image_2'
    images_dir: '/workspace/tlt-experiments/data/lanzhou_test/'
    detection_image_output_dir: '/workspace/tlt-experiments/data/faster_rcnn/lanzhou_test_imgs'
    labels_dump_dir: '/workspace/tlt-experiments/data/faster_rcnn/lanzhou_test_labels'
    model: '/workspace/tlt-experiments/data/faster_rcnn/frcnn_kitti_darknet53.epoch110.tlt'
    #detection_image_output_dir: '/workspace/tlt-experiments/data/faster_rcnn/inference_results_imgs'
    #labels_dump_dir: '/workspace/tlt-experiments/data/faster_rcnn/inference_dump_labels'
    rpn_pre_nms_top_N: 6000
    rpn_nms_max_boxes: 300
    rpn_nms_overlap_threshold: 0.7
    bbox_visualize_threshold: 0.6
    classifier_nms_max_boxes: 300
    classifier_nms_overlap_threshold: 0.3
  }

  evaluation_config {
    model: '/workspace/tlt-experiments/data/faster_rcnn/frcnn_kitti_darknet53.epoch110.tlt'
    labels_dump_dir: '/workspace/tlt-experiments/data/faster_rcnn/test_dump_labels'
    rpn_pre_nms_top_N: 6000
    rpn_nms_max_boxes: 300
    rpn_nms_overlap_threshold: 0.7
    classifier_nms_max_boxes: 300
    classifier_nms_overlap_threshold: 0.3
    object_confidence_thres: 0.0001
    use_voc07_11point_metric: False
  }
}

Can you double-check that the other parameters are the same as when you train with the pretrained model darknet53.hdf5?

Yes, the other parameters are exactly the same. If I only switch the model from resume_from_model to pretrained_weights, training succeeds, but it starts from epoch 0.

For 1920x1080 resolution, even with batch size 1, I am afraid 12 GB of GPU memory is not enough, because when a training is resumed from a checkpoint the implementation needs two copies of the model (two FasterRCNN DarkNet53 models in memory).
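As a rough sanity check (back-of-the-envelope only, assuming float32 everywhere), the single tensor named in your OOM message is already sizeable, and a second resident model roughly doubles the weights and such activations:

# Tensor from the OOM message: shape [256, 1024, 13, 13], float32.
elems = 256 * 1024 * 13 * 13
print(elems * 4 / 2**20)  # ~169 MiB for this one gradient tensor alone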

I suggest you try the KITTI notebook example in the TLT docker first, to check whether OOM also happens for DarkNet53 when resuming training from a checkpoint.

The KITTI images are mostly 1248x384, which is much smaller than 1920x1080.
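Since activation memory scales roughly with input area, the difference is significant (quick arithmetic only):

kitti = 1248 * 384
yours = 1920 * 1080
print(yours / kitti)  # ~4.3x more pixels per image than KITTI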