In continuation of Ravi's discussion: we were able to train a ResNet-18 DetectNet_v2 model for object detection on the VOC dataset. I am sharing all the spec files we have created so far, to help others get a better understanding of the whole pipeline.
Dataset: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
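If you want to fetch and unpack the archive from a script, here is a minimal sketch (the /workspace/dataset target directory is an assumption, chosen to match the paths used in the spec files below):

#download_voc.py (a sketch; /workspace/dataset is an assumed target path)
import os
import tarfile
import urllib.request

VOC_URL = ("http://host.robots.ox.ac.uk/pascal/VOC/voc2012/"
           "VOCtrainval_11-May-2012.tar")
DEST = "/workspace/dataset"

os.makedirs(DEST, exist_ok=True)
tar_path = os.path.join(DEST, "VOCtrainval_11-May-2012.tar")
if not os.path.exists(tar_path):
    urllib.request.urlretrieve(VOC_URL, tar_path)  # roughly a 2 GB download
with tarfile.open(tar_path) as tar:
    tar.extractall(DEST)  # extracts to /workspace/dataset/VOCdevkit/VOC2012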
VOC-to-KITTI format conversion script:
#voc_to_kitti.py
import argparse
import os

from PIL import Image
from tqdm import tqdm  # plain tqdm; tqdm_notebook only renders inside Jupyter
import xmltodict

# construct the command line arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--input", required=True,
                help="input voc dataset path")
ap.add_argument("-wd", "--width", required=False, default='480',
                help="output image width")
ap.add_argument("-ht", "--height", required=False, default='320',
                help="output image height")
args = vars(ap.parse_args())

input_dir = args["input"]
image_width = int(args["width"])    # 480
image_height = int(args["height"])  # 320
def save_kitti_labels(label_list, output_file):
    """Save kitti format bbox labels to a txt file.

    Args:
        label_list (list): list of kitti label strings.
        output_file (str): path to output kitti label file.
    """
    # the with-block closes the file; an empty list simply writes nothing
    with open(output_file, 'w') as f:
        for line in label_list:
            f.write(line)
def read_pascal_labels(xmlfile):
    """Read a pascal xml label file and return its list of objects.

    Args:
        xmlfile (str): path to the xml label file.
    """
    with open(xmlfile, 'r') as xfile:
        data = xmltodict.parse(xfile)
    object_list = data['annotation']['object']
    # xmltodict returns a single dict (an OrderedDict on older versions)
    # when the image has exactly one object; wrap it so the output is
    # always a list
    if isinstance(object_list, dict):
        object_list = [object_list]
    return object_list
def map_to_kitti(object_list, sf):
    """Map pascal xml objects to kitti labels.

    Args:
        object_list (list): list of xml translated labels per object.
        sf (tuple): (scale_x, scale_y) factors mapping boxes to the resized image.
    """
    if not isinstance(object_list, list):
        raise TypeError("The object list is incompatible: {}".format(type(object_list)))
    labels_list = []
    scale_x = sf[0]
    scale_y = sf[1]
    for element in object_list:
        class_name = element['name']
        xmin = float(element['bndbox']['xmin']) * scale_x
        ymin = float(element['bndbox']['ymin']) * scale_y
        xmax = float(element['bndbox']['xmax']) * scale_x
        ymax = float(element['bndbox']['ymax']) * scale_y
        # a kitti label line has 15 fields: class, truncation, occlusion,
        # alpha, bbox (xmin ymin xmax ymax), 3D dimensions (3), location (3)
        # and rotation_y; everything except class and bbox is zeroed here
        label_head = class_name.lower() + " 0.00 0 0.00 "
        bbox_string = "{:.3f} {:.3f} {:.3f} {:.3f}".format(xmin, ymin,
                                                           xmax, ymax)
        label_tail = " 0.00 0.00 0.00 0.00 0.00 0.00 0.00\n"
        label_string = label_head + bbox_string + label_tail
        labels_list.append(label_string)
    return labels_list
def convert_to_kitti(voc_root, output_image_height, output_image_width):
    """Wrapper function to convert a VOC dataset to kitti format.

    Args:
        voc_root (str): path to the voc dataset root.
        output_image_height (int): image height for the output kitti images.
        output_image_width (int): image width for the output kitti images.
    """
    output_size = (output_image_width, output_image_height)
    voc_labels_root = os.path.join(voc_root, "Annotations")
    kitti_trainval_labels_root = os.path.join(voc_root, "Annotations_kitti/trainval")
    kitti_test_labels_root = os.path.join(voc_root, "Annotations_kitti/test")
    voc_images_root = os.path.join(voc_root, "JPEGImages")
    kitti_trainval_images_root = os.path.join(voc_root, "JPEGImages_kitti/trainval")
    kitti_test_images_root = os.path.join(voc_root, "JPEGImages_kitti/test")
    for path in (kitti_trainval_labels_root, kitti_test_labels_root,
                 kitti_trainval_images_root, kitti_test_images_root):
        os.makedirs(path, exist_ok=True)
    labels = [os.path.splitext(item)[0] for item in sorted(os.listdir(voc_labels_root))
              if item.endswith('.xml')]
    for item in tqdm(labels):
        # hold out images whose filenames contain "2011" as the test split;
        # everything else goes to trainval
        if "2011" in item:
            kitti_images_root = kitti_test_images_root
            kitti_labels_root = kitti_test_labels_root
        else:
            kitti_images_root = kitti_trainval_images_root
            kitti_labels_root = kitti_trainval_labels_root
        xmlfile = os.path.join(voc_labels_root, "{}.xml".format(item))
        kitti_file = os.path.join(kitti_labels_root, "{}.txt".format(item))
        image_file = os.path.join(voc_images_root, "{}.jpg".format(item))
        kitti_image = os.path.join(kitti_images_root, "{}.jpg".format(item))
        image = Image.open(image_file)
        # Image.ANTIALIAS was an alias of LANCZOS and is removed in Pillow 10
        resized_image = image.resize(output_size, Image.LANCZOS)
        resized_image.save(kitti_image)
        # scale the boxes by the same factors used to resize the image
        scale_x = float(output_image_width) / float(image.size[0])
        scale_y = float(output_image_height) / float(image.size[1])
        sf = (scale_x, scale_y)
        pascal_labels = read_pascal_labels(xmlfile)
        kitti_labels = map_to_kitti(pascal_labels, sf)
        save_kitti_labels(kitti_labels, kitti_file)

convert_to_kitti(input_dir, image_height, image_width)
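With the dataset extracted as above, the script is run as, for example:

python voc_to_kitti.py -i /workspace/dataset/VOCdevkit/VOC2012 -wd 480 -ht 320

This writes the resized images to JPEGImages_kitti/trainval and JPEGImages_kitti/test, and the matching label files to Annotations_kitti/trainval and Annotations_kitti/test, under the dataset root. As a quick sanity check, the sketch below (which assumes the same paths) confirms that every generated label line has the 15 whitespace-separated fields KITTI expects:

#check_labels.py (a sketch, assuming the trainval output directory above)
import glob

label_dir = "/workspace/dataset/VOCdevkit/VOC2012/Annotations_kitti/trainval"
for path in glob.glob(label_dir + "/*.txt"):
    with open(path) as f:
        for line in f:
            assert len(line.split()) == 15, "bad line in {}: {!r}".format(path, line)
print("all label files look OK")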
KITTI-to-TFRecords conversion spec file:
kitti_config {
root_directory_path: "/workspace/dataset/VOCdevkit/VOC2012"
image_dir_name: "JPEGImages_kitti/trainval"
label_dir_name: "Annotations_kitti/trainval"
image_extension: ".jpg"
partition_mode: "random"
num_partitions: 2
val_split: 20
num_shards: 10
}
image_directory_path: "/workspace/dataset/VOCdevkit/VOC2012"
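With this spec saved as, say, voc_tfrecords_spec.txt (the file name and output prefix below are placeholders), the TFRecords are generated with the TLT dataset converter; the -o argument is the output filename prefix, which should match the tfrecords_path glob in the training spec:

tlt-dataset-convert -d voc_tfrecords_spec.txt -o /workspace/tf_records/voc_trainval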
Training spec file (note that dataset_config below maps only three of the twenty VOC classes: car, bicycle and person; objects of unmapped classes are ignored during training):
random_seed: 42
model_config {
pretrained_model_file: "/workspace/pretrained_model/tlt_resnet18_detectnet_v2_v1/resnet18.hdf5"
num_layers: 18
freeze_blocks: 0
arch: "resnet"
use_batch_norm: true
objective_set: {
cov {}
bbox {
scale: 35.0
offset: 0.5
}
}
training_precision {
backend_floatx: FLOAT32
}
}
bbox_rasterizer_config {
target_class_config {
key: "car"
value: {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4
cov_radius_y: 0.4
bbox_min_radius: 1.0
}
}
target_class_config {
key: "bicycle"
value: {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 1.0
cov_radius_y: 1.0
bbox_min_radius: 1.0
}
}
target_class_config {
key: "person"
value: {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 1.0
cov_radius_y: 1.0
bbox_min_radius: 1.0
}
}
deadzone_radius: 0.67
}
cost_function_config {
target_classes {
name: "car"
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: "cov"
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: "bbox"
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: "bicycle"
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: "cov"
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: "bbox"
initial_weight: 10.0
weight_target: 1.0
}
}
target_classes {
name: "person"
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: "cov"
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: "bbox"
initial_weight: 10.0
weight_target: 10.0
}
}
enable_autoweighting: true
max_objective_weight: 0.9999
min_objective_weight: 0.0001
}
training_config {
batch_size_per_gpu: 32
num_epochs: 20
learning_rate {
soft_start_annealing_schedule {
min_learning_rate: 5e-6
max_learning_rate: 5e-4
soft_start: 0.1
annealing: 0.7
}
}
regularizer {
type: L1
weight: 3e-9
}
optimizer {
adam {
epsilon: 1e-08
beta1: 0.9
beta2: 0.999
}
}
cost_scaling {
enabled: false
initial_exponent: 20.0
increment: 0.005
decrement: 1.0
}
}
augmentation_config {
preprocessing {
output_image_width: 480
output_image_height: 320
output_image_channel: 3
min_bbox_width: 1.0
min_bbox_height: 1.0
}
spatial_augmentation {
hflip_probability: 0.5
vflip_probability: 0.0
zoom_min: 1.0
zoom_max: 1.0
translate_max_x: 8.0
translate_max_y: 8.0
}
color_augmentation {
color_shift_stddev: 0.0
hue_rotation_max: 25.0
saturation_shift_max: 0.2
contrast_scale_max: 0.1
contrast_center: 0.5
}
}
postprocessing_config {
target_class_config {
key: "car"
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.13
dbscan_min_samples: 0.05
minimum_bounding_box_height: 1
}
}
}
target_class_config {
key: "bicycle"
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.15
dbscan_min_samples: 0.05
minimum_bounding_box_height: 1
}
}
}
target_class_config {
key: "person"
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.15
dbscan_min_samples: 0.05
minimum_bounding_box_height: 1
}
}
}
}
dataset_config {
data_sources: {
tfrecords_path: "/workspace/tf_records/*"
image_directory_path: "/workspace/dataset/VOCdevkit/VOC2012"
}
image_extension: "jpg"
target_class_mapping {
key: "car"
value: "car"
}
target_class_mapping {
key: "person"
value: "person"
}
target_class_mapping {
key: "bicycle"
value: "bicycle"
}
validation_fold: 0
}
evaluation_config {
validation_period_during_training: 10
first_validation_epoch: 1
minimum_detection_ground_truth_overlap {
key: "car"
value: 0.7
}
minimum_detection_ground_truth_overlap {
key: "bicycle"
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: "person"
value: 0.5
}
evaluation_box_config {
key: "car"
value {
minimum_height: 4
maximum_height: 9999
minimum_width: 4
maximum_width: 9999
}
}
evaluation_box_config {
key: "person"
value {
minimum_height: 4
maximum_height: 9999
minimum_width: 4
maximum_width: 9999
}
}
evaluation_box_config {
key: "bicycle"
value {
minimum_height: 4
maximum_height: 9999
minimum_width: 4
maximum_width: 9999
}
}
}
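Training can then be launched along these lines (the spec file name, results directory and model name are placeholders; $KEY is your NGC API key):

tlt-train detectnet_v2 -e detectnet_v2_train_resnet18_voc.txt -r /workspace/experiment_unpruned -k $KEY -n resnet18_detector

Per-class AP on the validation fold is reported during training at the interval set by validation_period_during_training above.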
A detailed step-by-step guide for training the model is also posted at https://github.com/imneonizer/Nvidia-Transfer-Learning-Toolkit-Training-Example
A big thanks to Morganh for helping debug and resolve our issues!