Test and retrain TrafficCamNet on COCO and KITTI

I’m working with TrafficCamNet and have a couple of questions:
1) I am testing TrafficCamNet on a part of the COCO dataset that contains only the car category; however, some images in this part have no cars. I evaluated the model on this dataset and got an AP of 4.1 at IoU = 0.5. The AP seems lower than it is supposed to be. Could anyone shed light on the cause of the problem?

Evaluation config:

random_seed: 42
dataset_config {
data_sources {
tfrecords_path: “/workspace/detector/data/mscoco_with_labels/tfrecords/*”
image_directory_path: “/workspace/detector/data/mscoco_with_labels/”
}
image_extension: “jpg”
target_class_mapping {
key: “car”
value: “car”
}
target_class_mapping {
key: “two-weeler”
value: “two-weeler”
}
target_class_mapping {
key: “person”
value: “person”
}
target_class_mapping {
key: “road_sign”
value: “road_sign”
}
validation_fold: 0
}
augmentation_config {
preprocessing {
output_image_width: 960
output_image_height: 544
min_bbox_width: 1.0
min_bbox_height: 1.0
output_image_channel: 3
}
spatial_augmentation {
hflip_probability: 0.5
zoom_min: 1.0
zoom_max: 1.0
translate_max_x: 8.0
translate_max_y: 8.0
}
color_augmentation {
hue_rotation_max: 25.0
saturation_shift_max: 0.20000000298
contrast_scale_max: 0.10000000149
contrast_center: 0.5
}
}
postprocessing_config {
target_class_config {
key: “car”
value {
clustering_config {
coverage_threshold: 0.00499999988824
dbscan_eps: 0.20000000298
dbscan_min_samples: 0.0500000007451
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “two-weeler”
value {
clustering_config {
coverage_threshold: 0.00499999988824
dbscan_eps: 0.15000000596
dbscan_min_samples: 0.0500000007451
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “person”
value {
clustering_config {
coverage_threshold: 0.00749999983236
dbscan_eps: 0.230000004172
dbscan_min_samples: 0.0500000007451
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “road_sign”
value {
clustering_config {
coverage_threshold: 0.00749999983236
dbscan_eps: 0.230000004172
dbscan_min_samples: 0.0500000007451
minimum_bounding_box_height: 20
}}}}
model_config {
pretrained_model_file: “/workspace/detector/models/tlt_trafficcamnet_vunpruned_v1.0/resnet18_trafficcamnet.tlt”
num_layers: 18
use_batch_norm: true
objective_set {
bbox {
scale: 35.0
offset: 0.5
}
cov {
}
}
training_precision {
backend_floatx: FLOAT32
}
arch: “resnet”
}
evaluation_config {
validation_period_during_training: 10
first_validation_epoch: 30
minimum_detection_ground_truth_overlap {
key: “car”
value: 0.699999988079
}
minimum_detection_ground_truth_overlap {
key: “two-weeler”
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: “person”
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: “road_sign”
value: 0.5
}
evaluation_box_config {
key: “car”
value {
minimum_height: 20
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
evaluation_box_config {
key: “two-weeler”
value {
minimum_height: 20
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
evaluation_box_config {
key: “person”
value {
minimum_height: 20
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
evaluation_box_config {
key: “road_sign”
value {
minimum_height: 20
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
average_precision_mode: INTEGRATE
}

2) Then I fine-tuned the model on the KITTI dataset (car category only, as in the dataset above). The model reaches an AP of 77.56 on the KITTI validation set. However, the trained model evaluated on the KITTI test set with the config above produces the following results: image
What could be the cause of the difference in AP between the validation data and the test data?

Train config:

random_seed: 42
dataset_config {
data_sources {
tfrecords_path: “/workspace/*/kitti/tfrecords/*”
image_directory_path: “/workspace/*/kitti/train/”
}
image_extension: “png”
target_class_mapping {
key: “car”
value: “car”
}
validation_fold: 0
}
augmentation_config {
preprocessing {
output_image_width: 960
output_image_height: 544
min_bbox_width: 1.0
min_bbox_height: 1.0
output_image_channel: 3
}
spatial_augmentation {
hflip_probability: 0.5
zoom_min: 1.0
zoom_max: 1.0
translate_max_x: 8.0
translate_max_y: 8.0
}
color_augmentation {
hue_rotation_max: 25.0
saturation_shift_max: 0.20000000298
contrast_scale_max: 0.10000000149
contrast_center: 0.5
}
}
postprocessing_config {
target_class_config {
key: “car”
value {
clustering_config {
coverage_threshold: 0.00499999988824
dbscan_eps: 0.20000000298
dbscan_min_samples: 0.0500000007451
minimum_bounding_box_height: 20
}}}}
model_config {
num_layers: 18
pretrained_model_file: “/workspace/ssd/m.ageeva/nvidia_toolkit/detector/tlt_trafficcamnet_vunpruned_v1.0/resnet18_trafficcamnet.tlt”
use_batch_norm: true
objective_set {
bbox {
scale: 35.0
offset: 0.5
}
cov {
}
}
training_precision {
backend_floatx: FLOAT32
}
arch: “resnet”
all_projections: true
}
evaluation_config {
validation_period_during_training: 5
first_validation_epoch: 5
minimum_detection_ground_truth_overlap {
key: “car”
value: 0.699999988079
}
evaluation_box_config {
key: “car”
value {
minimum_height: 20
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
average_precision_mode: INTEGRATE
}
cost_function_config {
target_classes {
name: “car”
class_weight: 1.0
coverage_foreground_weight: 0.0500000007451
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 10.0
}
}
enable_autoweighting: true
max_objective_weight: 0.999899983406
min_objective_weight: 9.99999974738e-05
}
training_config {
batch_size_per_gpu: 4
num_epochs: 120
learning_rate {
soft_start_annealing_schedule {
min_learning_rate: 1e-05
max_learning_rate: 1e-03
soft_start: 0.10000000149
annealing: 0.699999988079
}
}
regularizer {
type: L1
weight: 3.00000002618e-09
}
optimizer {
adam {
epsilon: 9.99999993923e-09
beta1: 0.899999976158
beta2: 0.999000012875
}
}
cost_scaling {
initial_exponent: 20.0
increment: 0.005
decrement: 1.0
}
checkpoint_interval: 10
}
bbox_rasterizer_config {
target_class_config {
key: “car”
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.40000000596
cov_radius_y: 0.40000000596
bbox_min_radius: 1.0
}
}
deadzone_radius: 0.400000154972
}

Inference config:

inferencer_config{
target_classes: “car”
image_width: 960
image_height: 544
image_channels: 3
batch_size: 16
gpu_index: 2
tlt_config{
model: “/workspace/detector/models/kitti_1class.tlt”
}
}
bbox_handler_config{
kitti_dump: true
disable_overlay: false
overlay_linewidth: 2
classwise_bbox_handler_config{
key:“car”
value: {
confidence_model: “aggregate_cov”
output_map: “car”
confidence_threshold: 0.9
bbox_color{
R: 255
G: 255
B: 0
}
clustering_config{
coverage_threshold: 0.00
dbscan_eps: 0.3
dbscan_min_samples: 0.05
minimum_bounding_box_height: 4
}}}
classwise_bbox_handler_config{
key:“default”
value: {
confidence_model: “aggregate_cov”
confidence_threshold: 0.9
bbox_color{
R: 255
G: 0
B: 0
}
clustering_config{
coverage_threshold: 0.00
dbscan_eps: 0.3
dbscan_min_samples: 0.05
minimum_bounding_box_height: 4
}}}}

For 1), if you want to use detectnet_v2 evaluate to run evaluation against the unpruned trafficcamnet.tlt, please resize the car images/labels to 960x544, then modify one place in your config:

soft_start_annealing_schedule {
  min_learning_rate: 10e-10
  max_learning_rate: 10e-10

You also need to set load_graph to True:

model_config {
pretrained_model_file: “/workspace/detector/models/tlt_trafficcamnet_vunpruned_v1.0/resnet18_trafficcamnet.tlt”
num_layers: 18
load_graph: True

This will directly load the tlt model and run evaluation.

In addition, I suggest running detectnet_v2 inference against the car part of the COCO images. In this case, you do not need to resize the images/labels.

You can also run inference with DeepStream (mentioned in https://ngc.nvidia.com/catalog/models/nvidia:tlt_trafficcamnet).

Lastly, please note that TrafficCamNet was not trained on the COCO dataset. So, please use the unpruned TrafficCamNet model as the pretrained model to start training on your own data. This is the value of TLT (Transfer Learning Toolkit).

For 2), did you run your own evaluation method to get “easy, moderate, hard”? TLT evaluate does not print this kind of result.