I am currently trying to run evaluation in TAO with the trainable PeopleNet model provided on NVIDIA NGC.
However, when I run evaluation, the validation cost comes out as zero.
How can I solve this problem?
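For context, evaluation was run with a command roughly like the one below. This is only a sketch, since the exact command was not posted; the spec filename, model path, and key are placeholders.
!tao detectnet_v2 evaluate -e $SPECS_DIR/detectnet_v2_train_resnet34_kitti.txt \
                           -m $USER_EXPERIMENT_DIR/experiment_dir_unpruned/weights/resnet34_detector.tlt \
                           -k $KEY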
The spec file is attached below.
random_seed : 42
dataset_config {
data_sources: {
tfrecords_path: "/home/ssh/tao-experiments/data/tfrecords/kitti_trainval/kitti_trainval*"
image_directory_path: "/home/ssh/tao-experiments/data/training/"
}
image_extension: "png"
target_class_mapping {
key: "person"
value: "pedestrian"
}
validation_fold: 0
# For evaluation on test set
validation_data_source: {
tfrecords_path: "/home/ssh/tao-experiments/data/tfrecords/kitti_trainval/kitti_trainval*"
image_directory_path: "/home/ssh/tao-experiments/data/training/"
}
}
model_config {
pretrained_model_file: "/home/ssh/tao-experiments/detectnet_v2/pretrained_resnet34/pretrained_detectnet_v2_vresnet34/resnet_34.hdf5"
num_layers: 34
freeze_blocks: 0
arch: "resnet"
use_batch_norm: true
objective_set {
bbox {
scale: 35.0
offset: 0.5
}
cov {
}
}
training_precision {
backend_floatx: FLOAT32
}
}
training_config {
batch_size_per_gpu: 12
num_epochs: 100
learning_rate {
soft_start_annealing_schedule {
min_learning_rate: 5e-06
max_learning_rate: 0.0005
soft_start: 0.1
annealing: 0.7
}
}
regularizer {
type: L1
weight: 3e-09
}
optimizer {
adam {
epsilon: 9.9e-09
beta1: 0.9
beta2: 0.999
}
}
cost_scaling {
initial_exponent: 20.0
increment: 0.005
decrement: 1.0
}
checkpoint_interval: 10
}
augmentation_config {
preprocessing {
output_image_width: 960
output_image_height: 544
crop_right: 1244
crop_left: 700
crop_top: 0
crop_bottom: 320
# crop_right: 960
# crop_bottom: 544
min_bbox_width: 1.0
min_bbox_height: 1.0
output_image_channel: 3
}
spatial_augmentation {
hflip_probability: 0.5
zoom_min: 1.0
zoom_max: 1.0
translate_max_x: 8.0
translate_max_y: 8.0
}
color_augmentation {
color_shift_stddev: 0.0
hue_rotation_max: 25.0
saturation_shift_max: 0.20000000298
contrast_scale_max: 0.10000000149
contrast_center: 0.5
}
}
postprocessing_config{
target_class_config{
key: "person"
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.265
dbscan_min_samples: 0.05
minimum_bounding_box_height: 20
}
}
}
}
cost_function_config {
target_classes {
name: "person"
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: "cov"
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: "bbox"
initial_weight: 10.0
weight_target: 10.0
}
}
enable_autoweighting: true
max_objective_weight: 0.9999
min_objective_weight: 0.0001
}
evaluation_config {
validation_period_during_training: 10
first_validation_epoch: 10
minimum_detection_ground_truth_overlap {
key: "person"
value: 0.5
}
evaluation_box_config {
key: "person"
value {
minimum_height: 4
maximum_height: 9999
minimum_width: 4
maximum_width: 9999
}
}
}
bbox_rasterizer_config {
target_class_config {
key: "person"
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 1.0
cov_radius_y: 1.0
bbox_min_radius: 1.0
}
}
deadzone_radius: 0.2
}
I found that the key and value in target_class_mapping under dataset_config were set incorrectly; after correcting them, I confirmed that evaluation works properly.
However, in the retrain step afterwards I want to train only on the person class, so how should I modify the config?
Can you share the latest spec file?
random_seed : 42
dataset_config {
data_sources: {
tfrecords_path: "/home/ssh/tao-experiments/data/tfrecords/kitti_trainval/kitti_trainval*"
image_directory_path: "/home/ssh/tao-experiments/data/training/"
}
image_extension: "png"
target_class_mapping {
key: "pedestrian"
value: "person"
}
target_class_mapping {
key: "bag"
value: "bag"
}
target_class_mapping {
key: "face"
value: "face"
}
validation_fold: 0
# For evaluation on test set
validation_data_source: {
tfrecords_path: "/home/ssh/tao-experiments/data/tfrecords/kitti_trainval/kitti_trainval*"
image_directory_path: "/home/ssh/tao-experiments/data/training/"
}
}
model_config {
pretrained_model_file: "/home/ssh/tao-experiments/detectnet_v2/pretrained_resnet34/pretrained_detectnet_v2_vresnet34/resnet_34.hdf5"
num_layers: 34
#freeze_blocks: 0
arch: "resnet"
use_batch_norm: true
objective_set {
bbox {
scale: 35.0
offset: 0.5
}
cov {
}
}
training_precision {
backend_floatx: FLOAT32
}
}
training_config {
batch_size_per_gpu: 32
num_epochs: 10
learning_rate {
soft_start_annealing_schedule {
min_learning_rate: 5e-06
max_learning_rate: 0.0005
soft_start: 0.1
annealing: 0.7
}
}
regularizer {
type: L1
weight: 3e-09
}
optimizer {
adam {
epsilon: 9.9e-09
beta1: 0.9
beta2: 0.999
}
}
cost_scaling {
initial_exponent: 20.0
increment: 0.005
decrement: 1.0
}
checkpoint_interval: 10
}
augmentation_config {
preprocessing {
output_image_width: 960
output_image_height: 544
min_bbox_width: 1.0
min_bbox_height: 1.0
output_image_channel: 3
}
spatial_augmentation {
hflip_probability: 0.5
zoom_min: 1.0
zoom_max: 1.0
translate_max_x: 8.0
translate_max_y: 8.0
}
color_augmentation {
hue_rotation_max: 25.0
saturation_shift_max: 0.20000000298
contrast_scale_max: 0.10000000149
contrast_center: 0.5
}
}
postprocessing_config{
target_class_config{
key: "person"
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.265
dbscan_min_samples: 0.05
minimum_bounding_box_height: 20
}
}
}
target_class_config{
key: "bag"
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.265
dbscan_min_samples: 0.05
minimum_bounding_box_height: 20
}
}
}
target_class_config{
key: "face"
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.265
dbscan_min_samples: 0.05
minimum_bounding_box_height: 20
}
}
}
}
cost_function_config {
target_classes {
name: "person"
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: "cov"
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: "bbox"
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: "bag"
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: "cov"
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: "bbox"
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: "face"
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: "cov"
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: "bbox"
initial_weight: 10.0
weight_target: 10.0
}
}
enable_autoweighting: true
max_objective_weight: 0.9999
min_objective_weight: 0.0001
}
evaluation_config {
validation_period_during_training: 10
first_validation_epoch: 10
minimum_detection_ground_truth_overlap {
key: "person"
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: "bag"
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: "face"
value: 0.5
}
evaluation_box_config {
key: "person"
value {
minimum_height: 4
maximum_height: 9999
minimum_width: 4
maximum_width: 9999
}
}
evaluation_box_config {
key: "bag"
value {
minimum_height: 4
maximum_height: 9999
minimum_width: 4
maximum_width: 9999
}
}
evaluation_box_config {
key: "face"
value {
minimum_height: 4
maximum_height: 9999
minimum_width: 4
maximum_width: 9999
}
}
average_precision_mode: INTEGRATE
}
bbox_rasterizer_config {
target_class_config {
key: "person"
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 1.0
cov_radius_y: 1.0
bbox_min_radius: 1.0
}
}
target_class_config {
key: "bag"
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 1.0
cov_radius_y: 1.0
bbox_min_radius: 1.0
}
}
target_class_config {
key: "face"
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 1.0
cov_radius_y: 1.0
bbox_min_radius: 1.0
}
}
deadzone_radius: 0.2
}
How many classes are in your training dataset? Did you keep the log from when you generated the tfrecords files above?
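If that log was not kept, the tfrecords can be regenerated to reproduce the per-class object counts printed at the end of dataset_convert. A rough sketch, assuming a conversion spec named detectnet_v2_tfrecords_kitti_trainval.txt and the output prefix from your dataset_config (both may differ in your setup):
!tao detectnet_v2 dataset_convert -d $SPECS_DIR/detectnet_v2_tfrecords_kitti_trainval.txt \
                                  -o /home/ssh/tao-experiments/data/tfrecords/kitti_trainval/kitti_trainval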
I will train only on the person class.
Do you mean there is only one class (person) in your training dataset?
Yes, there is only one class (person) in the training data, and my question is how to modify the config to train on this data.
Can you check several label files?
Is the class name “person” or “pedestrian” ?
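One quick way to check, assuming KITTI-format labels under /home/ssh/tao-experiments/data/training/label_2 (adjust to the actual label directory):
# Print the distinct class names (first field of each KITTI label line) with counts
!cut -d' ' -f1 /home/ssh/tao-experiments/data/training/label_2/*.txt | sort | uniq -c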
The class name is "person".
Please change to below and retry.
target_class_mapping {
key: "person"
value: "person"
}
Can I leave the other two classes out of target_class_mapping?
I am wondering whether it matters that those two classes do not appear in the training data.
Yes, you can delete the entries for the other two classes.
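For a single-class retrain, the bag and face blocks can be dropped from the other sections as well. A minimal sketch of what would remain in postprocessing_config, for example, assuming the label files use the class name person:
postprocessing_config {
  target_class_config {
    key: "person"
    value: {
      clustering_config {
        coverage_threshold: 0.005
        dbscan_eps: 0.265
        dbscan_min_samples: 0.05
        minimum_bounding_box_height: 20
      }
    }
  }
}
# Likewise, keep only the "person" entries in target_class_mapping,
# cost_function_config, evaluation_config and bbox_rasterizer_config.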
Thank you for your reply.
I have an additional question.
I have 8 GPUs and I want to run training on GPU 4.
I ran the command below, but an error comes up.
# Retraining using the pruned model as pretrained weights
!tao detectnet_v2 train -e $SPECS_DIR/detectnet_v2_retrain_resnet34_kitti.txt \
-r $USER_EXPERIMENT_DIR/experiment_dir_retrain \
-k $KEY \
-n resnet34_detector \
--gpus 4
Error log:
[f66fb9fe45ba:00263] *** Process received signal ***
[f66fb9fe45ba:00263] Signal: Bus error (7)
[f66fb9fe45ba:00263] Signal code: (-6)
[f66fb9fe45ba:00263] Failing at address: 0x3ea00000107
[f66fb9fe45ba:00263] [ 0] /usr/lib/x86_64-linux-gnu/libc.so.6(+0x43090)[0x7fc9b1b44090]
[f66fb9fe45ba:00263] [ 1] /usr/lib/x86_64-linux-gnu/libc.so.6(+0x18bb41)[0x7fc9b1c8cb41]
[f66fb9fe45ba:00263] [ 2] /usr/lib/x86_64-linux-gnu/libnccl.so.2(+0x755bd)[0x7fc8a34a85bd]
[f66fb9fe45ba:00263] [ 3] /usr/lib/x86_64-linux-gnu/libnccl.so.2(+0x7a74f)[0x7fc8a34ad74f]
[f66fb9fe45ba:00263] [ 4] /usr/lib/x86_64-linux-gnu/libnccl.so.2(+0x59e67)[0x7fc8a348ce67]
[f66fb9fe45ba:00263] [ 5] /usr/lib/x86_64-linux-gnu/libnccl.so.2(+0x48985)[0x7fc8a347b985]
[f66fb9fe45ba:00263] [ 6] /usr/lib/x86_64-linux-gnu/libnccl.so.2(+0x4a5c2)[0x7fc8a347d5c2]
[f66fb9fe45ba:00263] [ 7] /usr/lib/x86_64-linux-gnu/libnccl.so.2(+0x64f66)[0x7fc8a3497f66]
[f66fb9fe45ba:00263] [ 8] /usr/lib/x86_64-linux-gnu/libnccl.so.2(+0x4ae0b)[0x7fc8a347de0b]
[f66fb9fe45ba:00263] [ 9] /usr/lib/x86_64-linux-gnu/libnccl.so.2(ncclCommInitRank+0xd8)[0x7fc8a347e068]
[f66fb9fe45ba:00263] [10] /usr/local/lib/python3.6/dist-packages/horovod/tensorflow/mpi_lib.cpython-36m-x86_64-linux-gnu.so(_ZN7horovod6common13NCCLOpContext12InitNCCLCommERKSt6vectorINS0_16TensorTableEntryESaIS3_EERKS2_IiSaIiEE+0x284)[0x7fc87268f354]
[f66fb9fe45ba:00263] [11] /usr/local/lib/python3.6/dist-packages/horovod/tensorflow/mpi_lib.cpython-36m-x86_64-linux-gnu.so(_ZN7horovod6common13NCCLAllreduce7ExecuteERSt6vectorINS0_16TensorTableEntryESaIS3_EERKNS0_8ResponseE+0x61)[0x7fc87268f581]
[f66fb9fe45ba:00263] [12] /usr/local/lib/python3.6/dist-packages/horovod/tensorflow/mpi_lib.cpython-36m-x86_64-linux-gnu.so(_ZNK7horovod6common16OperationManager16ExecuteAllreduceERSt6vectorINS0_16TensorTableEntryESaIS3_EERKNS0_8ResponseE+0x7d)[0x7fc8726513cd]
[f66fb9fe45ba:00263] [13] /usr/local/lib/python3.6/dist-packages/horovod/tensorflow/mpi_lib.cpython-36m-x86_64-linux-gnu.so(_ZNK7horovod6common16OperationManager16ExecuteOperationERSt6vectorINS0_16TensorTableEntryESaIS3_EERKNS0_8ResponseERNS0_10ProcessSetE+0x4c)[0x7fc8726517fc]
[f66fb9fe45ba:00263] [14] /usr/local/lib/python3.6/dist-packages/horovod/tensorflow/mpi_lib.cpython-36m-x86_64-linux-gnu.so(+0xa902d)[0x7fc87262002d]
[f66fb9fe45ba:00263] [15] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xd6de4)[0x7fc9b0eacde4]
[f66fb9fe45ba:00263] [16] /usr/lib/x86_64-linux-gnu/libpthread.so.0(+0x8609)[0x7fc9b1ae6609]
[f66fb9fe45ba:00263] [17] /usr/lib/x86_64-linux-gnu/libc.so.6(clone+0x43)[0x7fc9b1c20133]
[f66fb9fe45ba:00263] *** End of error message ***
Can you upload the full log via the button below?
Please try to increase the Docker shared memory size.
Reference:
https://github.com/microsoft/DeepSpeed/issues/2693#issuecomment-1473302302
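With the TAO launcher, the shared memory available to the container can be raised through ~/.tao_mounts.json. A minimal sketch, assuming the existing mount of /home/ssh/tao-experiments; the 16G value and the ulimits are only example settings:
{
    "Mounts": [
        {
            "source": "/home/ssh/tao-experiments",
            "destination": "/home/ssh/tao-experiments"
        }
    ],
    "DockerOptions": {
        "shm_size": "16G",
        "ulimits": {
            "memlock": -1,
            "stack": 67108864
        }
    }
}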
I solved this with the following command.
!tao detectnet_v2 train -e $SPECS_DIR/detectnet_v2_retrain_resnet34_kitti.txt \
-r $USER_EXPERIMENT_DIR/experiment_dir_retrain \
-k $KEY \
-n resnet34_detector \
--gpus 1 \
--gpu_index 4
However, the resulting AP was only 25.
What could be causing this?
It was my mistake: the pruned .tlt file was not included. Please tell me how to point the retrain step at the .tlt file.
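For reference, in the DetectNet_v2 retrain spec the pruned model is normally passed in through model_config. A minimal sketch, assuming the pruned model was written to the path below; the actual directory and filename will differ:
model_config {
  # Point pretrained_model_file at the pruned .tlt and load its graph,
  # so retraining starts from the pruned architecture and weights.
  pretrained_model_file: "/home/ssh/tao-experiments/detectnet_v2/experiment_dir_pruned/resnet34_nopool_bn_detectnet_v2_pruned.tlt"
  load_graph: true
  num_layers: 34
  arch: "resnet"
  use_batch_norm: true
  objective_set {
    bbox {
      scale: 35.0
      offset: 0.5
    }
    cov {
    }
  }
  training_precision {
    backend_floatx: FLOAT32
  }
}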