TLT 2.0: When using DetectNet/PeopleNet, do you need at least 2 classes…?

Please provide the following information when requesting support.

• Hardware (Nano)
• DeepStream (5.0.1) — due to the client’s specific needs
• Network Type (Detectnet_v2 and Peoplenet)
• TLT Version (Due to the client’s specific needs I’m using TLT 2.0; container link: http://nvcr.io/nvidia/tlt-streamanalytics:v2.0_py3)
• Training spec file for smartphone ONLY

random_seed: 42
dataset_config {
  data_sources {
    tfrecords_path: "/workspace/tlt-experiments/smartphone/data/tfrecords/kitti_trainval/*"
    image_directory_path: "/workspace/tlt-experiments/smartphone/data/training"
  }
  image_extension: "jpg"
  target_class_mapping {
    key: "smartphone"
    value: "smartphone"
  }
  
  
  validation_fold: 0
}
augmentation_config {
  preprocessing {
    output_image_width: 1248
    output_image_height: 384
    min_bbox_width: 0.5
    min_bbox_height: 2.0
    output_image_channel: 3
  }
  spatial_augmentation {
    hflip_probability: 0.5
    zoom_min: 1.0
    zoom_max: 1.0
    translate_max_x: 8.0
    translate_max_y: 8.0
  }
  color_augmentation {
    hue_rotation_max: 25.0
    saturation_shift_max: 0.20000000298
    contrast_scale_max: 0.10000000149
    contrast_center: 0.5
  }
}
postprocessing_config {
  target_class_config {
    key: "smartphone"
    value {
      clustering_config {
        coverage_threshold: 0.00499999988824
        dbscan_eps: 0.20000000298
        dbscan_min_samples: 0.0500000007451
        minimum_bounding_box_height: 20
      }
    }
  }

  
}
model_config {
  pretrained_model_file: "/workspace/tlt-experiments/smartphone/pretrained_resnet18/tlt_pretrained_detectnet_v2_vresnet18/resnet18.hdf5"
  num_layers: 18
  use_batch_norm: true
  freeze_blocks:0
  freeze_blocks:1
  freeze_blocks:2
  freeze_bn:false
  objective_set {
    bbox {
      scale: 35.0
      offset: 0.5
    }
    cov {
    }
  }
  training_precision {
    backend_floatx: FLOAT32
  }
  arch: "resnet"
}
evaluation_config {
  validation_period_during_training: 10
  first_validation_epoch: 30
  minimum_detection_ground_truth_overlap {
    key: "smartphone"
    value: 0.2
  }
 
  evaluation_box_config {
    key: "smartphone"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  
  
  average_precision_mode: INTEGRATE
}
cost_function_config {
  target_classes {
    name: "smartphone"
    class_weight: 1.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  
  
  enable_autoweighting: true
  max_objective_weight: 0.999899983406
  min_objective_weight: 9.99999974738e-05
}
training_config {
  batch_size_per_gpu: 4
  num_epochs: 300
  learning_rate {
    soft_start_annealing_schedule {
      min_learning_rate: 5e-06
      max_learning_rate: 5e-04
      soft_start: 0.10000000149
      annealing: 0.699999988079
    }
  }
  regularizer {
    type: L1
    weight: 3.00000002618e-09
  }
  optimizer {
    adam {
      epsilon: 9.99999993923e-09
      beta1: 0.899999976158
      beta2: 0.999000012875
    }
  }
  cost_scaling {
    initial_exponent: 20.0
    increment: 0.005
    decrement: 1.0
  }
  checkpoint_interval: 10
}
bbox_rasterizer_config {
  target_class_config {
    key: "smartphone"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 0.40000000596
      cov_radius_y: 0.40000000596
      bbox_min_radius: 1.0
    }
  }
  
  
  deadzone_radius: 0.400000154972
}

• Training spec file for 2 classes

random_seed: 42
dataset_config {
  data_sources {
    tfrecords_path: "/workspace/tlt-experiments/smartphone/data/tfrecords/kitti_trainval/*"
    image_directory_path: "/workspace/tlt-experiments/smartphone/data/training"
  }
  image_extension: "jpg"
  target_class_mapping {
    key: "smartphone"
    value: "smartphone"
  }
  target_class_mapping {
    key: "brita"
    value: "brita"
  }
  
  validation_fold: 0
}
augmentation_config {
  preprocessing {
    output_image_width: 1248
    output_image_height: 384
    min_bbox_width: 0.5
    min_bbox_height: 2.0
    output_image_channel: 3
  }
  spatial_augmentation {
    hflip_probability: 0.5
    zoom_min: 1.0
    zoom_max: 1.0
    translate_max_x: 8.0
    translate_max_y: 8.0
  }
  color_augmentation {
    hue_rotation_max: 25.0
    saturation_shift_max: 0.20000000298
    contrast_scale_max: 0.10000000149
    contrast_center: 0.5
  }
}
postprocessing_config {
  target_class_config {
    key: "smartphone"
    value {
      clustering_config {
        coverage_threshold: 0.00499999988824
        dbscan_eps: 0.20000000298
        dbscan_min_samples: 0.0500000007451
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "brita"
    value {
      clustering_config {
        coverage_threshold: 0.00499999988824
        dbscan_eps: 0.15000000596
        dbscan_min_samples: 0.0500000007451
        minimum_bounding_box_height: 20
      }
    }
  }
  
}
model_config {
  pretrained_model_file: "/workspace/tlt-experiments/smartphone/pretrained_resnet18/tlt_pretrained_detectnet_v2_vresnet18/resnet18.hdf5"
  num_layers: 18
  freeze_blocks: 0
  freeze_blocks: 1
  freeze_blocks: 2
  use_batch_norm: true
  objective_set {
    bbox {
      scale: 35.0
      offset: 0.5
    }
    cov {
    }
  }
  training_precision {
    backend_floatx: FLOAT32
  }
  arch: "resnet"
}
evaluation_config {
  validation_period_during_training: 10
  first_validation_epoch: 30
  minimum_detection_ground_truth_overlap {
    key: "smartphone"
    value: 0.2
  }
  minimum_detection_ground_truth_overlap {
    key: "brita"
    value: 0.2
  }
  evaluation_box_config {
    key: "smartphone"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "brita"
    value {
      minimum_height: 20
      maximum_height: 9999
      minimum_width: 10
      maximum_width: 9999
    }
  }
  
  average_precision_mode: INTEGRATE
}
cost_function_config {
  target_classes {
    name: "smartphone"
    class_weight: 1.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "brita"
    class_weight: 1.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 1.0
    }
  }
  
  enable_autoweighting: true
  max_objective_weight: 0.999899983406
  min_objective_weight: 9.99999974738e-05
}
training_config {
  batch_size_per_gpu: 4
  num_epochs: 300
  learning_rate {
    soft_start_annealing_schedule {
      min_learning_rate: 5e-06
      max_learning_rate: 5e-04
      soft_start: 0.10000000149
      annealing: 0.699999988079
    }
  }
  regularizer {
    type: L1
    weight: 3.00000002618e-09
  }
  optimizer {
    adam {
      epsilon: 9.99999993923e-09
      beta1: 0.899999976158
      beta2: 0.999000012875
    }
  }
  cost_scaling {
    initial_exponent: 20.0
    increment: 0.005
    decrement: 1.0
  }
  checkpoint_interval: 10
}
bbox_rasterizer_config {
  target_class_config {
    key: "smartphone"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 0.40000000596
      cov_radius_y: 0.40000000596
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "brita"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  
  deadzone_radius: 0.400000154972
}

Hi Everyone,
I’m trying to fine-tune DetectNet and PeopleNet to detect my smartphone with my own dataset (30 images) using DeepStream 5.0 on my Jetson, but it does not seem to be working. The training procedure I followed was…

Train with the config file above(the one with only “smartphones”)

Prune

Retrain with epoch 100

INT8 Calibration

Export and Run on Jetson Nano

Oddly, when I added another class named “brita” with 25 images and followed the same procedure, the output video was able to find my smartphone.

Does this mean that when using DetectNet or PeopleNet, you need at least 2 classes…?

There is no such limitation. You can train with 1 class.

Please note that for the detectnet_v2 network (PeopleNet is actually also a detectnet_v2 network), per the user guide, you need to make sure all the training images are of the same resolution.
Then set correct

output_image_width: xxx
output_image_height: xxx

I find that you set above to 1248x384. So, are your training images 1248x384? If not, please set “enable_auto_resize: True” as well. See user guide for more details.

Hi Morganh,

Thank you for replying.
I cropped the images to be that dimension.
I thought you couldn’t resize images in TLT 2.0, according to this note (Preparing the Input Data Structure — Transfer Learning Toolkit 2.0 documentation).

I ran a search again but couldn’t find it. Can you tell me where I can find it?

note

Another question, is there a need to create a “background” class when you do a single-class detection?

Hi,
For the detectnet_v2 network, all the images and labels are expected to be of the same resolution. You need to confirm this before training.
For example, if your images are already 640x480 and the labels are correct accordingly, then you can set as below.
output_image_width: 640
output_image_height: 480

For TLT 2.0 version, yes, there is no “enable_auto_resize” yet. So, make sure above setting matches your images/labels.

It is not needed to create a “background” class when you do a single-class detection.

Hi Morgan,

Thank you for replying, your help really means a lot…

I have a few more questions.

  1. I’ve cropped the image to 1248x384 and annotated the data with that cropped image. Is there anywhere else that may be the cause?
  2. When I retrained the model, I only gave images that have a smartphone in them.
    Do you also need to provide images without smartphones as well?

If you train only one class (smartphone), you need not provide images for other classes.
Suggest you to narrow down with below way step by step.

  1. After training with the class (smartphone), run “tao inference xxx” to check the inference result.
  2. If result seems ok, then deploy in deepstream and run inference.
  3. If result seems ok, then run pruning and retraining.
  4. After retraining with the class (smartphone), run “tao inference xxx” to check the inference result.
  5. If result seems ok, then deploy in deepstream and run inference.

Hi Morganh,

Thank you for replying and thank you for your advice.

I was able to find the cause of the problem. When I changed the config file from…

cost_function_config {
  target_classes {
    name: "smartphone"
    class_weight: 1.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }

To this,

cost_function_config {
  target_classes {
    name: "smartphone"
    class_weight: 1.0
    coverage_foreground_weight: 0.0500000007451
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 1.0
    }
  }

it worked.

I couldn’t find detailed information about what is “weight_target” and “initial_weight”, can you tell me what it’s doing…?

Refer to Tlt spec file - cost function - #3 by neophyte1

Hi Morganh,

Thank you for replying and helping me out.
Sorry about asking similar questions.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.