Custom dataset -- ValueError: steps_per_epoch must be > 0

Hello, I am getting the error “ValueError: steps_per_epoch must be > 0” when trying to train on a custom dataset with TLT.

I’ve tried without success:

  • adjusting the batch_size_per_gpu
  • adjusting num_epochs

I am running this command:
tlt-train detectnet_v2 -e /workspace/path_spec
-r /workspace/path_to_model
-k /workspace/path_to_model
-n “custom_model_1”
--gpus 1

This is my spec file:
model_config {
arch: “resnet”
pretrained_model_file: “/workspace/path_to_model/resnet18.hdf5”
freeze_blocks: 0
freeze_blocks: 1
all_projections: True
num_layers: 18
use_pooling: False
use_batch_norm: True
dropout_rate: 0.0
training_precision: {
backend_floatx: FLOAT32
}
objective_set: {
cov {}
bbox {
scale: 35.0
offset: 0.5
}
}
}

bbox_rasterizer_config {
target_class_config {
key: “dog”
value: {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4
cov_radius_y: 0.4
bbox_min_radius: 1.0
}
}
target_class_config {
key: “door_open”
value: {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4
cov_radius_y: 0.4
bbox_min_radius: 1.0
}
}
target_class_config {
key: “person”
value: {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4
cov_radius_y: 0.4
bbox_min_radius: 1.0
}
}
target_class_config {
key: “door_closed”
value: {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4
cov_radius_y: 0.4
bbox_min_radius: 1.0
}
}
target_class_config {
key: “package”
value: {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4
cov_radius_y: 0.4
bbox_min_radius: 1.0
}
}
deadzone_radius: 0.67
}

postprocessing_config {
target_class_config {
key: “dog”
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.15
dbscan_min_samples: 0.05
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “door_open”
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.15
dbscan_min_samples: 0.05
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “person”
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.15
dbscan_min_samples: 0.05
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “door_closed”
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.15
dbscan_min_samples: 0.05
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “package”
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.15
dbscan_min_samples: 0.05
minimum_bounding_box_height: 20
}
}
}
}

cost_function_config {
target_classes {
name: “dog”
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: “door_open”
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 1.0
}
}
target_classes {
name: “person”
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: “door_closed”
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: “package”
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 10.0
}
}
enable_autoweighting: True
max_objective_weight: 0.9999
min_objective_weight: 0.0001
}

training_config {
batch_size_per_gpu: 16
num_epochs: 80
learning_rate {
soft_start_annealing_schedule {
min_learning_rate: 5e-6
max_learning_rate: 5e-4
soft_start: 0.1
annealing: 0.7
}
}
regularizer {
type: L1
weight: 3e-9
}
optimizer {
adam {
epsilon: 1e-08
beta1: 0.9
beta2: 0.999
}
}
cost_scaling {
enabled: False
initial_exponent: 20.0
increment: 0.005
decrement: 1.0
}
}

augmentation_config {
preprocessing {
output_image_width: 800
output_image_height: 600
output_image_channel: 3
min_bbox_width: 1.0
min_bbox_height: 1.0
}
spatial_augmentation {

hflip_probability: 0.5
vflip_probability: 0.0
zoom_min: 1.0
zoom_max: 1.0
translate_max_x: 8.0
translate_max_y: 8.0

}
color_augmentation {
color_shift_stddev: 0.0
hue_rotation_max: 25.0
saturation_shift_max: 0.2
contrast_scale_max: 0.1
contrast_center: 0.5
}
}

evaluation_config {
average_precision_mode: INTEGRATE
validation_period_during_training: 10
first_validation_epoch: 1
minimum_detection_ground_truth_overlap {
key: “dog”
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: “door_open”
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: “person”
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: “door_closed”
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: “package”
value: 0.5
}
evaluation_box_config {
key: “dog”
value {
minimum_height: 4
maximum_height: 9999
minimum_width: 4
maximum_width: 9999
}
}
evaluation_box_config {
key: “door_open”
value {
minimum_height: 4
maximum_height: 9999
minimum_width: 4
maximum_width: 9999
}
}
evaluation_box_config {
key: “person”
value {
minimum_height: 4
maximum_height: 9999
minimum_width: 4
maximum_width: 9999
}
}
evaluation_box_config {
key: “door_closed”
value {
minimum_height: 4
maximum_height: 9999
minimum_width: 4
maximum_width: 9999
}
}
evaluation_box_config {
key: “package”
value {
minimum_height: 4
maximum_height: 9999
minimum_width: 4
maximum_width: 9999
}
}
}

dataset_config {
data_sources: {
tfrecords_path: “/workspace/path_to_tfrecords”
image_directory_path: “/workspace/path_to_images”
}
image_extension: “jpg”
target_class_mapping {
key: “dog”
value: “dog”
}
target_class_mapping {
key: “door_open”
value: “door_open”
}
target_class_mapping {
key: “person”
value: “person”
}
target_class_mapping {
key: “door_closed”
value: “door_closed”
}
target_class_mapping {
key: “package”
value: “package”
}
validation_fold: 0
}

Please refer to the topic “TLT Custom data set size question?”.

Thanks Morganh, I did double check that as well. I have it set correctly.

A few more details:

Appreciate the help

Please narrow down the issue via:

  1. Please check the full log when you run tlt-dataset-convert.
    How many images are in the val dataset?

  2. Please check whether there are any empty (zero-size) tfrecord files.

  3. Please modify
    output_image_height: 600
    to
    output_image_height: 608

    The width and the height are each expected to be a multiple of 16.

Thank you for the suggestions. I tried them, but it is still returning the same error. Below is the full log.

  1. See log below

  2. There are no empty tfrecord files

  3. Resized all images and bounding boxes to 640 width and 480 height (both are now multiples of 16).

Logs:

Using TensorFlow backend.
2020-11-27 12:34:01.081088: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2020-11-27 12:34:03.747200: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-11-27 12:34:03.751977: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-11-27 12:34:03.752267: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties:
name: GeForce GTX 1070 major: 6 minor: 1 memoryClockRate(GHz): 1.7845
pciBusID: 0000:01:00.0
2020-11-27 12:34:03.752292: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2020-11-27 12:34:03.752349: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2020-11-27 12:34:03.753296: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2020-11-27 12:34:03.753682: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2020-11-27 12:34:03.754993: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2020-11-27 12:34:03.757683: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2020-11-27 12:34:03.757773: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-11-27 12:34:03.757989: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-11-27 12:34:03.758590: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-11-27 12:34:03.759077: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Adding visible gpu devices: 0
2020-11-27 12:34:03.759238: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2020-11-27 12:34:04.298865: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1159] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-11-27 12:34:04.298901: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1165] 0
2020-11-27 12:34:04.298909: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1178] 0: N
2020-11-27 12:34:04.299215: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-11-27 12:34:04.299619: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-11-27 12:34:04.299927: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-11-27 12:34:04.300220: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1304] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 3702 MB memory) → physical GPU (device: 0, name: GeForce GTX 1070, pci bus id: 0000:01:00.0, compute capability: 6.1)
2020-11-27 12:34:04,302 [INFO] iva.detectnet_v2.scripts.train: Loading experiment spec at /workspace/home/tlt/configs/detection_spec_file.txt.
2020-11-27 12:34:04,304 [INFO] iva.detectnet_v2.spec_handler.spec_loader: Merging specification from /workspace/home/tlt/configs/detection_spec_file.txt
Traceback (most recent call last):
File “/usr/local/bin/tlt-train-g1”, line 8, in <module>
sys.exit(main())
File “/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/common/magnet_train.py”, line 55, in main
File “”, line 2, in main
File “/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py”, line 46, in wrapped_fn
File “/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py”, line 773, in main
File “/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py”, line 691, in run_experiment
File “/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py”, line 569, in train_gridbox
File “/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/cost_function/cost_auto_weight_hook.py”, line 26, in build_cost_auto_weight_hook
ValueError: steps_per_epoch must be > 0

new spec_file:

model_config {
arch: “resnet”
pretrained_model_file: “/workspace/home/tlt/models/tlt_pretrained_detectnet_v2_vresnet18/resnet18.hdf5”
freeze_blocks: 0
freeze_blocks: 1
all_projections: True
num_layers: 18
use_pooling: False
use_batch_norm: True
dropout_rate: 0.0
training_precision: {
backend_floatx: FLOAT32
}
objective_set: {
cov {}
bbox {
scale: 35.0
offset: 0.5
}
}
}

bbox_rasterizer_config {
target_class_config {
key: “dog”
value: {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4
cov_radius_y: 0.4
bbox_min_radius: 1.0
}
}
target_class_config {
key: “door_open”
value: {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4
cov_radius_y: 0.4
bbox_min_radius: 1.0
}
}
target_class_config {
key: “person”
value: {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4
cov_radius_y: 0.4
bbox_min_radius: 1.0
}
}
target_class_config {
key: “door_closed”
value: {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4
cov_radius_y: 0.4
bbox_min_radius: 1.0
}
}
target_class_config {
key: “package”
value: {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4
cov_radius_y: 0.4
bbox_min_radius: 1.0
}
}
target_class_config {
key: “car”
value: {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4
cov_radius_y: 0.4
bbox_min_radius: 1.0
}
}
deadzone_radius: 0.67
}

postprocessing_config {
target_class_config {
key: “dog”
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.15
dbscan_min_samples: 0.05
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “door_open”
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.15
dbscan_min_samples: 0.05
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “person”
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.15
dbscan_min_samples: 0.05
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “door_closed”
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.15
dbscan_min_samples: 0.05
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “package”
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.15
dbscan_min_samples: 0.05
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “car”
value: {
clustering_config {
coverage_threshold: 0.005
dbscan_eps: 0.15
dbscan_min_samples: 0.05
minimum_bounding_box_height: 20
}
}
}
}

cost_function_config {
target_classes {
name: “dog”
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: “door_open”
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 1.0
}
}
target_classes {
name: “person”
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: “door_closed”
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: “package”
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: “car”
class_weight: 1.0
coverage_foreground_weight: 0.05
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 10.0
}
}
enable_autoweighting: True
max_objective_weight: 0.9999
min_objective_weight: 0.0001
}

training_config {
batch_size_per_gpu: 16
num_epochs: 80
learning_rate {
soft_start_annealing_schedule {
min_learning_rate: 5e-6
max_learning_rate: 5e-4
soft_start: 0.1
annealing: 0.7
}
}
regularizer {
type: L1
weight: 3e-9
}
optimizer {
adam {
epsilon: 1e-08
beta1: 0.9
beta2: 0.999
}
}
cost_scaling {
enabled: False
initial_exponent: 20.0
increment: 0.005
decrement: 1.0
}
checkpoint_interval: 10
}

augmentation_config {
preprocessing {
output_image_width: 640
output_image_height: 480
output_image_channel: 3
min_bbox_width: 1.0
min_bbox_height: 1.0
}
spatial_augmentation {
hflip_probability: 0.5
vflip_probability: 0.0
zoom_min: 1.0
zoom_max: 1.0
translate_max_x: 8.0
translate_max_y: 8.0
}
color_augmentation {
color_shift_stddev: 0.0
hue_rotation_max: 25.0
saturation_shift_max: 0.2
contrast_scale_max: 0.1
contrast_center: 0.5
}
}

evaluation_config {
average_precision_mode: INTEGRATE
validation_period_during_training: 10
first_validation_epoch: 25
minimum_detection_ground_truth_overlap {
key: “dog”
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: “door_open”
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: “person”
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: “door_closed”
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: “package”
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: “car”
value: 0.5
}
evaluation_box_config {
key: “dog”
value {
minimum_height: 10
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
evaluation_box_config {
key: “door_open”
value {
minimum_height: 10
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
evaluation_box_config {
key: “person”
value {
minimum_height: 10
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
evaluation_box_config {
key: “door_closed”
value {
minimum_height: 10
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
evaluation_box_config {
key: “package”
value {
minimum_height: 10
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
evaluation_box_config {
key: “car”
value {
minimum_height: 10
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
}

dataset_config {
data_sources: {
tfrecords_path: “/workspace/home/label_images/tfrecords/tfrecords-fold-000-of-002-shard-00000-of-00010”
image_directory_path: “/workspace/home/label_images/dataset_root”
}
image_extension: “jpg”
target_class_mapping {
key: “dog”
value: “dog”
}
target_class_mapping {
key: “door_open”
value: “door_open”
}
target_class_mapping {
key: “person”
value: “person”
}
target_class_mapping {
key: “door_closed”
value: “door_closed”
}
target_class_mapping {
key: “package”
value: “package”
}
target_class_mapping {
key: “car”
value: “car”
}
validation_fold: 0
}

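OK, I figured it out. I was pointing tfrecords_path to a single specific tfrecord file when I needed to use the “*” wildcard, as in tfrecords/*, so that all of the tfrecord files are picked up. (The file I had pointed to was a fold-000 shard and validation_fold is 0, so presumably no data was left for training, which would explain steps_per_epoch being 0.)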

I updated the spec file to the snippet below, and training now runs fine.

dataset_config {
data_sources: {
tfrecords_path: “/workspace/home/label_images/tfrecords/*”