DataLossError: corrupted record at 0 when using TFRecords with DetectNet

@Morganh I am still having issues with DetectNet. How can all the values be 0?
{"validation cost": 0.00477045, "mean average precision": 0.0, "average_precision": {"gruenerStein": 0, "gelbeRutsche": 0, "orangePlatte": 0, "schwarzeStange": 0.0, "gelbesAuge": 0.0, "blauerBalken": 0}, "date": "2/15/2022", "time": "17:41:33", "status": "Evaluation Complete"}

Could you share your latest training spec file?

Here you go:

random_seed: 42
dataset_config {
  data_sources {
    tfrecords_path: "/workspace/projects/lego/data/tfrecords_detectnetv2_resnet18/tfrecord-*"
    image_directory_path: "/workspace/projects/lego/data/kitti_detection_1000x1000/train"
  }
  image_extension: "jpg"
  target_class_mapping {
    key: "blauerBalken"
    value: "blauerBalken"
  }
  target_class_mapping {
    key: "gelbeRutsche"
    value: "gelbeRutsche"
  }
  target_class_mapping {
    key: "gelbesAuge"
    value: "gelbesAuge"
  }
  target_class_mapping {
    key: "gruenerStein"
    value: "gruenerStein"
  }
  target_class_mapping {
    key: "orangePlatte"
    value: "orangePlatte"
  }
  target_class_mapping {
    key: "schwarzeStange"
    value: "schwarzeStange"
  }
  validation_fold: 0
}
augmentation_config {
  preprocessing {
    output_image_width: 480
    output_image_height: 480
    min_bbox_width: 1.0
    min_bbox_height: 1.0
    output_image_channel: 3
  }
  spatial_augmentation {
    hflip_probability: 0.5
    vflip_probability: 0.5
    zoom_min: 1.0
    zoom_max: 1.0
    translate_max_x: 8.0
    translate_max_y: 8.0
  }
  color_augmentation {
    hue_rotation_max: 5.0
    saturation_shift_max: 0.9
    contrast_scale_max: 0.1
    contrast_center: 0.5
  }
}
postprocessing_config {
  target_class_config {
    key: "blauerBalken"
    value {
      clustering_config {
        coverage_threshold: 0.005
        minimum_bounding_box_height: 20
        dbscan_eps: 0.15
        dbscan_min_samples: 0.05
      }
    }
  }
  target_class_config {
    key: "gelbeRutsche"
    value {
      clustering_config {
        coverage_threshold: 0.005
        minimum_bounding_box_height: 20
        dbscan_eps: 0.15
        dbscan_min_samples: 0.05
      }
    }
  }
  target_class_config {
    key: "gelbesAuge"
    value {
      clustering_config {
        coverage_threshold: 0.005
        minimum_bounding_box_height: 20
        dbscan_eps: 0.15
        dbscan_min_samples: 0.05
      }
    }
  }
  target_class_config {
    key: "gruenerStein"
    value {
      clustering_config {
        coverage_threshold: 0.005
        minimum_bounding_box_height: 20
        dbscan_eps: 0.15
        dbscan_min_samples: 0.05
      }
    }
  }
  target_class_config {
    key: "orangePlatte"
    value {
      clustering_config {
        coverage_threshold: 0.005
        minimum_bounding_box_height: 20
        dbscan_eps: 0.15
        dbscan_min_samples: 0.05
      }
    }
  }
  target_class_config {
    key: "schwarzeStange"
    value {
      clustering_config {
        coverage_threshold: 0.005
        minimum_bounding_box_height: 20
        dbscan_eps: 0.15
        dbscan_min_samples: 0.05
      }
    }
  }
}
model_config {
  pretrained_model_file: "/workspace/repositories/pretrained_detectnet_v2/pretrained_detectnet_v2_vresnet18/resnet18.hdf5"
  num_layers: 18
  use_batch_norm: true
  objective_set {
    bbox {
      scale: 35.0
      offset: 0.5
    }
    cov {
    }
  }
  freeze_blocks: 0.0
  freeze_blocks: 1.0
  arch: "resnet"
  all_projections: true
}
evaluation_config {
  validation_period_during_training: 2
  first_validation_epoch: 2
  minimum_detection_ground_truth_overlap {
    key: "blauerBalken"
    value: 0.6
  }
  minimum_detection_ground_truth_overlap {
    key: "gelbeRutsche"
    value: 0.6
  }
  minimum_detection_ground_truth_overlap {
    key: "gelbesAuge"
    value: 0.6
  }
  minimum_detection_ground_truth_overlap {
    key: "gruenerStein"
    value: 0.6
  }
  minimum_detection_ground_truth_overlap {
    key: "orangePlatte"
    value: 0.6
  }
  minimum_detection_ground_truth_overlap {
    key: "schwarzeStange"
    value: 0.6
  }
  evaluation_box_config {
    key: "blauerBalken"
    value {
      minimum_height: 20
      maximum_height: 200
      minimum_width: 10
      maximum_width: 200
    }
  }
  evaluation_box_config {
    key: "gelbeRutsche"
    value {
      minimum_height: 20
      maximum_height: 200
      minimum_width: 10
      maximum_width: 200
    }
  }
  evaluation_box_config {
    key: "gelbesAuge"
    value {
      minimum_height: 20
      maximum_height: 200
      minimum_width: 10
      maximum_width: 200
    }
  }
  evaluation_box_config {
    key: "gruenerStein"
    value {
      minimum_height: 20
      maximum_height: 200
      minimum_width: 10
      maximum_width: 200
    }
  }
  evaluation_box_config {
    key: "orangePlatte"
    value {
      minimum_height: 20
      maximum_height: 200
      minimum_width: 10
      maximum_width: 200
    }
  }
  evaluation_box_config {
    key: "schwarzeStange"
    value {
      minimum_height: 20
      maximum_height: 200
      minimum_width: 10
      maximum_width: 200
    }
  }
  average_precision_mode: INTEGRATE
}
cost_function_config {
  target_classes {
    name: "gruenerStein"
    class_weight: 1.0
    coverage_foreground_weight: 0.05
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "gelbeRutsche"
    class_weight: 1.0
    coverage_foreground_weight: 0.05
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "orangePlatte"
    class_weight: 1.0
    coverage_foreground_weight: 0.05
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "schwarzeStange"
    class_weight: 1.0
    coverage_foreground_weight: 0.05
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "gelbesAuge"
    class_weight: 1.0
    coverage_foreground_weight: 0.05
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "blauerBalken"
    class_weight: 1.0
    coverage_foreground_weight: 0.05
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  enable_autoweighting: true
  max_objective_weight: 0.9999
  min_objective_weight: 0.0001
}
training_config {
  batch_size_per_gpu: 1
  num_epochs: 120
  learning_rate {
    soft_start_annealing_schedule {
      min_learning_rate: 5e-06
      max_learning_rate: 0.0005
      soft_start: 0.1
      annealing: 0.7
    }
  }
  regularizer {
    type: L1
    weight: 3e-09
  }
  optimizer {
    adam {
      epsilon: 1e-08
      beta1: 0.9
      beta2: 0.999
    }
  }
  cost_scaling {
    initial_exponent: 20.0
    increment: 0.005
    decrement: 1.0
  }
  checkpoint_interval: 1
}
bbox_rasterizer_config {
  target_class_config {
    key: "blauerBalken"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 0.4
      cov_radius_y: 0.4
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "gelbeRutsche"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 0.4
      cov_radius_y: 0.4
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "gelbesAuge"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 0.4
      cov_radius_y: 0.4
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "gruenerStein"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 0.4
      cov_radius_y: 0.4
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "orangePlatte"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 0.4
      cov_radius_y: 0.4
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "schwarzeStange"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 0.4
      cov_radius_y: 0.4
      bbox_min_radius: 1.0
    }
  }
  deadzone_radius: 0.67
}

You mentioned earlier that your training images are resized to 1000x1000,
but the training spec is set to

output_image_width: 480
output_image_height: 480

For this case, you need to add "enable_auto_resize: True", i.e.:

output_image_width: 480
output_image_height: 480
enable_auto_resize: True

See more info in DetectNet_v2 — TAO Toolkit 3.21.11 documentation
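
The full preprocessing block would then read (same values as in the spec above, with only the new flag added):

preprocessing {
  output_image_width: 480
  output_image_height: 480
  min_bbox_width: 1.0
  min_bbox_height: 1.0
  output_image_channel: 3
  enable_auto_resize: true
}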

Good catch, thanks!

Sadly, it did not fix my issue. I tried with enable_auto_resize: True and also with a dataset preformatted to 480x480.
One thing I noticed while training is that nvidia-smi did not show any GPU usage whatsoever (power- and memory-wise; utilization always reads N/A).
Moreover, there was no significant CPU or RAM usage either, so I really don't know what the program is doing at all.

Could you share the result of nvidia-smi?

Could you share the training log and upload it as a file?

nvidia-smi before training:

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.00       Driver Version: 510.06       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA GeForce ...  On   | 00000000:09:00.0 Off |                  N/A |
|  0%   45C    P8    24W / 350W |    706MiB / 24576MiB |     N/A      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  On   | 00000000:0A:00.0 Off |                  N/A |
| 35%   34C    P8    N/A /  19W |     71MiB /  2048MiB |     N/A      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

and while training:

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.00       Driver Version: 510.06       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA GeForce ...  On   | 00000000:09:00.0 Off |                  N/A |
|  0%   47C    P8    24W / 350W |    938MiB / 24576MiB |     N/A      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  On   | 00000000:0A:00.0 Off |                  N/A |
| 52%   71C    P0    N/A /  19W |   1840MiB /  2048MiB |     N/A      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

That can't be right. The GPU power consumption should be much higher if it were under any load.
Also, I cannot believe that I can only run training with a batch size of 1 on 25 GB of VRAM. Does it not get used at all?

.tao_mounts.json:

{
    "Mounts": [
        {
            "source": "~/nvidia-tao/projects",
            "destination": "/workspace/projects"
        },
        {
            "source": "~/nvidia-tao/repositories",
            "destination": "/workspace/repositories"
        }
    ],
    "Envs": [
        {
            "variable": "CUDA_VISIBLE_DEVICES",
            "value": "0"
        },
        {
            "variable": "DALI_DISABLE_NVML",
            "value": "1"
        }
    ],
    "DockerOptions": {
        "shm_size": "16G",
        "ulimits": {
            "memlock": -1,
            "stack": 67108864
        },
        "user": "1000:1000",
        "ports": {}
    }
}

I will add the train.log later.

  1. Are you running WSL?
  2. Why did you set the environment variables above? Could you share a link to the user guide you followed?

Yes, I am running Ubuntu inside WSL 2.

I set CUDA_VISIBLE_DEVICES so I don't accidentally use the second GPU, which is much slower.
DALI_DISABLE_NVML works around a DALI issue on WSL2 (A nvml internal driver error occured (on WSL2) · Issue #2921 · NVIDIA/DALI · GitHub).
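
As an aside, CUDA_VISIBLE_DEVICES only masks devices at the CUDA level; nvidia-smi talks to NVML and still lists both cards, which is why both show up above. A quick check from inside the container (a sketch, assuming TensorFlow is available there):

# Should print only the first GPU when CUDA_VISIBLE_DEVICES=0 is set
CUDA_VISIBLE_DEVICES=0 python -c "from tensorflow.python.client import device_lib; print([d.name for d in device_lib.list_local_devices() if d.device_type == 'GPU'])"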

To narrow this down, could you try one of the two ways below?

--runtime nvidia results in an error stating that the runtime cannot be found, but without it the container starts and I can call nvidia-smi.

I’ll try to run the training from the container now.

Here is the log up to epoch 56. There is no change between epochs, so I stopped the training early.
train.log (672.2 KB)

So you did not install nvidia-docker2. It is required; see TAO Toolkit Quick Start Guide — TAO Toolkit 3.21.11 documentation.
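
A quick way to verify on the host (assuming a Debian/Ubuntu install):

# Package present?
dpkg -l nvidia-docker2
# Runtime registered? "docker info" should list nvidia under Runtimes.
docker info | grep -i runtime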

I'll check back with my system admin. It worked fine just a couple of days ago with a DSSD model.

Edit:
nvidia-docker2 is already the newest version (2.9.1-1).

Maybe I have to restart the Docker daemon…
Does TAO not use --runtime nvidia? I cannot start containers using that flag, and this guide does not mention --runtime nvidia (Installation Guide — NVIDIA Cloud Native Technologies documentation).

Any error?

Just docker: Error response from daemon: Unknown runtime specified nvidia.

The output of docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi is:

Thu Feb 17 18:59:11 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.00       Driver Version: 510.06       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA GeForce ...  On   | 00000000:09:00.0 Off |                  N/A |
|  0%   37C    P8    14W / 350W |    520MiB / 24576MiB |     N/A      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  On   | 00000000:0A:00.0 Off |                  N/A |
| 35%   33C    P8    N/A /  19W |     81MiB /  2048MiB |     N/A      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

For above error, please refer to Docker instantiation failed with error: 500 Server Error: Internal Server Error ("OCI runtime create failed...) - #15 by Morganh

sudo systemctl restart docker
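
If the runtime is still unknown after restarting, it is usually not registered in /etc/docker/daemon.json. A typical registration looks like this (a sketch; the file location and service handling may differ under WSL2):

{
    "runtimes": {
        "nvidia": {
            "path": "nvidia-container-runtime",
            "runtimeArgs": []
        }
    }
}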
