Error when running: classification_pyt train -e ./spec.txt

Please provide the following information when requesting support.

• Hardware (2080Ti)
• Network Type (Classification)
• TLT Version (Please run “tlt info --verbose” and share “docker_tag” here)
root@0604619f0b55:/workspace/tao-experiments/classification/efficientnet_b0# tao info
Configuration of the TAO Toolkit Instance
task_group: ['model', 'dataset', 'deploy']
format_version: 3.0
toolkit_version: 5.3.0
published_date: 03/14/2024

• Training spec file (if you have one, please share it here)
results_dir: "/workspace/tao-experiments/classification/efficientnet_b0/results"

model_config {
arch: "efficientnet_b0"
use_bias: False
use_imagenet_head: True
resize_interpolation_method: BICUBIC
input_image_size: "3,224,224"
}
train_config {
preprocess_mode: "caffe"
train_dataset_path: "/workspace/tao-experiments/data/c5/train"
val_dataset_path: "/workspace/tao-experiments/data/c5/test"
optimizer {
sgd {
lr: 0.01
decay: 0.0
momentum: 0.9
nesterov: False
}
}
batch_size_per_gpu: 200
n_epochs: 500
n_workers: 16
reg_config {
type: "L2"
scope: "Conv2D,Dense"
weight_decay: 5e-5
}
lr_config {
cosine {
learning_rate: 0.05
min_lr_ratio: 0.001
}
}
enable_random_crop: True
enable_center_crop: True
enable_color_augmentation: True
mixup_alpha: 0.2
label_smoothing: 0.1
}

train_dataloader {
data_prefix: "/workspace/tao-experiments/data/c5/train"
}

val_dataloader {
data_prefix: "/workspace/tao-experiments/data/c5/test"
}

root@0604619f0b55:/workspace/tao-experiments/classification/efficientnet_b0# cat ~/.tao_mounts.json
{
"Mounts": [
{
"source": "/home/vas/video/tao",
"destination": "/workspace"
}
]
}

• How to reproduce the issue? (This is for errors. Please share the command line and the detailed log here.)

docker run -p 8888:8888 --gpus all -it --rm --privileged -v /home/vas/video/tao:/workspace nvcr.io/nvidia/tao/tao-toolkit:5.3.0-pyt /bin/bash
ngc registry resource download-version "nvidia/tao/cv_samples:v1.4.1"


root@0604619f0b55:/workspace/tao-experiments/classification/efficientnet_b0# classification_pyt train -e ./spec.txt -r /workspace/tao-experiments/classification/efficientnet_b0/results

Output:

One of `ann_file`, `data_root` and `data_prefix` must be specified.
Error executing job with overrides: ['results_dir=/workspace/tao-experiments/classification/efficientnet_b0/results']
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/nvidia_tao_pytorch/cv/classification/scripts/train.py", line 117, in <module>
    main()
  File "/usr/local/lib/python3.10/dist-packages/nvidia_tao_pytorch/core/hydra/hydra_runner.py", line 107, in wrapper
    _run_hydra(
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 389, in _run_hydra
    _run_app(
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 452, in _run_app
    run_and_report(
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 216, in run_and_report
    raise ex
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 213, in run_and_report
    return func()
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 453, in <lambda>
    lambda: hydra.run(
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py", line 132, in run
    _ = ret.return_value
  File "/usr/local/lib/python3.10/dist-packages/hydra/core/utils.py", line 260, in return_value
    raise self._return_value
  File "/usr/local/lib/python3.10/dist-packages/hydra/core/utils.py", line 186, in run_job
    ret.return_value = task_function(task_cfg)
  File "/usr/local/lib/python3.10/dist-packages/nvidia_tao_pytorch/cv/classification/scripts/train.py", line 113, in main
    raise e
  File "/usr/local/lib/python3.10/dist-packages/nvidia_tao_pytorch/cv/classification/scripts/train.py", line 99, in main
    run_experiment(cfg, results_dir=results_dir)
  File "/usr/local/lib/python3.10/dist-packages/nvidia_tao_pytorch/cv/classification/scripts/train.py", line 80, in run_experiment
    runner.train()
  File "/usr/local/lib/python3.10/dist-packages/mmengine/runner/runner.py", line 1728, in train
    self._train_loop = self.build_train_loop(
  File "/usr/local/lib/python3.10/dist-packages/mmengine/runner/runner.py", line 1527, in build_train_loop
    loop = EpochBasedTrainLoop(
  File "/usr/local/lib/python3.10/dist-packages/mmengine/runner/loops.py", line 44, in __init__
    super().__init__(runner, dataloader)
  File "/usr/local/lib/python3.10/dist-packages/mmengine/runner/base_loop.py", line 26, in __init__
    self.dataloader = runner.build_dataloader(
  File "/usr/local/lib/python3.10/dist-packages/mmengine/runner/runner.py", line 1370, in build_dataloader
    dataset = DATASETS.build(dataset_cfg)
  File "/usr/local/lib/python3.10/dist-packages/mmengine/registry/registry.py", line 570, in build
    return self.build_func(cfg, *args, **kwargs, registry=self)
  File "/usr/local/lib/python3.10/dist-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
    obj = obj_cls(**args)  # type: ignore
  File "/usr/local/lib/python3.10/dist-packages/mmpretrain/datasets/imagenet.py", line 122, in __init__
    super().__init__(
  File "/usr/local/lib/python3.10/dist-packages/mmpretrain/datasets/custom.py", line 200, in __init__
    assert (ann_file or data_prefix or data_root), \
AssertionError: One of `ann_file`, `data_root` and `data_prefix` must be specified.
[2024-06-09 15:16:26,180] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 1512) of binary: /usr/bin/python
Traceback (most recent call last):
  File "/usr/local/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 351, in wrapper
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 806, in main
    run(args)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 797, in run
    elastic_launch(
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
/usr/local/lib/python3.10/dist-packages/nvidia_tao_pytorch/cv/classification/scripts/train.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2024-06-09_15:16:26
  host      : 0604619f0b55
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 1512)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
Execution status: FAIL

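Please change to -e ./spec.yaml. The classification_pyt entry point is launched through Hydra (see hydra_runner in the traceback) and expects a YAML spec file, not the protobuf-style spec.txt shown above.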

  1. Could you tell me where I can find a sample spec.yaml for classification?

  2. I tried the spec.cfg in cv_samples_vv1.4.1, but it still failed:
    /workspace/cv_samples_vv1.4.1/classification/tao_voc/specs/classification_spec.cfg.

  3. Has this notebook been tested, and does it work in a TAO docker container? It didn’t work for me.
    cv_samples_vv1.4.1/classification/tao_voc

Please use the latest 5.3.0 notebook. See the commands in the TAO Toolkit Quick Start Guide (NVIDIA Docs).