Please provide the following information when requesting support.
• Hardware (2080Ti)
• Network Type (Classification)
• TLT Version (Please run "tlt info --verbose" and share "docker_tag" here)
root@0604619f0b55:/workspace/tao-experiments/classification/efficientnet_b0# tao info
Configuration of the TAO Toolkit Instance
task_group: ['model', 'dataset', 'deploy']
format_version: 3.0
toolkit_version: 5.3.0
published_date: 03/14/2024
• Training spec file (if you have one, please share it here)
results_dir: "/workspace/tao-experiments/classification/efficientnet_b0/results"
model_config {
  arch: "efficientnet_b0"
  use_bias: False
  use_imagenet_head: True
  resize_interpolation_method: BICUBIC
  input_image_size: "3,224,224"
}
train_config {
  preprocess_mode: "caffe"
  train_dataset_path: "/workspace/tao-experiments/data/c5/train"
  val_dataset_path: "/workspace/tao-experiments/data/c5/test"
  optimizer {
    sgd {
      lr: 0.01
      decay: 0.0
      momentum: 0.9
      nesterov: False
    }
  }
  batch_size_per_gpu: 200
  n_epochs: 500
  n_workers: 16
  reg_config {
    type: "L2"
    scope: "Conv2D,Dense"
    weight_decay: 5e-5
  }
  lr_config {
    cosine {
      learning_rate: 0.05
      min_lr_ratio: 0.001
    }
  }
  enable_random_crop: True
  enable_center_crop: True
  enable_color_augmentation: True
  mixup_alpha: 0.2
  label_smoothing: 0.1
}
train_dataloader {
  data_prefix: "/workspace/tao-experiments/data/c5/train"
}
val_dataloader {
  data_prefix: "/workspace/tao-experiments/data/c5/test"
}
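The data_prefix folders referenced in the spec can be sanity-checked from inside the container; I'm assuming the usual one-subdirectory-per-class layout for a classification dataset here:

# Confirm the dataset paths from the spec are visible inside the container
# and contain one subdirectory per class (subfolder names are my own classes)
ls -d /workspace/tao-experiments/data/c5/train/*/
ls -d /workspace/tao-experiments/data/c5/test/*/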
root@0604619f0b55:/workspace/tao-experiments/classification/efficientnet_b0# cat ~/.tao_mounts.json
{
  "Mounts": [
    {
      "source": "/home/vas/video/tao",
      "destination": "/workspace"
    }
  ]
}
• How to reproduce the issue? (This is for errors. Please share the command line and the detailed log here.)
docker run -p 8888:8888 --gpus all -it --rm --privileged -v /home/vas/video/tao:/workspace nvcr.io/nvidia/tao/tao-toolkit:5.3.0-pyt /bin/bash
ngc registry resource download-version "nvidia/tao/cv_samples:v1.4.1"
root@0604619f0b55:/workspace/tao-experiments/classification/efficientnet_b0# classification_pyt train -e ./spec.txt -r /workspace/tao-experiments/classification/efficientnet_b0/results
One of `ann_file`, `data_root` and `data_prefix` must be specified.
Error executing job with overrides: ['results_dir=/workspace/tao-experiments/classification/efficientnet_b0/results']
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/nvidia_tao_pytorch/cv/classification/scripts/train.py", line 117, in <module>
main()
File "/usr/local/lib/python3.10/dist-packages/nvidia_tao_pytorch/core/hydra/hydra_runner.py", line 107, in wrapper
_run_hydra(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 389, in _run_hydra
_run_app(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 452, in _run_app
run_and_report(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 216, in run_and_report
raise ex
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 213, in run_and_report
return func()
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 453, in <lambda>
lambda: hydra.run(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py", line 132, in run
_ = ret.return_value
File "/usr/local/lib/python3.10/dist-packages/hydra/core/utils.py", line 260, in return_value
raise self._return_value
File "/usr/local/lib/python3.10/dist-packages/hydra/core/utils.py", line 186, in run_job
ret.return_value = task_function(task_cfg)
File "/usr/local/lib/python3.10/dist-packages/nvidia_tao_pytorch/cv/classification/scripts/train.py", line 113, in main
raise e
File "/usr/local/lib/python3.10/dist-packages/nvidia_tao_pytorch/cv/classification/scripts/train.py", line 99, in main
run_experiment(cfg, results_dir=results_dir)
File "/usr/local/lib/python3.10/dist-packages/nvidia_tao_pytorch/cv/classification/scripts/train.py", line 80, in run_experiment
runner.train()
File "/usr/local/lib/python3.10/dist-packages/mmengine/runner/runner.py", line 1728, in train
self._train_loop = self.build_train_loop(
File "/usr/local/lib/python3.10/dist-packages/mmengine/runner/runner.py", line 1527, in build_train_loop
loop = EpochBasedTrainLoop(
File "/usr/local/lib/python3.10/dist-packages/mmengine/runner/loops.py", line 44, in __init__
super().__init__(runner, dataloader)
File "/usr/local/lib/python3.10/dist-packages/mmengine/runner/base_loop.py", line 26, in __init__
self.dataloader = runner.build_dataloader(
File "/usr/local/lib/python3.10/dist-packages/mmengine/runner/runner.py", line 1370, in build_dataloader
dataset = DATASETS.build(dataset_cfg)
File "/usr/local/lib/python3.10/dist-packages/mmengine/registry/registry.py", line 570, in build
return self.build_func(cfg, *args, **kwargs, registry=self)
File "/usr/local/lib/python3.10/dist-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
obj = obj_cls(**args) # type: ignore
File "/usr/local/lib/python3.10/dist-packages/mmpretrain/datasets/imagenet.py", line 122, in __init__
super().__init__(
File "/usr/local/lib/python3.10/dist-packages/mmpretrain/datasets/custom.py", line 200, in __init__
assert (ann_file or data_prefix or data_root), \
AssertionError: One of `ann_file`, `data_root` and `data_prefix` must be specified.
[2024-06-09 15:16:26,180] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 1512) of binary: /usr/bin/python
Traceback (most recent call last):
File "/usr/local/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 351, in wrapper
return f(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
/usr/local/lib/python3.10/dist-packages/nvidia_tao_pytorch/cv/classification/scripts/train.py FAILED
------------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2024-06-09_15:16:26
host : 0604619f0b55
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 1512)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
Execution status: FAIL
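For context, the assertion at the bottom of the traceback is raised in mmpretrain's CustomDataset __init__ (file path and line number taken from the log above); the surrounding code can be inspected inside the container with:

# Print the lines around the failing assert (line 200 per the traceback)
sed -n '190,205p' /usr/local/lib/python3.10/dist-packages/mmpretrain/datasets/custom.py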