I have reached the next step, but I still get an error. The job fails with "Error executing job with overrides:" followed by an IndexError (full log below).

My command is:
action_recognition train \
-e /workspace/Nyan/tao_source_codes_v5.0.0/notebooks/tao_launcher_starter_kit/action_recognition_net/specs/experiment_rgb_3d_finetune.yaml \
-k nvidia_tao \
results_dir=/workspace/Nyan/tao_source_codes_v5.0.0/notebooks/tao_launcher_starter_kit/action_recognition_net/results/rgb_3d_ptm \
model.rgb_pretrained_model_path=/workspace/Nyan/tao_source_codes_v5.0.0/notebooks/tao_launcher_starter_kit/action_recognition_net/pretrained/resnet18_3d_rgb_hmdb5_32.tlt \
model.rgb_pretrained_num_classes=5
My spec files are:
experiment_rgb_3d_finetune.yaml (1.1 KB)
train_rgb_3d_finetune.yaml (1013 Bytes)
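(For reference, here is a quick way to dump the sampling-related fields those specs resolve to. This is a minimal PyYAML sketch; the key names (model.rgb_seq_length, model.sample_strategy, dataset.train_dataset_dir, dataset.val_dataset_dir) are assumptions based on the TAO 5.0 action_recognition schema, so check them against the actual files.)

```python
# Minimal sketch: print the fields of the spec that control frame sampling.
# Key names are assumptions from the TAO 5.0 action_recognition schema;
# adjust them if the attached specs use different keys.
import yaml

SPEC = "specs/experiment_rgb_3d_finetune.yaml"  # path assumed

with open(SPEC) as f:
    cfg = yaml.safe_load(f)

model = cfg.get("model", {})
dataset = cfg.get("dataset", {})
print("rgb_seq_length :", model.get("rgb_seq_length"))
print("sample_strategy:", model.get("sample_strategy"))
print("train dir      :", dataset.get("train_dataset_dir"))
print("val dir        :", dataset.get("val_dataset_dir"))
```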
root@00c2c3d0f99f:/workspace/Nyan/tao_source_codes_v5.0.0/notebooks/tao_launcher_starter_kit/action_recognition_net# action_recognition train -e /workspace/Nyan/tao_source_codes_v5.0.0/notebooks/tao_launcher_starter_kit/action_recognition_net/specs/experiment_rgb_3d_finetune.yaml -k nvidia_tao results_dir=/workspace/Nyan/tao_source_codes_v5.0.0/notebooks/tao_launcher_starter_kit/action_recognition_net/results/rgb_3d_ptm model.rgb_pretrained_model_path=/workspace/Nyan/tao_source_codes_v5.0.0/notebooks/tao_launcher_starter_kit/action_recognition_net/pretrained/resnet18_3d_rgb_hmdb5_32.tlt model.rgb_pretrained_num_classes=5
[2023-08-22 05:19:50,840 - TAO Toolkit - torch.distributed.nn.jit.instantiator - INFO] Created a temporary directory at /tmp/tmpp1qu75as
[2023-08-22 05:19:50,841 - TAO Toolkit - torch.distributed.nn.jit.instantiator - INFO] Writing /tmp/tmpp1qu75as/_remote_module_non_scriptable.py
sys:1: UserWarning:
'experiment_rgb_3d_finetune.yaml' is validated against ConfigStore schema with the same name.
This behavior is deprecated in Hydra 1.1 and will be removed in Hydra 1.2.
See https://hydra.cc/docs/next/upgrades/1.0_to_1.1/automatic_schema_matching for migration instructions.
<frozen core.hydra.hydra_runner>:107: UserWarning:
'experiment_rgb_3d_finetune.yaml' is validated against ConfigStore schema with the same name.
This behavior is deprecated in Hydra 1.1 and will be removed in Hydra 1.2.
See https://hydra.cc/docs/next/upgrades/1.0_to_1.1/automatic_schema_matching for migration instructions.
/usr/local/lib/python3.8/dist-packages/hydra/_internal/hydra.py:119: UserWarning: Future Hydra versions will no longer change working directory at job runtime by default.
See https://hydra.cc/docs/next/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
ret = run_job(
loading trained weights from /workspace/Nyan/tao_source_codes_v5.0.0/notebooks/tao_launcher_starter_kit/action_recognition_net/pretrained/resnet18_3d_rgb_hmdb5_32.tlt
ResNet3d(
  (conv1): Conv3d(3, 64, kernel_size=(5, 7, 7), stride=(2, 2, 2), padding=(2, 3, 3), bias=False)
  (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool3d(kernel_size=(1, 3, 3), stride=2, padding=(0, 1, 1), dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock3d(
      (conv1): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
      (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
      (bn2): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock3d(
      (conv1): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
      (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
      (bn2): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer2): Sequential(
    (0): BasicBlock3d(
      (conv1): Conv3d(64, 128, kernel_size=(3, 3, 3), stride=(1, 2, 2), padding=(1, 1, 1), bias=False)
      (bn1): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
      (bn2): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv3d(64, 128, kernel_size=(1, 1, 1), stride=(1, 2, 2), bias=False)
        (1): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): BasicBlock3d(
      (conv1): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
      (bn1): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
      (bn2): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer3): Sequential(
    (0): BasicBlock3d(
      (conv1): Conv3d(128, 256, kernel_size=(3, 3, 3), stride=(1, 2, 2), padding=(1, 1, 1), bias=False)
      (bn1): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
      (bn2): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv3d(128, 256, kernel_size=(1, 1, 1), stride=(1, 2, 2), bias=False)
        (1): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): BasicBlock3d(
      (conv1): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
      (bn1): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
      (bn2): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer4): Sequential(
    (0): BasicBlock3d(
      (conv1): Conv3d(256, 512, kernel_size=(3, 3, 3), stride=(1, 2, 2), padding=(1, 1, 1), bias=False)
      (bn1): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
      (bn2): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv3d(256, 512, kernel_size=(1, 1, 1), stride=(1, 2, 2), bias=False)
        (1): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): BasicBlock3d(
      (conv1): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
      (bn1): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
      (bn2): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (avg_pool): AdaptiveAvgPool3d(output_size=(1, 1, 1))
  (fc_cls): Linear(in_features=512, out_features=4, bias=True)
)
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py:441: LightningDeprecationWarning: Setting `Trainer(gpus=[0])` is deprecated in v1.7 and will be removed in v2.0. Please use `Trainer(accelerator='gpu', devices=[0])` instead.
rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /workspace/Nyan/tao_source_codes_v5.0.0/notebooks/tao_launcher_starter_kit/action_recognition_net/results/rgb_3d_ptm/train/lightning_logs
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:604: UserWarning: Checkpoint directory /workspace/Nyan/tao_source_codes_v5.0.0/notebooks/tao_launcher_starter_kit/action_recognition_net/results/rgb_3d_ptm/train exists and is not empty.
rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
Train dataset samples: 860
Validation dataset samples: 215
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Adjusting learning rate of group 0 to 1.0000e-03.
   | Name           | Type     | Params
--------------------------------------------
0  | model          | ResNet3d | 33.2 M
1  | train_accuracy | Accuracy | 0
2  | val_accuracy   | Accuracy | 0
--------------------------------------------
33.2 M    Trainable params
0         Non-trainable params
33.2 M    Total params
132.749   Total estimated model params size (MB)
Sanity Checking: 0it [00:00, ?it/s]
<frozen cv.action_recognition.dataloader.frame_sampler>:125: RuntimeWarning: divide by zero encountered in remainder
<frozen cv.action_recognition.dataloader.frame_sampler>:125: RuntimeWarning: divide by zero encountered in remainder
Caught IndexError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
data = fetcher.fetch(index)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/fetch.py", line 58, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/fetch.py", line 58, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "<frozen cv.action_recognition.dataloader.ar_dataset>", line 385, in __getitem__
File "<frozen cv.action_recognition.dataloader.ar_dataset>", line 349, in get_frames
IndexError: list index out of range
Error executing job with overrides: ['encryption_key=nvidia_tao', 'results_dir=/workspace/Nyan/tao_source_codes_v5.0.0/notebooks/tao_launcher_starter_kit/action_recognition_net/results/rgb_3d_ptm', 'model.rgb_pretrained_model_path=/workspace/Nyan/tao_source_codes_v5.0.0/notebooks/tao_launcher_starter_kit/action_recognition_net/pretrained/resnet18_3d_rgb_hmdb5_32.tlt', 'model.rgb_pretrained_num_classes=5']
An error occurred during Hydra's exception formatting:
AssertionError()
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/hydra/_internal/utils.py", line 254, in run_and_report
assert mdl is not None
AssertionError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "</usr/local/lib/python3.8/dist-packages/nvidia_tao_pytorch/cv/action_recognition/scripts/train.py>", line 3, in <module>
File "<frozen cv.action_recognition.scripts.train>", line 135, in <module>
File "<frozen core.hydra.hydra_runner>", line 107, in wrapper
File "/usr/local/lib/python3.8/dist-packages/hydra/_internal/utils.py", line 389, in _run_hydra
_run_app(
File "/usr/local/lib/python3.8/dist-packages/hydra/_internal/utils.py", line 452, in _run_app
run_and_report(
File "/usr/local/lib/python3.8/dist-packages/hydra/_internal/utils.py", line 296, in run_and_report
raise ex
File "/usr/local/lib/python3.8/dist-packages/hydra/_internal/utils.py", line 213, in run_and_report
return func()
File "/usr/local/lib/python3.8/dist-packages/hydra/_internal/utils.py", line 453, in <lambda>
lambda: hydra.run(
File "/usr/local/lib/python3.8/dist-packages/hydra/_internal/hydra.py", line 132, in run
_ = ret.return_value
File "/usr/local/lib/python3.8/dist-packages/hydra/core/utils.py", line 260, in return_value
raise self._return_value
File "/usr/local/lib/python3.8/dist-packages/hydra/core/utils.py", line 186, in run_job
ret.return_value = task_function(task_cfg)
File "<frozen cv.action_recognition.scripts.train>", line 131, in main
File "<frozen cv.action_recognition.scripts.train>", line 120, in main
File "<frozen cv.action_recognition.scripts.train>", line 95, in run_experiment
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 603, in fit
call._call_and_handle_interrupt(
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 645, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1098, in _run
results = self._run_stage()
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1177, in _run_stage
self._run_train()
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1190, in _run_train
self._run_sanity_check()
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1262, in _run_sanity_check
val_loop.run()
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 152, in advance
dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 121, in advance
batch = next(data_fetcher)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/utilities/fetching.py", line 184, in __next__
return self.fetching_function()
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/utilities/fetching.py", line 265, in fetching_function
self._fetch_next_batch(self.dataloader_iter)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/utilities/fetching.py", line 280, in _fetch_next_batch
batch = next(iterator)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py", line 635, in __next__
data = self._next_data()
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py", line 1347, in _next_data
return self._process_data(data)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py", line 1373, in _process_data
data.reraise()
File "/usr/local/lib/python3.8/dist-packages/torch/_utils.py", line 636, in reraise
raise exception
IndexError: Caught IndexError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
data = fetcher.fetch(index)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/fetch.py", line 58, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/fetch.py", line 58, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "<frozen cv.action_recognition.dataloader.ar_dataset>", line 385, in __getitem__
File "<frozen cv.action_recognition.dataloader.ar_dataset>", line 349, in get_frames
IndexError: list index out of range
[2023-08-22 05:20:05,499 - TAO Toolkit - root - ERROR] Execution status: FAIL
root@00c2c3d0f99f:/workspace/Nyan/tao_source_codes_v5.0.0/notebooks/tao_launcher_starter_kit/action_recognition_net#
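The two warnings just before the failure point at the data rather than the model: the "divide by zero encountered in remainder" warning in frame_sampler is what you would see if a clip reports zero frames, and the "IndexError: list index out of range" in get_frames fits a clip that has fewer frames than the requested sequence length (32, judging by the resnet18_3d_rgb_hmdb5_32 checkpoint name). A quick scan of the dataset directories can confirm this. The sketch below assumes the HMDB-style layout from the TAO notebook, <dataset_dir>/<class>/<clip>/rgb/<frame>.png, and a 32-frame sequence length; both are assumptions to adjust for your setup.

```python
# Sketch: list clips whose rgb folder is missing, empty, or shorter than
# the sequence length. The layout <dataset_dir>/<class>/<clip>/rgb/*.png
# and SEQ_LEN = 32 are assumptions; adjust to match the spec.
import os
import sys

SEQ_LEN = 32  # assumed value of model.rgb_seq_length

def scan(dataset_dir):
    """Yield (clip_path, frame_count) for clips with fewer than SEQ_LEN frames."""
    for cls in sorted(os.listdir(dataset_dir)):
        cls_dir = os.path.join(dataset_dir, cls)
        if not os.path.isdir(cls_dir):
            continue
        for clip in sorted(os.listdir(cls_dir)):
            rgb_dir = os.path.join(cls_dir, clip, "rgb")
            n_frames = len(os.listdir(rgb_dir)) if os.path.isdir(rgb_dir) else 0
            if n_frames < SEQ_LEN:
                yield os.path.join(cls, clip), n_frames

if __name__ == "__main__":
    for dataset_dir in sys.argv[1:]:
        print(f"== {dataset_dir} ==")
        for clip, n_frames in scan(dataset_dir):
            print(f"{n_frames:5d} frames  {clip}")
```

Running it over both the train and validation directories, e.g. python scan_frames.py /path/to/train /path/to/test, should list any clip with 0 frames (which would explain the modulo by zero in the sampler) or fewer than 32; removing or re-extracting those clips would be the first thing to try.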