Hi Morganh,
Some of the intermediate checkpoints cannot be saved. Whenever this happens, the container stops. With 20 epochs, training always crashes at epoch 12, so I have never reached the last one. With 10 epochs, I can reach the last checkpoint by restarting training a couple of times from previous checkpoints.
Here is the complete log of a crash after the first epoch:
…
Epoch: 0.996353/20:, Cur-Step: 6010, loss(cross_entropy): 0.65571, Running average loss:0.17416, Time taken: 0 ETA: 0.0
2024-07-31 06:53:10,233 [TAO Toolkit] [INFO] main 161: Epoch: 0.996353/20:, Cur-Step: 6010, loss(cross_entropy): 0.65571, Running average loss:0.17416, Time taken: 0 ETA: 0.0
Epoch: 0.998011/20:, Cur-Step: 6020, loss(cross_entropy): 0.05284, Running average loss:0.17403, Time taken: 0 ETA: 0.0
2024-07-31 06:53:11,344 [TAO Toolkit] [INFO] main 161: Epoch: 0.998011/20:, Cur-Step: 6020, loss(cross_entropy): 0.05284, Running average loss:0.17403, Time taken: 0 ETA: 0.0
Epoch: 0.999668/20:, Cur-Step: 6030, loss(cross_entropy): 0.07221, Running average loss:0.17392, Time taken: 0 ETA: 0.0
2024-07-31 06:53:13,135 [TAO Toolkit] [INFO] main 161: Epoch: 0.999668/20:, Cur-Step: 6030, loss(cross_entropy): 0.07221, Running average loss:0.17392, Time taken: 0 ETA: 0.0
INFO:tensorflow:Saving checkpoints for step-6032.
2024-07-31 06:53:13,423 [TAO Toolkit] [INFO] tensorflow 76: Saving checkpoints for step-6032.
2024-07-31 06:54:18,771 [TAO Toolkit] [INFO] root 2102: Dst tensor is not initialized.
[[node conv2d_3/kernel/Adam (defined at /tensorflow_core/python/framework/ops.py:1748) ]]
Original stack trace for ‘conv2d_3/kernel/Adam’:
File “/nvidia_tao_tf1/cv/unet/scripts/train.py”, line 578, in
main()
File “/nvidia_tao_tf1/cv/unet/scripts/train.py”, line 551, in main
run_experiment(config_path=args.experiment_spec_file,
File “/nvidia_tao_tf1/cv/unet/scripts/train.py”, line 423, in run_experiment
train_unet(results_dir, experiment_spec, ptm, model_file,
File “/nvidia_tao_tf1/cv/unet/scripts/train.py”, line 298, in train_unet
run_training_loop(estimator, dataset, params, unet_model,
File “/nvidia_tao_tf1/cv/unet/scripts/train.py”, line 131, in run_training_loop
estimator.train(
File “/tensorflow_estimator/python/estimator/estimator.py”, line 370, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File “/tensorflow_estimator/python/estimator/estimator.py”, line 1161, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File “/tensorflow_estimator/python/estimator/estimator.py”, line 1190, in _train_model_default
estimator_spec = self._call_model_fn(
File “/tensorflow_estimator/python/estimator/estimator.py”, line 1149, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File “/nvidia_tao_tf1/cv/unet/utils/model_fn.py”, line 356, in unet_fn
train_op = opt.minimize(total_loss, gate_gradients=gate_gradients,
File “/tensorflow_core/python/training/optimizer.py”, line 428, in minimize
return self.apply_gradients(grads_and_vars, global_step=global_step,
File “/tensorflow_core/python/training/optimizer.py”, line 687, in apply_gradients
maybe_apply_op = smart_cond.smart_cond(should_apply_grads, apply_fn,
File “/tensorflow_core/python/framework/smart_cond.py”, line 58, in smart_cond
return control_flow_ops.cond(pred, true_fn=true_fn, false_fn=false_fn,
File “/tensorflow_core/python/util/deprecation.py”, line 513, in new_func
return func(*args, **kwargs)
File “/tensorflow_core/python/ops/control_flow_ops.py”, line 1224, in cond
orig_res_t, res_t = context_t.BuildCondBranch(true_fn)
File “/tensorflow_core/python/ops/control_flow_ops.py”, line 1061, in BuildCondBranch
original_result = fn()
File “/tensorflow_core/python/training/optimizer.py”, line 640, in apply_fn
self._create_slots(var_list)
File “/tensorflow_core/python/training/adam.py”, line 131, in _create_slots
self._zeros_slot(v, “m”, self._name)
File “/tensorflow_core/python/training/optimizer.py”, line 1224, in _zeros_slot
new_slot_variable = slot_creator.create_zeros_slot(var, op_name)
File “/tensorflow_core/python/training/slot_creator.py”, line 188, in create_zeros_slot
return create_slot_with_initializer(
File “/tensorflow_core/python/training/slot_creator.py”, line 163, in create_slot_with_initializer
return _create_slot_var(primary, initializer, “”, validate_shape, shape,
File “/tensorflow_core/python/training/slot_creator.py”, line 67, in _create_slot_var
slot = variable_scope.get_variable(
File “/tensorflow_core/python/ops/variable_scope.py”, line 1484, in get_variable
return get_variable_scope().get_variable(
File “/tensorflow_core/python/ops/variable_scope.py”, line 1227, in get_variable
return var_store.get_variable(
File “/tensorflow_core/python/ops/variable_scope.py”, line 552, in get_variable
return _true_getter(
File “/tensorflow_core/python/ops/variable_scope.py”, line 505, in _true_getter
return self._get_single_variable(
File “/tensorflow_core/python/ops/variable_scope.py”, line 922, in _get_single_variable
v = variables.VariableV1(
File “/tensorflow_core/python/ops/variables.py”, line 258, in call
return cls._variable_v1_call(*args, **kwargs)
File “/tensorflow_core/python/ops/variables.py”, line 204, in _variable_v1_call
return previous_getter(
File “/tensorflow_core/python/ops/variables.py”, line 197, in
previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
File “/tensorflow_core/python/ops/variable_scope.py”, line 2505, in default_variable_creator
return variables.RefVariable(
File “/tensorflow_core/python/ops/variables.py”, line 262, in call
return super(VariableMetaclass, cls).call(*args, **kwargs)
File “/tensorflow_core/python/ops/variables.py”, line 1676, in init
self._init_from_args(
File “/tensorflow_core/python/ops/variables.py”, line 1823, in _init_from_args
self._variable = state_ops.variable_op_v2(
File “/tensorflow_core/python/ops/state_ops.py”, line 74, in variable_op_v2
return gen_state_ops.variable_v2(
File “/tensorflow_core/python/ops/gen_state_ops.py”, line 1619, in variable_v2
_, _, _op = _op_def_lib._apply_op_helper(
File “/tensorflow_core/python/framework/op_def_library.py”, line 792, in _apply_op_helper
op = g.create_op(op_type_name, inputs, dtypes=None, name=scope,
File “/tensorflow_core/python/util/deprecation.py”, line 513, in new_func
return func(*args, **kwargs)
File “/tensorflow_core/python/framework/ops.py”, line 3356, in create_op
return self._create_op_internal(op_type, inputs, dtypes, input_types, name,
File “/tensorflow_core/python/framework/ops.py”, line 3418, in _create_op_internal
ret = Operation(
File “/tensorflow_core/python/framework/ops.py”, line 1748, in init
self._traceback = tf_stack.extract_stack()
Traceback (most recent call last):
File “/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py”, line 1365, in _do_call
return fn(*args)
File “/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py”, line 1349, in _run_fn
return self._call_tf_sessionrun(options, feed_dict, fetch_list,
File “/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py”, line 1441, in _call_tf_sessionrun
return tf_session.TF_SessionRun_wrapper(self._session, options, feed_dict,
tensorflow.python.framework.errors_impl.InternalError: Dst tensor is not initialized.
[[{{node conv2d_3/kernel/Adam}}]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File “/usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/unet/scripts/train.py”, line 578, in
main()
File “/usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/unet/scripts/train.py”, line 570, in main
raise e
File “/usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/unet/scripts/train.py”, line 551, in main
run_experiment(config_path=args.experiment_spec_file,
File “/usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/unet/scripts/train.py”, line 423, in run_experiment
train_unet(results_dir, experiment_spec, ptm, model_file,
File “/usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/unet/scripts/train.py”, line 298, in train_unet
run_training_loop(estimator, dataset, params, unet_model,
File “/usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/unet/scripts/train.py”, line 131, in run_training_loop
estimator.train(
File “/usr/local/lib/python3.8/dist-packages/tensorflow_estimator/python/estimator/estimator.py”, line 370, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File “/usr/local/lib/python3.8/dist-packages/tensorflow_estimator/python/estimator/estimator.py”, line 1161, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File “/usr/local/lib/python3.8/dist-packages/tensorflow_estimator/python/estimator/estimator.py”, line 1193, in _train_model_default
return self._train_with_estimator_spec(estimator_spec, worker_hooks,
File “/usr/local/lib/python3.8/dist-packages/tensorflow_estimator/python/estimator/estimator.py”, line 1494, in _train_with_estimator_spec
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File “/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py”, line 750, in run
return self._sess.run(
File “/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py”, line 1255, in run
return self._sess.run(
File “/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py”, line 1360, in run
raise six.reraise(*original_exc_info)
File “/usr/local/lib/python3.8/dist-packages/six.py”, line 719, in reraise
raise value
File “/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py”, line 1345, in run
return self._sess.run(*args, **kwargs)
File “/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py”, line 1421, in run
hook.after_run(
File “/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/basic_session_run_hooks.py”, line 594, in after_run
if self._save(run_context.session, global_step):
File “/usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/unet/hooks/checkpoint_saver_hook.py”, line 85, in _save
self._save_checkpoint(session, step)
File “/usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/unet/hooks/checkpoint_saver_hook.py”, line 104, in _save_checkpoint
saver.save(session, os.path.join(ckzip_folder, “model.ckpt”), global_step=epoch)
File “/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/saver.py”, line 1174, in save
model_checkpoint_path = sess.run(
File “/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py”, line 955, in run
result = self._run(None, fetches, feed_dict, options_ptr,
File “/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py”, line 1179, in _run
results = self._do_run(handle, final_targets, final_fetches,
File “/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py”, line 1358, in _do_run
return self._do_call(_run_fn, feeds, fetches, targets, options,
File “/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/client/session.py”, line 1384, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InternalError: Dst tensor is not initialized.
[[node conv2d_3/kernel/Adam (defined at /tensorflow_core/python/framework/ops.py:1748) ]]
Original stack trace for ‘conv2d_3/kernel/Adam’:
File “/nvidia_tao_tf1/cv/unet/scripts/train.py”, line 578, in
main()
File “/nvidia_tao_tf1/cv/unet/scripts/train.py”, line 551, in main
run_experiment(config_path=args.experiment_spec_file,
File “/nvidia_tao_tf1/cv/unet/scripts/train.py”, line 423, in run_experiment
train_unet(results_dir, experiment_spec, ptm, model_file,
File “/nvidia_tao_tf1/cv/unet/scripts/train.py”, line 298, in train_unet
run_training_loop(estimator, dataset, params, unet_model,
File “/nvidia_tao_tf1/cv/unet/scripts/train.py”, line 131, in run_training_loop
estimator.train(
File “/tensorflow_estimator/python/estimator/estimator.py”, line 370, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File “/tensorflow_estimator/python/estimator/estimator.py”, line 1161, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File “/tensorflow_estimator/python/estimator/estimator.py”, line 1190, in _train_model_default
estimator_spec = self._call_model_fn(
File “/tensorflow_estimator/python/estimator/estimator.py”, line 1149, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File “/nvidia_tao_tf1/cv/unet/utils/model_fn.py”, line 356, in unet_fn
train_op = opt.minimize(total_loss, gate_gradients=gate_gradients,
File “/tensorflow_core/python/training/optimizer.py”, line 428, in minimize
return self.apply_gradients(grads_and_vars, global_step=global_step,
File “/tensorflow_core/python/training/optimizer.py”, line 687, in apply_gradients
maybe_apply_op = smart_cond.smart_cond(should_apply_grads, apply_fn,
File “/tensorflow_core/python/framework/smart_cond.py”, line 58, in smart_cond
return control_flow_ops.cond(pred, true_fn=true_fn, false_fn=false_fn,
File “/tensorflow_core/python/util/deprecation.py”, line 513, in new_func
return func(*args, **kwargs)
File “/tensorflow_core/python/ops/control_flow_ops.py”, line 1224, in cond
orig_res_t, res_t = context_t.BuildCondBranch(true_fn)
File “/tensorflow_core/python/ops/control_flow_ops.py”, line 1061, in BuildCondBranch
original_result = fn()
File “/tensorflow_core/python/training/optimizer.py”, line 640, in apply_fn
self._create_slots(var_list)
File “/tensorflow_core/python/training/adam.py”, line 131, in _create_slots
self._zeros_slot(v, “m”, self._name)
File “/tensorflow_core/python/training/optimizer.py”, line 1224, in _zeros_slot
new_slot_variable = slot_creator.create_zeros_slot(var, op_name)
File “/tensorflow_core/python/training/slot_creator.py”, line 188, in create_zeros_slot
return create_slot_with_initializer(
File “/tensorflow_core/python/training/slot_creator.py”, line 163, in create_slot_with_initializer
return _create_slot_var(primary, initializer, “”, validate_shape, shape,
File “/tensorflow_core/python/training/slot_creator.py”, line 67, in _create_slot_var
slot = variable_scope.get_variable(
File “/tensorflow_core/python/ops/variable_scope.py”, line 1484, in get_variable
return get_variable_scope().get_variable(
File “/tensorflow_core/python/ops/variable_scope.py”, line 1227, in get_variable
return var_store.get_variable(
File “/tensorflow_core/python/ops/variable_scope.py”, line 552, in get_variable
return _true_getter(
File “/tensorflow_core/python/ops/variable_scope.py”, line 505, in _true_getter
return self._get_single_variable(
File “/tensorflow_core/python/ops/variable_scope.py”, line 922, in _get_single_variable
v = variables.VariableV1(
File “/tensorflow_core/python/ops/variables.py”, line 258, in call
return cls._variable_v1_call(*args, **kwargs)
File “/tensorflow_core/python/ops/variables.py”, line 204, in _variable_v1_call
return previous_getter(
File “/tensorflow_core/python/ops/variables.py”, line 197, in
previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
File “/tensorflow_core/python/ops/variable_scope.py”, line 2505, in default_variable_creator
return variables.RefVariable(
File “/tensorflow_core/python/ops/variables.py”, line 262, in call
return super(VariableMetaclass, cls).call(*args, **kwargs)
File “/tensorflow_core/python/ops/variables.py”, line 1676, in init
self._init_from_args(
File “/tensorflow_core/python/ops/variables.py”, line 1823, in _init_from_args
self._variable = state_ops.variable_op_v2(
File “/tensorflow_core/python/ops/state_ops.py”, line 74, in variable_op_v2
return gen_state_ops.variable_v2(
File “/tensorflow_core/python/ops/gen_state_ops.py”, line 1619, in variable_v2
_, _, _op = _op_def_lib._apply_op_helper(
File “/tensorflow_core/python/framework/op_def_library.py”, line 792, in _apply_op_helper
op = g.create_op(op_type_name, inputs, dtypes=None, name=scope,
File “/tensorflow_core/python/util/deprecation.py”, line 513, in new_func
return func(*args, **kwargs)
File “/tensorflow_core/python/framework/ops.py”, line 3356, in create_op
return self._create_op_internal(op_type, inputs, dtypes, input_types, name,
File “/tensorflow_core/python/framework/ops.py”, line 3418, in _create_op_internal
ret = Operation(
File “/tensorflow_core/python/framework/ops.py”, line 1748, in init
self._traceback = tf_stack.extract_stack()
Telemetry data couldn’t be sent, but the command ran successfully.
[WARNING]:
Execution status: FAIL
2024-07-31 08:54:21,075 [TAO Toolkit] [INFO] nvidia_tao_cli.components.docker_handler.docker_handler 363: Stopping container.
Thanks