I have upgraded to the latest TAO version, nvidia-tao==0.1.24.
When I train on multiple GPUs, I get the following error:
Cannot reshape a tensor with 25690112 elements to shape [256,256,14,14] (12845056 elements) for 'mask_head_reshape_1/mask_head_reshape_1' (op: 'Reshape') with input shapes: [4,128,256,14,14], [4] and with input tensors computed as partial shapes: input[1] = [256,256,14,14].
My spec file is attached.
maskrcnn_retrain_resnet50.txt (2.1 KB)
The full error output is as follows.
/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/layers/reshape_layer.py:25 call
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/array_ops.py:131 reshape
result = gen_array_ops.reshape(tensor, shape, name)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gen_array_ops.py:8115 reshape
"Reshape", tensor=tensor, shape=shape, name=name)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/op_def_library.py:794 _apply_op_helper
op_def=op_def)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py:513 new_func
return func(*args, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:3357 create_op
attrs, op_def, compute_device)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:3426 _create_op_internal
op_def=op_def)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1770 __init__
control_input_ops)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1610 _create_c_op
raise ValueError(str(e))
ValueError: Cannot reshape a tensor with 25690112 elements to shape [256,256,14,14] (12845056 elements) for 'mask_head_reshape_1/mask_head_reshape_1' (op: 'Reshape') with input shapes: [4,128,256,14,14], [4] and with input tensors computed as partial shapes: input[1] = [256,256,14,14].
Traceback (most recent call last):
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/scripts/train.py", line 254, in <module>
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/scripts/train.py", line 250, in main
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/scripts/train.py", line 237, in main
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/scripts/train.py", line 88, in run_executer
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/executer/distributed_executer.py", line 418, in train_and_eval
File "/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 370, in train
loss = self._trainfined in certain environments, like the interactive Python shell do not expose their source code. If that is the case, you should to define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.do_not_convert. Original error: could not get source code
ocal/lib/python3.6/dist-packages/tensorflow_core/python/framework/op_def_library.py:794 _apply_op_helper
op_def=op_def)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py:513 new_func
return func(*args, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:3357 create_op
attrs, op_def, compute_device)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:3426 _create_op_internal
op_def=op_def)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1770 __init__
control_input_ops)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1610 _create_c_op
raise ValueError(str(e))
ValueError: Cannot reshape a tensor with 25690112 elements to shape [256,256,14,14] (12845056 elements) for 'mask_head_reshape_1/mask_head_reshape_1' (op: 'Reshape') with input shapes: [4,128,256,14,14], [4] and with input tensors computed as partial shapes: input[1] = [256,256,14,14].
Traceback (most recent call last):
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/scripts/train.py", line 254, in <module>
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/scripts/train.py", line 250, in main
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/scripts/train.py", line 237, in main
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/scripts/train.py", line 88, in run_executer
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/executer/distributed_executer.py", line 418, in train_and_eval
File "/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 370, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1161, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1191, in _train_model_default
features, labels, ModeKeys.TRAIN, self.config)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1149, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/models/mask_rcnn_model.py", line 699, in mask_rcnn_model_fn
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/models/mask_rcnn_model.py", line 533, in _model_fn
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/models/mask_rcnn_model.py", line 187, in build_model_graph
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/utils/model_loader.py", line 104, in get_model_with_input
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/saving/model_config.py", line 92, in model_from_json
re_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1161, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1191, in _train_model_default
features, labels, ModeKeys.TRAIN, self.config)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1149, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/models/mask_rcnn_model.py", line 699, in mask_rcnn_model_fn
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/models/mask_rcnn_model.py", line 533, in _model_fn
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/models/mask_rcnn_model.py", line 187, in build_model_graph
File "/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/utils/model_loader.py", line 104, in get_model_with_input
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/saving/model_config.py", line 92, in model_from_json
return deserialize(config, custom_objects=custom_objects)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/layers/serialization.py", line 105, in deserialize
printable_module_name='layer')
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/utils/generic_utils.py", line 191, in deserialize_keras_object
list(custom_objects.items())))
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/network.py", line 1076, in from_config
process_node(layer, node_data)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/network.py", line 1034, in process_node
layer(input_tensors, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/base_layer.py", line 854, in __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/autograph/impl/api.py", line 237, in wrapper
raise e.ag_error_metadata.to_exception(e)
ValueError: in converted code:
/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/layers/reshape_layer.py:25 call
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/array_ops.py:131 reshape
result = gen_array_ops.reshape(tensor, shape, name)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gen_array_ops.py:8115 reshape
"Reshape", tensor=tensor, shape=shape, name=name)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/op_def_library.py:794 _apply_op_helper
op_def=op_def)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py:513 new_func
return func(*args, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:3357 create_op
attrs, op_def, compute_device)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:3426 _create_op_internal
op_def=op_def)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1770 __init__
control_input_ops)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1610 _create_c_op
raise Vturn deserialize(config, custom_objects=custom_objects)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/layers/serialization.py", line 105, in deserialize
printable_module_name='layer')
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/utils/generic_utils.py", line 191, in deserialize_keras_object
list(custom_objects.items())))
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/network.py", line 1076, in from_config
process_node(layer, node_data)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/network.py", line 1034, in process_node
layer(input_tensors, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/base_layer.py", line 854, in __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/autograph/impl/api.py", line 237, in wrapper
raise e.ag_error_metadata.to_exception(e)
ValueError: in converted code:
/root/.cache/bazel/_bazel_root/ed34e6d125608f91724fda23656f1726/execroot/ai_infra/bazel-out/k8-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/mask_rcnn/layers/reshape_layer.py:25 call
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/array_ops.py:131 reshape
result = gen_array_ops.reshape(tensor, shape, name)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gen_array_ops.py:8115 reshape
"Reshape", tensor=tensor, shape=shape, name=name)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/op_def_library.py:794 _apply_op_helper
op_def=op_def)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py:513 new_func
return func(*args, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:3357 create_op
attrs, op_def, compute_device)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:3426 _create_op_internal
op_def=op_def)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1770 __init__
control_input_ops)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1610 _create_c_op
raise ValueError(str(e))
ValueError: Cannot reshape a tensor with 25690112 elements to shape [256,256,14,14] (12845056 elements) for 'mask_head_reshape_1/mask_head_reshape_1' (op: 'Reshape') with input shapes: [4,128,256,14,14], [4] and with input tensors computed as partial shapes: input[1] = [256,256,14,14].
alueError(str(e))
ValueError: Cannot reshape a tensor with 25690112 elements to shape [256,256,14,14] (12845056 elements) for 'mask_head_reshape_1/mask_head_reshape_1' (op: 'Reshape') with input shapes: [4,128,256,14,14], [4] and with input tensors computed as partial shapes: input[1] = [256,256,14,14].
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:
Process name: [[45090,1],1]
Exit code: 1
--------------------------------------------------------------------------
2022-06-23 13:58:02,724 [INFO] tlt.components.docker_handler.docker_handler: Stopping container.