Here is the official TensorFlow benchmark script: https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks.
After I pulled and set up the NVIDIA NGC TensorFlow 19.04 container, I cloned the benchmark repository and checked out the cnn_tf_v1.13_compatible
branch to match the TensorFlow version inside the NGC container.
Then I followed the instructions in the Getting Started section of the benchmark website. The same steps worked perfectly out of the box inside the official TensorFlow container, but inside the NGC container they threw the following error:
Traceback (most recent call last):
File "tf_cnn_benchmarks.py", line 72, in <module>
app.run(main) # Raises error on invalid flags, unlike tf.app.run()
File "/usr/local/lib/python3.5/dist-packages/absl/app.py", line 300, in run
_run_main(main, args)
File "/usr/local/lib/python3.5/dist-packages/absl/app.py", line 251, in _run_main
sys.exit(main(argv))
File "tf_cnn_benchmarks.py", line 68, in main
bench.run()
File "/mnt/benchmarks/scripts/tf_cnn_benchmarks/benchmark_cnn.py", line 1851, in run
return self._benchmark_train()
File "/mnt/benchmarks/scripts/tf_cnn_benchmarks/benchmark_cnn.py", line 2047, in _benchmark_train
build_result = self._build_graph()
File "/mnt/benchmarks/scripts/tf_cnn_benchmarks/benchmark_cnn.py", line 2081, in _build_graph
(input_producer_op, enqueue_ops, fetches) = self._build_model()
File "/mnt/benchmarks/scripts/tf_cnn_benchmarks/benchmark_cnn.py", line 2861, in _build_model
phase_train)
File "/mnt/benchmarks/scripts/tf_cnn_benchmarks/benchmark_cnn.py", line 2939, in _build_fetches
loss_scale_params)
File "/mnt/benchmarks/scripts/tf_cnn_benchmarks/variable_mgr.py", line 112, in append_apply_gradients_ops
self.grad_has_inf_nan)
File "/mnt/benchmarks/scripts/tf_cnn_benchmarks/variable_mgr_util.py", line 114, in append_gradients_with_loss_scale
training_ops.extend(get_apply_gradients_ops_func())
File "/mnt/benchmarks/scripts/tf_cnn_benchmarks/variable_mgr.py", line 108, in get_apply_gradients_ops_func
return [opt.apply_gradients(grads)]
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 591, in apply_gradients
shift_update_op = self._update_gradient_shift(all_finite)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 861, in _update_gradient_shift
return control_flow_ops.cond(all_finite, finite_branch, overflow_branch)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2097, in cond
orig_res_t, res_t = context_t.BuildCondBranch(true_fn)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 1941, in BuildCondBranch
original_result = fn()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 859, in finite_branch
return control_flow_ops.cond(should_update, boost_branch, incr_branch)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2097, in cond
orig_res_t, res_t = context_t.BuildCondBranch(true_fn)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 1941, in BuildCondBranch
original_result = fn()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 847, in boost_branch
new_scale_val = clip_ops.clip_by_value(scalar * 2.0, scale_min, scale_max)
TypeError: unsupported operand type(s) for *: 'NoneType' and 'float'