Question: I am not familiar with GPU computing and CUDA, and was wondering if anyone knows how I can resolve this error. Do I need any special code for GPU computing other than my imports?
Training was at epoch 1 of 100, iteration 2054 of 20736, when it crashed with the error message shown below.
OS: Windows 10
CUDA v10
Tensorflow-gpu 2.0.0
Keras 2.2.4
Imports:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import LSTM, SimpleRNN, GRU
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import model_from_json
Error: ResourceExhaustedError: OOM when allocating tensor with shape[128,8,21] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Minimum]
Traceback:
My code in relation to the rest of the traceback:
history = model.fit_generator(generator_train,
steps_per_epoch=num_batches_per_epoch,
epochs=100,
verbose=1,
callbacks=callback_list,
validation_data=val_data_src,
validation_steps=1664)
~\AppData\Local\Continuum\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow_core\python\keras\engine\training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1295 shuffle=shuffle,
1296 initial_epoch=initial_epoch,
→ 1297 steps_name='steps_per_epoch')
1298
1299 def evaluate_generator(self,
~\AppData\Local\Continuum\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow_core\python\keras\engine\training_generator.py in model_iteration(model, data, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch, mode, batch_size, steps_name, **kwargs)
263
264 is_deferred = not model._is_compiled
→ 265 batch_outs = batch_function(*batch_data)
266 if not isinstance(batch_outs, list):
267 batch_outs = [batch_outs]
~\AppData\Local\Continuum\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow_core\python\keras\engine\training.py in train_on_batch(self, x, y, sample_weight, class_weight, reset_metrics)
971 outputs = training_v2_utils.train_on_batch(
972 self, x, y=y, sample_weight=sample_weight,
→ 973 class_weight=class_weight, reset_metrics=reset_metrics)
974 outputs = (outputs['total_loss'] + outputs['output_losses'] +
975 outputs['metrics'])
~\AppData\Local\Continuum\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow_core\python\keras\engine\training_v2_utils.py in train_on_batch(model, x, y, sample_weight, class_weight, reset_metrics)
262 y,
263 sample_weights=sample_weights,
→ 264 output_loss_metrics=model._output_loss_metrics)
265
266 if reset_metrics:
~\AppData\Local\Continuum\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow_core\python\keras\engine\training_eager.py in train_on_batch(model, inputs, targets, sample_weights, output_loss_metrics)
309 sample_weights=sample_weights,
310 training=True,
→ 311 output_loss_metrics=output_loss_metrics))
312 if not isinstance(outs, list):
313 outs = [outs]
~\AppData\Local\Continuum\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow_core\python\keras\engine\training_eager.py in _process_single_batch(model, inputs, targets, output_loss_metrics, sample_weights, training)
250 output_loss_metrics=output_loss_metrics,
251 sample_weights=sample_weights,
→ 252 training=training))
253 if total_loss is None:
254 raise ValueError('The model cannot be run '
~\AppData\Local\Continuum\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow_core\python\keras\engine\training_eager.py in _model_loss(model, inputs, targets, output_loss_metrics, sample_weights, training)
164
165 if hasattr(loss_fn, 'reduction'):
→ 166 per_sample_losses = loss_fn.call(targets[i], outs[i])
167 weighted_losses = losses_utils.compute_weighted_loss(
168 per_sample_losses,
~\AppData\Local\Continuum\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow_core\python\keras\losses.py in call(self, y_true, y_pred)
219 y_pred, y_true = tf_losses_util.squeeze_or_expand_dimensions(
220 y_pred, y_true)
→ 221 return self.fn(y_true, y_pred, **self._fn_kwargs)
222
223 def get_config(self):
~\AppData\Local\Continuum\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow_core\python\keras\losses.py in binary_crossentropy(y_true, y_pred, from_logits, label_smoothing)
992 _smooth_labels, lambda: y_true)
993 return K.mean(
→ 994 K.binary_crossentropy(y_true, y_pred, from_logits=from_logits), axis=-1)
995
996
~\AppData\Local\Continuum\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow_core\python\keras\backend.py in binary_crossentropy(target, output, from_logits)
4574 output.op.type != 'Sigmoid'):
4575 epsilon_ = _constant_to_tensor(epsilon(), output.dtype.base_dtype)
→ 4576 output = clip_ops.clip_by_value(output, epsilon_, 1. - epsilon_)
4577
4578 # Compute cross entropy from probabilities.
~\AppData\Local\Continuum\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow_core\python\util\dispatch.py in wrapper(*args, **kwargs)
178 """Call target, and fall back on dispatchers if there is a TypeError."""
179 try:
→ 180 return target(*args, **kwargs)
181 except (TypeError, ValueError):
182 # Note: convert_to_eager_tensor currently raises a ValueError, not a
~\AppData\Local\Continuum\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow_core\python\ops\clip_ops.py in clip_by_value(t, clip_value_min, clip_value_max, name)
80
81 # Go through list of tensors, for each value in each tensor clip
→ 82 t_min = math_ops.minimum(values, clip_value_max)
83 # Assert that the shape is compatible with the initial shape,
84 # to prevent unintentional broadcasting.
~\AppData\Local\Continuum\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow_core\python\ops\gen_math_ops.py in minimum(x, y, name)
6550 else:
6551 message = e.message
→ 6552 _six.raise_from(_core._status_to_exception(e.code, message), None)
6553 # Add nodes to the TensorFlow graph.
6554 try:
~\AppData\Local\Continuum\anaconda3\envs\tensorflow-gpu\lib\site-packages\six.py in raise_from(value, from_value)