Hi!
I have a python model using LSTM layers that accelerates with CUDA on one machine (Windows) but not another (Linux). I’m trying to find where the issue is, but having no luck.
The Windows machine is Windows 11. It uses a GeForce RTX 2070 with driver 546.59 and CUDA 11.2.2 and cuDNN 8.1.0.77. Tensorflow is used with version 2.10.
The Linux machine is Debian GNU/Linux 10 (buster). It uses a GeForce RTX A6000 with driver 460.32.03 and CUDA 11.2.2 and cuDNN 8.1.0.77. Tensorflow is used with version 2.10.
The model runs on both machines, but it is only accelerated (i.e. uses the cuDNN LSTM kernels) on the Windows machine. The message I get on the Linux machine is:
# lstm will not use cuDNN kernels since it doesn’t meet the criteria. It will use a generic GPU kernel as fallback when running on GPU
I’ve reviewed the requirements in the documentation in the following link, and my code seems to fit the requirements:
tf.keras.layers.LSTM | TensorFlow v2.10.1.
I have included my code at the bottom of this post for review.
I cannot provide the raw data, but I have provided the shape of the sequences passed to the model within the code (when extracted from the dict that is initially pulled in) - it is all positional data so is simply floats.
The sequences are right-padded upon creation before being received in this code but, for simplicity in debugging, I have removed masking so they can just be treated as equal length sequences.
I have also tried generating random data and feeding it into an extremely basic model with no tweaks, purely as a test, and I get the same issue on the Linux machine.
Has anyone experienced anything like this before? Any ideas for things I might not have thought to check? Any and all help greatly appreciated!
Code
import json
import os

import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import (
    EarlyStopping,
    LearningRateScheduler,
    ModelCheckpoint,
)
#%%
"""
LSTM cuDNN requirements: https://www.tensorflow.org/versions/r2.10/api_docs/python/tf/keras/layers/LSTM
1: activation == tanh
2: recurrent_activation == sigmoid
3: recurrent_dropout == 0
4: unroll is False
5: use_bias is True
6: Inputs if using masking are right-padded
7: Eager execution is enabled in the outermost context
Requirements met:
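1: activation='tanh' on both LSTM layers in ModelTrainer.build_model
2: recurrent_activation='sigmoid' on both LSTM layers in ModelTrainer.build_model
3: recurrent_dropout=0 on both LSTM layers in ModelTrainer.build_model
4: unroll=False on both LSTM layers in ModelTrainer.build_model
5: use_bias=True on both LSTM layers in ModelTrainer.build_model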
6: Masking removed, inputs right-padded anyway
X.shape == (73159, 50, 216)
y.shape == (73159, 2)
val_X.shape == (18658, 50, 216)
val_y.shape == (18658, 2)
7: checked by the tf.executing_eagerly() print that follows the gpu_memory_growth() call
"""
def gpu_memory_growth():
    """Turn on memory growth for every GPU that TensorFlow reports.

    Prints the detected devices and one confirmation line per device.
    When no GPU is reported the function does nothing.  A ``RuntimeError``
    raised while configuring a device is reported on stdout instead of being
    propagated, and the remaining devices are left untouched.
    """
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if not gpus:
        return

    print(f'{gpus} detected.')
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
            print(f'Memory growth enabled for {gpu}.')
    except RuntimeError as err:
        print(f'Memory growth not enabled due to {err}')
# Configure the GPUs before any model or tensor is created.
gpu_memory_growth()

# `enable_debug_mode()` and `run_functions_eagerly(True)` are debugging aids:
# they make tf.data pipelines and every tf.function (including the ones
# behind `Model.fit` / `Model.predict`) run op-by-op in Python, which the
# TensorFlow documentation warns is slow.  They are therefore opt-in: set the
# environment variable TF_DEBUG_EAGER=1 to switch them on for a debug run.
if os.environ.get('TF_DEBUG_EAGER') == '1':
    tf.data.experimental.enable_debug_mode()
    tf.config.run_functions_eagerly(True)

# Reports whether the outermost context executes eagerly (one of the LSTM
# layer's documented cuDNN criteria).
print("Eager execution enabled:", tf.executing_eagerly())
class ModelTrainer():
    """Build and train one LSTM regressor per role.

    A separate ``Sequential`` model (two 512-unit LSTM layers followed by a
    ``Dense(2)`` head) is created for each role key 1..10.  ``train_model``
    first fits a model on sliding windows of length ``T`` and then runs a
    roll-out loop in which the model's own predictions are written back into
    a copy of the training sequences.

    NOTE(review): this class was recovered from a paste that had lost its
    indentation.  The loop nesting in ``_imitation_phase`` is a
    reconstruction guided by the inline comments -- please confirm it.
    """

    def __init__(self, sequences, y_sequences, T=50, epochs=10,
                 checkpoint_dir='/home/stephen/Desktop/model_checkpoints'):
        """Store the data and build one model per role.

        Args:
            sequences: Indexable by role (1..10); each entry holds that
                role's input sequences.  ``sequences[1].shape[-1]`` is used
                as the feature count.  Presumably each entry is an array of
                shape (n_sequences, length, n_features) -- the roll-out loop
                indexes it with three indices.
            y_sequences: Indexable by role; the matching target sequences.
            T: Window length (time steps) fed to the LSTM.
            epochs: Number of epochs for the initial supervised fit.
            checkpoint_dir: Directory in which ``model_checkpoint_<role>.h5``
                files are read and written.
        """
        self.sequences = sequences
        self.y_sequences = y_sequences
        self.T = T
        self.epochs = epochs
        self.checkpoint_dir = checkpoint_dir
        self.models = {role: self.build_model() for role in range(1, 11)}

    def build_model(self):
        """Return a compiled and built two-layer LSTM model.

        Both LSTM layers spell out the arguments listed in the Keras
        documentation as requirements for the cuDNN kernel (tanh / sigmoid
        activations, no recurrent dropout, no unrolling, bias enabled).
        """
        # Leading None is the (unspecified) batch dimension.
        input_shape = (None, self.T, self.sequences[1].shape[-1])
        cudnn_compatible = dict(
            activation='tanh',
            recurrent_activation='sigmoid',
            recurrent_dropout=0,
            unroll=False,
            use_bias=True,
        )
        model = tf.keras.models.Sequential([
            tf.keras.layers.LSTM(512, return_sequences=True,
                                 **cudnn_compatible),
            tf.keras.layers.LSTM(512, return_sequences=False,
                                 **cudnn_compatible),
            tf.keras.layers.Dense(2),
        ])
        model.compile(optimizer='adam', loss='mse')
        model.build(input_shape=input_shape)
        return model

    def create_windows(self, role_sequences, role_y_sequences):
        """Cut every sequence into overlapping windows of length ``T``.

        For each sequence and each start index ``t`` in
        ``range(len(seq) - T)`` the input is ``seq[t:t+T]`` and the target is
        ``y_seq[t+T]`` (the step right after the window).

        Returns:
            Tuple ``(X, y)`` of NumPy arrays built from the collected
            windows and targets.
        """
        X, y = [], []
        for seq, y_seq in zip(role_sequences, role_y_sequences):
            for t in range(len(seq) - self.T):
                X.append(seq[t:t + self.T])
                y.append(y_seq[t + self.T])
        return np.array(X), np.array(y)

    @staticmethod
    def _step_decay(epoch, initial_lr, decay_factor, step_size):
        """Learning rate for ``epoch``: scaled by ``decay_factor`` every ``step_size`` epochs."""
        return initial_lr * (decay_factor ** np.floor(epoch / step_size))

    def step_decay_schedule(self, initial_lr, decay_factor, step_size):
        """Return a ``LearningRateScheduler`` callback implementing step decay."""
        def schedule(epoch):
            return self._step_decay(epoch, initial_lr, decay_factor,
                                    step_size)
        return LearningRateScheduler(schedule)

    @staticmethod
    def _split_train_val(role_sequences, role_y_sequences,
                         train_fraction=0.8):
        """Split inputs and targets into leading train and trailing validation parts.

        Returns:
            ``(train_seq, train_y, val_seq, val_y)``.
        """
        train_size = int(train_fraction * len(role_sequences))
        return (role_sequences[:train_size], role_y_sequences[:train_size],
                role_sequences[train_size:], role_y_sequences[train_size:])

    def _imitation_phase(self, model, rollout_sequences, role_y_sequences,
                         X, y, val_X, val_y, callbacks):
        """Run the roll-out ("Phase 1 imitation learning") loop.

        ``rollout_sequences`` is modified in place: after each gradient pass
        the first two features at step ``t + T`` are overwritten with the
        model's prediction for the window ``[t, t + T)``.

        NOTE(review): the retraining ``fit`` is placed once per window start
        ``t``; the original indentation was lost, so confirm this placement.
        NOTE(review): the gradient-tape target is ``y[t + T - 1]`` whereas
        ``create_windows`` uses ``y[t + T]`` for the same input window --
        confirm which alignment is intended.
        NOTE(review): ``X``/``y`` were windowed before the roll-out, so the
        retraining step does not see the overwritten states.
        """
        for k in range(1, self.T + 1):  # Increase window size from 1 to T
            for t in range(0, self.T, k):  # Slide through the sequence at intervals of k
                for _ in range(k):  # One pass per step inside the window
                    # Rebuilt every pass because rollout_sequences changes
                    # below and from_tensor_slices snapshots its input.
                    batches = tf.data.Dataset.from_tensor_slices(
                        (rollout_sequences, role_y_sequences)).batch(8)
                    for sequences_batch, y_batch in batches:
                        with tf.GradientTape() as tape:
                            states = sequences_batch[:, t:t + self.T, :]
                            predicted_actions = model(states, training=True)
                            true_actions = y_batch[:, t + self.T - 1, :]
                            loss = tf.keras.losses.mean_squared_error(
                                true_actions, predicted_actions)
                        gradients = tape.gradient(
                            loss, model.trainable_variables)
                        model.optimizer.apply_gradients(
                            zip(gradients, model.trainable_variables))

                    # Feed the predictions back into the sequences.  One
                    # batched predict replaces a predict call per sequence;
                    # each prediction only depends on its own window.
                    actions = model.predict(
                        rollout_sequences[:, t:t + self.T, :])
                    rollout_sequences[:, t + self.T, :2] = actions[:, :2]

                # After updating, re-train the model for one epoch.
                model.fit(X, y, epochs=1, validation_data=(val_X, val_y),
                          callbacks=callbacks)

    def train_model(self, role_sequences, role_y_sequences, model,
                    role_name):
        """Train ``model`` for one role.

        Steps: 80/20 split, windowing, optional resume from an existing
        checkpoint, a supervised fit for ``self.epochs`` epochs, then the
        roll-out loop.

        Args:
            role_sequences: Input sequences for this role.
            role_y_sequences: Target sequences for this role.
            model: Compiled Keras model to train.  If a checkpoint file
                exists, the model loaded from it is trained instead
                (``self.models`` is not updated).
            role_name: Used to name the checkpoint file.

        Raises:
            tf.errors.ResourceExhaustedError: Re-raised after the current
                model has been saved to the checkpoint path.
        """
        tf.keras.backend.clear_session()

        (train_sequences, train_y_sequences,
         val_sequences, val_y_sequences) = self._split_train_val(
            role_sequences, role_y_sequences)
        X, y = self.create_windows(train_sequences, train_y_sequences)
        val_X, val_y = self.create_windows(val_sequences, val_y_sequences)

        tf.random.set_seed(42)
        initial_lr = 0.01
        decay_factor = 0.5
        step_size = 10
        lr_scheduler = self.step_decay_schedule(initial_lr, decay_factor,
                                                step_size)
        early_stopping = EarlyStopping(monitor='val_loss', patience=5,
                                       restore_best_weights=True)
        checkpoint_path = os.path.join(
            self.checkpoint_dir, f'model_checkpoint_{role_name}.h5')
        os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
        checkpoint = ModelCheckpoint(filepath=checkpoint_path,
                                     save_weights_only=False,
                                     save_best_only=True,
                                     monitor='val_loss', mode='min',
                                     verbose=1, save_freq='epoch')
        callbacks = [lr_scheduler, early_stopping, checkpoint]

        if os.path.exists(checkpoint_path):
            model = tf.keras.models.load_model(checkpoint_path)
            print('Loaded model from checkpoint')

        # The roll-out overwrites sequence entries; work on a copy so the
        # caller's data is left intact.
        rollout_sequences = np.array(train_sequences)

        try:
            # A single fit call lets the callbacks see real epoch numbers:
            # the LR schedule can decay and early stopping can accumulate
            # patience, which a loop of one-epoch fits resets every time.
            model.fit(X, y, epochs=self.epochs,
                      validation_data=(val_X, val_y), callbacks=callbacks)
            # Phase 1 Imitation Learning Loop
            self._imitation_phase(model, rollout_sequences,
                                  train_y_sequences, X, y, val_X, val_y,
                                  callbacks)
        except tf.errors.ResourceExhaustedError as e:
            print('ResourceExhaustedError: ', e)
            print('Saving current model and exiting')
            model.save(checkpoint_path)
            raise

    def train_all_models(self):
        """Train every model in ``self.models`` on its role's data."""
        for role, model in self.models.items():
            role_sequences = self.sequences[role]
            role_y_sequences = self.y_sequences[role]
            self.train_model(role_sequences, role_y_sequences, model, role)
#%%
# JSON files holding the input sequences and the target sequences that are
# loaded further down and handed to ModelTrainer.
defensive_input_path, defensive_y_path = (
    "/home/stephen/Downloads/def_input.json",
    "/home/stephen/Downloads/def_y.json",
)
def load_json_to_np_array(file_path):
    """Load a JSON file and convert its lists to NumPy arrays.

    Dictionaries are walked recursively.  Every list value (including any
    lists nested inside it) is handed to ``np.array`` as a whole.  Dictionary
    keys that look like integers are converted to ``int``; all other keys are
    kept as the strings JSON provides.  Scalars are returned unchanged.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The converted structure: an ``np.ndarray``, a ``dict``, or a scalar,
        mirroring the top-level JSON value.
    """
    def as_int_if_possible(key):
        # JSON object keys are always strings; only numeric ones become ints.
        try:
            return int(key)
        except ValueError:
            return key

    def convert_lists(obj):
        if isinstance(obj, list):
            return np.array(obj)
        if isinstance(obj, dict):
            return {as_int_if_possible(k): convert_lists(v)
                    for k, v in obj.items()}
        return obj

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return convert_lists(data)
# Load the input and target data; load_json_to_np_array turns numeric JSON
# keys into ints and lists into NumPy arrays.
defensive_input_vectors = load_json_to_np_array(defensive_input_path)
defensive_y_vectors = load_json_to_np_array(defensive_y_path)
#%%
# Constructing the trainer builds one model per role (keys 1..10), so both
# structures are expected to be indexable by those keys.
defensive_model_trainer = ModelTrainer(defensive_input_vectors, defensive_y_vectors)
# Trains the models one role after another.
defensive_model_trainer.train_all_models()