LSTM layers not accelerating despite meeting criteria

Hi!

I have a Python model using LSTM layers that is accelerated with CUDA on one machine (Windows) but not on another (Linux). I'm trying to find where the issue lies, but I'm having no luck.

The Windows machine runs Windows 11 with a GeForce RTX 2070, driver 546.59, CUDA 11.2.2, and cuDNN 8.1.0.77. TensorFlow is version 2.10.

The Linux machine runs Debian GNU/Linux 10 (buster) with an NVIDIA RTX A6000, driver 460.32.03, CUDA 11.2.2, and cuDNN 8.1.0.77. TensorFlow is version 2.10.
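For what it's worth, the CUDA/cuDNN versions above are the installed toolkits; a quick check along these lines (standard TF calls, nothing specific to my setup) confirms what each wheel was actually built against and whether the GPU is visible:

import tensorflow as tf

print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))
build = tf.sysconfig.get_build_info()  # what the wheel was compiled against
print(build.get('cuda_version'), build.get('cudnn_version'))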

It runs on both machines, but is only accelerated on the Windows machine. The message I get on the Linux machine is:

# lstm will not use cuDNN kernels since it doesn’t meet the criteria. It will use a generic GPU kernel as fallback when running on GPU
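As far as I understand the TF 2.x internals (happy to be corrected), one way to see which kernel actually gets picked is to enable device placement logging before touching the model; if the fused path is taken, a CudnnRNN* op should show up in the log:

import tensorflow as tf

tf.debugging.set_log_device_placement(True)  # logs where each op executes

x = tf.random.normal((8, 50, 216))
layer = tf.keras.layers.LSTM(64)  # defaults satisfy the cuDNN criteria
_ = layer(x)  # a CudnnRNN* op in the log indicates the fused kernel ran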

I've reviewed the requirements in the documentation (https://www.tensorflow.org/versions/r2.10/api_docs/python/tf/keras/layers/LSTM), and my code seems to meet all of them.

I have included my code at the bottom of this post for review.

I cannot provide the raw data, but the code below includes the shapes of the sequences passed to the model (after they are extracted from the dict that is initially loaded). It is all positional data, so the values are simply floats.

The sequences are right-padded when they are created, before this code receives them, but for simplicity while debugging I have removed masking so they can be treated as equal-length sequences.
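For context, the padding at creation time is done roughly like this (illustrative shapes, not my real pipeline):

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Two variable-length sequences of 216-dim positional vectors.
raw = [np.random.rand(30, 216), np.random.rand(50, 216)]
padded = pad_sequences(raw, maxlen=50, dtype='float32', padding='post')
print(padded.shape)  # (2, 50, 216); the shorter sequence is zero-padded at the end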

I have also tried generating random data and feeding it into an extremely basic model with no tweaks, and I get the same issue on the Linux machine.
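That stripped-down random-data test looked roughly like this (arbitrary shapes; the LSTM defaults already satisfy the cuDNN criteria):

import numpy as np
import tensorflow as tf

X = np.random.rand(1024, 50, 216).astype(np.float32)
y = np.random.rand(1024, 2).astype(np.float32)

model = tf.keras.Sequential([
    tf.keras.layers.LSTM(64, input_shape=(50, 216)),
    tf.keras.layers.Dense(2),
])
model.compile(optimizer='adam', loss='mse')
model.fit(X, y, epochs=1, batch_size=32)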

Has anyone experienced anything like this before? Any ideas for things I might not have thought to check? Any and all help greatly appreciated!

Code

import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import (LearningRateScheduler,
                                        EarlyStopping, ModelCheckpoint)
import json
#%%
"""
LSTM cuDNN requirements: https://www.tensorflow.org/versions/r2.10/api_docs/python/tf/keras/layers/LSTM
 
    1: activation == tanh
    2: recurrent_activation == sigmoid
    3: recurrent_dropout == 0
    4: unroll is False
    5: use_bias is True
    6: Inputs if using masking are right-padded
    7: Eager execution is enabled in the outermost context
 
Requirements met:

    1: activation='tanh' in both LSTM layers
    2: recurrent_activation='sigmoid' in both LSTM layers
    3: recurrent_dropout=0 in both LSTM layers
    4: unroll=False in both LSTM layers
    5: use_bias=True in both LSTM layers
    6: Masking removed; inputs right-padded anyway
        X.shape == (73159, 50, 216)
        y.shape == (73159, 2)
        val_X.shape == (18658, 50, 216)
        val_y.shape == (18658, 2)
    7: tf.config.run_functions_eagerly(True) called below
"""
 
def gpu_memory_growth():
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    if physical_devices:
        print(f'{physical_devices} detected.')
        try:
            for device in physical_devices:
                tf.config.experimental.set_memory_growth(device, True)
                print(f'Memory growth enabled for {device}.')
        except RuntimeError as e:
            print(f'Memory growth not enabled due to {e}')
            
gpu_memory_growth()
tf.data.experimental.enable_debug_mode()
tf.config.run_functions_eagerly(True)
print("Eager execution enabled:", tf.executing_eagerly())
 
 
class ModelTrainer:
    def __init__(self, sequences, y_sequences, T=50, epochs=10):
        self.sequences = sequences
        self.y_sequences = y_sequences
        self.T = T
        self.epochs = epochs
        self.models = {role: self.build_model() for role in range(1, 11)}
 
    def build_model(self):
        # (None, T, features): batch dimension left unspecified.
        input_shape = (None, self.T, self.sequences[1].shape[-1])
        model = tf.keras.models.Sequential([
            tf.keras.layers.LSTM(512,
                                 return_sequences=True,
                                 activation='tanh',
                                 recurrent_activation='sigmoid',
                                 recurrent_dropout=0,
                                 unroll=False,
                                 use_bias=True
                                 ),
            tf.keras.layers.LSTM(512,
                                 return_sequences=False,
                                 activation='tanh',
                                 recurrent_activation='sigmoid',
                                 recurrent_dropout=0,
                                 unroll=False,
                                 use_bias=True
                                 ),
            tf.keras.layers.Dense(2)
        ])
        model.compile(optimizer='adam', loss='mse')
        model.build(input_shape=input_shape)
        return model
 
    def create_windows(self, role_sequences, role_y_sequences):
        X, y = [], []
        for seq, y_seq in zip(role_sequences, role_y_sequences):
            for t in range(len(seq) - self.T):
                X.append(seq[t:t+self.T])
                y.append(y_seq[t+self.T])
        return np.array(X), np.array(y)
 
    def step_decay_schedule(self, initial_lr, decay_factor, step_size):
        def schedule(epoch):
            return initial_lr * (decay_factor ** np.floor(epoch / step_size))
        return LearningRateScheduler(schedule)
    
    def train_model(self, role_sequences, role_y_sequences, model, role_name):
        tf.keras.backend.clear_session()
        # split data into training and validation sets
        train_size = int(0.8 * len(role_sequences))
        val_sequences = role_sequences[train_size:]
        val_y_sequences = role_y_sequences[train_size:]
        role_sequences = role_sequences[:train_size]
        role_y_sequences = role_y_sequences[:train_size]
    
        X, y = self.create_windows(role_sequences, role_y_sequences)
        val_X, val_y = self.create_windows(val_sequences, val_y_sequences)
 
        tf.random.set_seed(42)
    
        initial_lr = 0.01
        decay_factor = 0.5
        step_size = 10
    
        lr_scheduler = self.step_decay_schedule(initial_lr, decay_factor, step_size)
        early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
 
        checkpoint_path = f'/home/stephen/Desktop/model_checkpoints/model_checkpoint_{role_name}.h5'
        checkpoint_dir = os.path.dirname(checkpoint_path)
        os.makedirs(checkpoint_dir, exist_ok=True)
        checkpoint = ModelCheckpoint(filepath=checkpoint_path, save_weights_only=False,
                                     save_best_only=True, monitor='val_loss', mode='min',
                                     verbose=1, save_freq='epoch')
        if os.path.exists(checkpoint_path):
            model = tf.keras.models.load_model(checkpoint_path)
            print('Loaded model from checkpoint')
            
            
        # train for a few epochs first
        try:
            # epochs in loop outside fit call to catch resource errors
            for epoch in range(self.epochs):
                model.fit(X, y, epochs=1, validation_data=(val_X, val_y), callbacks=[lr_scheduler, early_stopping, checkpoint])    
 
        except tf.errors.ResourceExhaustedError as e:
            print('ResourceExhaustedError: ', e)
            print('Saving current model and exiting')
            model.save(checkpoint_path)
        
        # Phase 1 Imitation Learning Loop
        try:
            for k in range(1, self.T + 1):  # Increase window size from 1 to T
                for t in range(0, self.T, k):  # Slide through the sequence at intervals of k
                    for i in range(k):  # Iterate over subsequences within the window
                        for sequences_batch, y_batch in tf.data.Dataset.from_tensor_slices((role_sequences, role_y_sequences)).batch(8):
                            # Predict action a'_t+i using the model
                            with tf.GradientTape() as tape:
                                states = sequences_batch[:, t:t + self.T, :]  # Current states for prediction
                                predicted_actions = model(states, training=True)  # Predict the next actions
                                true_actions = y_batch[:, t + self.T - 1, :]  # Ground truth actions
    
                                # Calculate the loss between predicted and actual actions
                                loss = tf.keras.losses.mean_squared_error(true_actions, predicted_actions)
    
                            # Update the model using the gradients from the loss
                            gradients = tape.gradient(loss, model.trainable_variables)
                            model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
                            # Update the state using the predicted action a'_t+i
                            for seq_idx in range(len(role_sequences)):
                                action = model.predict(role_sequences[seq_idx, t:t + self.T, :][np.newaxis, ...])
                                role_sequences[seq_idx, t + self.T, :2] = action[0, :2]  # Update the sequence with the predicted action
    
                    # After updating, re-train the model to improve its predictions using the new states
                    try:
                        model.fit(X, y, epochs=1, validation_data=(val_X, val_y), callbacks=[lr_scheduler, early_stopping, checkpoint])
                    except tf.errors.ResourceExhaustedError as e:
                        print('ResourceExhaustedError: ', e)
                        print('Saving current model and exiting')
                        model.save(checkpoint_path)
                        raise
    
        except tf.errors.ResourceExhaustedError as e:
            print('ResourceExhaustedError: ', e)
            print('Saving current model and exiting')
            model.save(checkpoint_path)
            raise
        
        
    def train_all_models(self):
        for role, model in self.models.items():
            role_sequences = self.sequences[role]
            role_y_sequences = self.y_sequences[role]
            self.train_model(role_sequences, role_y_sequences, model, role)
            
            
#%%
 
defensive_input_path = '/home/stephen/Downloads/def_input.json'
defensive_y_path = '/home/stephen/Downloads/def_y.json'
 
def load_json_to_np_array(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
        
    def convert_lists(obj):
        if isinstance(obj, list):
            return np.array(obj)
        elif isinstance(obj, dict):
            # Ensure the keys of the dictionary are integers if possible
            return {int(k): convert_lists(v) for k, v in obj.items()}
        else:
            return obj
 
    return convert_lists(data)
 
 
defensive_input_vectors = load_json_to_np_array(defensive_input_path)
defensive_y_vectors = load_json_to_np_array(defensive_y_path)
 
#%%
 
 
defensive_model_trainer = ModelTrainer(defensive_input_vectors, defensive_y_vectors)
defensive_model_trainer.train_all_models()