Getting error, RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED while running a basic RNN model

This is my code for a basic RNN model, and I am using the MNIST dataset. My ultimate goal is to train this model on a custom dataset; however, I am first running it on the MNIST dataset so that I can be sure the code and the model work properly before I try it on my own data.

When I run this model on my GPU, I get the error pasted below. Interestingly, however, when I run the model on the CPU instead of the GPU, it runs perfectly.

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from load_data import IntentEstimationDataset

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Model geometry: the training loop feeds each sample as `sequence_length`
# time steps of `input_size` features.
input_size = 28
sequence_length = 28
num_layers = 2       # stacked recurrent layers
hidden_size = 256    # width of each recurrent layer
num_classes = 10     # size of the classifier output

# Optimisation settings.
learning_rate = 0.001
batch_size = 64
num_epochs = 2


class RNN(nn.Module):
    """Multi-layer Elman RNN followed by a linear classifier.

    The outputs of every time step are concatenated and passed to a single
    fully connected layer, so the classifier width depends on the sequence
    length the model is built for.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each recurrent layer.
        num_layers: Number of stacked recurrent layers.
        num_classes: Number of output scores per sample.
        seq_len: Number of time steps in every input sequence. When omitted,
            the module-level ``sequence_length`` constant is used, which is
            what the original four-argument call relied on.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, seq_len=None):
        super().__init__()
        if seq_len is None:
            # Backward-compatible default: read the script-level constant.
            seq_len = sequence_length
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size * seq_len, num_classes)

    def forward(self, x):
        """Return class scores of shape (batch, num_classes).

        Args:
            x: Tensor of shape (batch, seq_len, input_size).
        """
        # Create the initial hidden state directly on the input's device and
        # with its dtype. This avoids a CPU allocation followed by a copy and
        # keeps the module independent of any global `device` variable.
        h0 = torch.zeros(
            self.num_layers,
            x.size(0),
            self.hidden_size,
            device=x.device,
            dtype=x.dtype,
        )
        out, _ = self.rnn(x, h0)
        # Concatenate the hidden states of all time steps for the classifier.
        out = out.reshape(out.shape[0], -1)
        return self.fc(out)


# Load data:
# Both MNIST splits are stored under (and, if missing, downloaded to) the
# same root directory and converted to tensors on access.
train_dataset = datasets.MNIST(
    root='/home/sharyat/catkin_ws/src/data_imu/script/',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
test_dataset = datasets.MNIST(
    root='/home/sharyat/catkin_ws/src/data_imu/script/',
    train=False,
    transform=transforms.ToTensor(),
    download=True,
)

# Shuffled mini-batch iterators over each split.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)


# initialise model:
model = RNN(input_size, hidden_size, num_layers, num_classes)
model = model.to(device)

# Loss and optimiser:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training: one optimiser step per mini-batch, for `num_epochs` passes.
for epochs in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        print(batch_idx)

        # Drop the singleton dimension at index 1 so the model receives
        # 3-D input, then move the batch to the compute device.
        data = data.squeeze(1).to(device)
        targets = targets.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # forward
        scores = model(data)
        loss = criterion(scores, targets)

        # backward, then parameter update
        loss.backward()
        optimizer.step()


def check_accuracy(loader, model):
    """Print and return the classification accuracy of ``model`` on ``loader``.

    Args:
        loader: Iterable yielding ``(inputs, labels)`` batches. Its
            ``dataset`` must expose a boolean ``train`` attribute, which is
            used only to choose the log message.
        model: ``nn.Module`` mapping a batch of inputs to per-class scores of
            shape (batch, num_classes).

    Returns:
        The accuracy as a percentage (float). An empty loader yields 0.0
        instead of raising ``ZeroDivisionError``.
    """
    if loader.dataset.train:
        print('Checking accuracy on training data')
    else:
        print('Checking accuracy on test data')

    # Evaluate on whatever device the model lives on, so the data and the
    # weights can never end up on different devices.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    num_correct = 0
    num_samples = 0
    was_training = model.training
    model.eval()

    try:
        with torch.no_grad():
            for x, y in loader:
                x = x.to(device=target_device)
                y = y.to(device=target_device)
                # Same preprocessing as the training loop: drop the singleton
                # dimension at index 1 rather than flattening the sample.
                x = x.squeeze(1)

                scores = model(x)
                # Index of the highest score along the class dimension.
                predictions = scores.argmax(dim=1)
                # .item() keeps the counters as plain Python ints so the
                # summary line prints numbers rather than tensor reprs.
                num_correct += (predictions == y).sum().item()
                num_samples += predictions.size(0)
    finally:
        # Restore the mode the caller had, even if evaluation failed.
        model.train(was_training)

    accuracy = num_correct / num_samples * 100 if num_samples else 0.0
    print(f'Got {num_correct}/{num_samples} with accuracy {accuracy}')
    return accuracy


# Report the model's accuracy on the training split once training has finished.
check_accuracy(train_loader, model)

After running the code for about 9 minutes, I get this error:

Traceback (most recent call last):
  File "/home/sharyat/catkin_ws/src/data_imu/script/intent_estimation_model.py", line 63, in <module>
    scores = model(data)
  File "/home/sharyat/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/sharyat/catkin_ws/src/data_imu/script/intent_estimation_model.py", line 33, in forward
    out, _ = self.rnn(x, h0)
  File "/home/sharyat/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/sharyat/.local/lib/python3.8/site-packages/torch/nn/modules/rnn.py", line 227, in forward
    result = _impl(input, hx, self._flat_weights, self.bias, self.num_layers,
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED

Process finished with exit code 1

A few things I have considered after looking at other posts:

  1. My GPU memory usage is nowhere near the maximum available memory.

After running Nvidia-smi I get:

Mon Dec 27 17:56:35 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| N/A   39C    P5    13W /  N/A |    996MiB /  5946MiB |     25%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A      1054      G   /usr/lib/xorg/Xorg                 45MiB |
|    0   N/A  N/A      1668      G   /usr/lib/xorg/Xorg                312MiB |
|    0   N/A  N/A      1859      G   /usr/bin/gnome-shell              153MiB |
|    0   N/A  N/A      2193      G   ...AAAAAAAAA= --shared-files       29MiB |
|    0   N/A  N/A      2589      G   /usr/lib/firefox/firefox          443MiB |
+-----------------------------------------------------------------------------+

After running nvcc --version I get:

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243

And after running print(torch.__version__) in Python, I get:
1.5.1+cu101

Hi,

Based on the information provided, this doesn’t look like a TensorRT-related issue. The following may help you. If you have further queries, we recommend posting your question on the relevant platform.

Thank you.

1 Like

Hi,

I have the same error. Could it be caused by a broken package?
I have another computer using Cuda 11.6 and it runs ok. Following this link for segmentation, https://www.highvoltagecode.com/post/edge-ai-semantic-segmentation-on-nvidia-jetson

I just bought a new computer and installed CUDA 12.1; with the same code, I now get this error.
So, is it a PyTorch problem or a CUDA problem? I am confused.
I don’t mind downgrading CUDA to 11.6. I have tried several guides but could not get any of them to work. Do you have a working link for downgrading? Thanks.
I followed this link, but I cannot install the following line:

sudo apt-get -y install cuda-11-6
Nor
sudo apt-get -y install cuda-11.6
torchvision.models.segmentation.FCN() => configuring model for training
Traceback (most recent call last):
  File "train.py", line 331, in <module>
    main(args)
  File "train.py", line 292, in main
    train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, device, epoch, args.print_freq)
  File "train.py", line 189, in train_one_epoch
    loss.backward()
  File "/home/nvidia/anaconda3/envs/segmentation/lib/python3.6/site-packages/torch/tensor.py", line 107, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/nvidia/anaconda3/envs/segmentation/lib/python3.6/site-packages/torch/autograd/__init__.py", line 93, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED

+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4070 L...    On | 00000000:01:00.0 Off |                  N/A |
| N/A   39C    P8                4W / 115W|      1MiB /  8188MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|  No running processes found                                                           |
+---------------------------------------------------------------------------------------+
python -c "import torch; print(torch.__version__)"
1.1.0
(segmentation) nvidia@nvidia-Kuangshi16-Super-Series-GM6PX7X:~/pytorch-segmentation$ nvcc --version
Command 'nvcc' not found, but can be installed with:
sudo apt install nvidia-cuda-toolkit
(segmentation) nvidia@nvidia-Kuangshi16-Super-Series-GM6PX7X:~/pytorch-segmentation$ sudo apt install nvidia-cuda-toolkit
[sudo] password for nvidia: 
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Some packages could not be installed. This may mean that you have
requested an impossible situation or if you are using the unstable
distribution that some required packages have not yet been created
or been moved out of Incoming.
The following information may help to resolve the situation:

The following packages have unmet dependencies:
 libcuinj64-11.5 : Depends: libnvidia-compute-495 (>= 495) but it is not going to be installed or
                            libnvidia-compute-495-server (>= 495) but it is not installable or
                            libcuda.so.1 (>= 495) or
                            libcuda-11.5-1
 libnvidia-ml-dev : Depends: libnvidia-compute-495 (>= 495) but it is not going to be installed or
                             libnvidia-compute-495-server (>= 495) but it is not installable or
                             libnvidia-ml.so.1 (>= 495)
 nvidia-cuda-dev : Depends: libnvidia-compute-495 (>= 495) but it is not going to be installed or
                            libnvidia-compute-495-server (>= 495) but it is not installable or
                            libcuda.so.1 (>= 495) or
                            libcuda-11.5-1
                   Recommends: libnvcuvid1 but it is not installable
E: Unable to correct problems, you have held broken packages.
(segmentation) nvidia@nvidia-Kuangshi16-Super-Series-GM6PX7X:~/pytorch-segmentation$ 

(segmentation) nvidia@nvidia-Kuangshi16-Super-Series-GM6PX7X:~/pytorch-segmentation$ sudo aptitude install nvidia-cuda-toolkit
The following NEW packages will be installed:
  node-html5shiv{a} nvidia-cuda-gdb{a} nvidia-cuda-toolkit{b} nvidia-cuda-toolkit-doc{a} 
  nvidia-opencl-dev{a} ocl-icd-opencl-dev{a} opencl-c-headers{a} opencl-clhpp-headers{a} 
0 packages upgraded, 8 newly installed, 0 to remove and 0 not upgraded.
Need to get 0 B/72.6 MB of archives. After unpacking 147 MB will be used.
The following packages have unmet dependencies:
 nvidia-cuda-toolkit : Depends: nvidia-profiler (= 11.5.114~11.5.1-1ubuntu1) but it is not installable
                       Depends: nvidia-cuda-dev (= 11.5.1-1ubuntu1) but it is not installable
The following actions will resolve these dependencies:

     Keep the following packages at their current version:
1)     nvidia-cuda-toolkit [Not Installed]                



Accept this solution? [Y/n/q/?] Y
No packages will be installed, upgraded, or removed.
0 packages upgraded, 0 newly installed, 0 to remove and 0 not upgraded.
Need to get 0 B of archives. After unpacking 0 B will be used.

Hi,

Here is the error

    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
(segmentation) nvidia@nvidia-Kuangshi16-Super-Series-GM6PX7X:~/pytorch-segmentation$ sudo apt-get -y install cuda-11.6
[sudo] password for nvidia: 
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Note, selecting 'libcuda-11.6-1' for regex 'cuda-11.6'
Note, selecting 'cuda-11-6' for regex 'cuda-11.6'
Some packages could not be installed. This may mean that you have
requested an impossible situation or if you are using the unstable
distribution that some required packages have not yet been created
or been moved out of Incoming.
The following information may help to resolve the situation:

The following packages have unmet dependencies:
 libcufile-11-6 : Depends: liburcu6 but it is not installable
E: Unable to correct problems, you have held broken packages.
(segmentation) nvidia@nvidia-Kuangshi16-Super-Series-GM6PX7X:~/pytorch-segmentation$ nvidia-smi
Tue Apr 18 00:13:25 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4070 L...    On | 00000000:01:00.0 Off |                  N/A |
| N/A   43C    P0               N/A / 115W|      6MiB /  8188MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    0   N/A  N/A      1456      G   /usr/lib/xorg/Xorg                            4MiB |
+---------------------------------------------------------------------------------------+