Hi, thank you for the detailed response.
I’ve carefully reviewed all the suggested causes and solutions:
- **Memory Issues:** I've checked my code thoroughly for memory-management problems. GPU memory utilization was at most 10-20 percent while I monitored it, and the GPU and CPU temperatures stayed fairly stable (50-60 °C). (A minimal sketch of the kind of monitoring I mean is shown after this list.)
- **Incompatible Versions:** I'm using PyTorch 2.2.0+cu121 with CUDA 12.1. I've verified that they are compatible and that both are up to date. I also tested other version combinations, but the issue persists.
- **Faulty GPU Hardware:** I may have to look into this, but I want to find the root cause first. This is just my reasoning, but the error started after the NVIDIA 550 driver was automatically updated on 16 Jan 2025. (I tried other drivers as well: 535, 570, and the open and server variants, but the error persists.)
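For context, the version check and memory/temperature monitoring I mention above are nothing more elaborate than the usual torch / nvidia-smi queries. Here is a minimal sketch of what I mean (the exact print formatting is illustrative, not taken from the training script):

```python
import subprocess
import torch

# Versions PyTorch was built against, and the visible device.
print("torch:", torch.__version__)          # 2.2.0+cu121 in my case
print("CUDA (build):", torch.version.cuda)  # 12.1 in my case
print("device:", torch.cuda.get_device_name(0))

# Memory held by this process on GPU 0, in MiB.
allocated = torch.cuda.memory_allocated(0) / 1024**2
reserved = torch.cuda.memory_reserved(0) / 1024**2
total = torch.cuda.get_device_properties(0).total_memory / 1024**2
print(f"allocated {allocated:.0f} MiB / reserved {reserved:.0f} MiB / total {total:.0f} MiB")

# Utilization and temperature as reported by the driver (same numbers nvidia-smi shows).
out = subprocess.run(
    ["nvidia-smi",
     "--query-gpu=utilization.gpu,memory.used,temperature.gpu",
     "--format=csv,noheader"],
    capture_output=True, text=True,
)
print(out.stdout.strip())
```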
I’ve also followed the additional tips you mentioned:
- Ensured all tensors are valid before calling .backward(), including checking their shapes and device allocation. (A rough sketch of this check is shown after this list.)
- Wrapped tensor operations in error-handling blocks.
→ I added handling for the shape-mismatch exception, but the crash occurs before it ever reaches the except block.
- Monitored memory usage during training to make sure it stays within GPU capacity limits.
→ As far as I can tell, only about 10-20 percent of GPU memory was in use.
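To make the first bullet concrete, below is a rough sketch of the kind of pre-.backward() validation I mean. The helper name check_tensor and the expected shapes are illustrative only; they are not part of the repro script further down:

```python
import torch

def check_tensor(name, t, expected_device, expected_shape=None):
    # Illustrative helper: validate a tensor before it feeds into .backward().
    assert isinstance(t, torch.Tensor), f"{name} is not a tensor"
    assert t.device == torch.device(expected_device), \
        f"{name} is on {t.device}, expected {expected_device}"
    if expected_shape is not None:
        assert tuple(t.shape) == tuple(expected_shape), \
            f"{name} has shape {tuple(t.shape)}, expected {tuple(expected_shape)}"
    if t.is_floating_point():
        assert not torch.isnan(t).any(), f"{name} contains NaNs"
        assert not torch.isinf(t).any(), f"{name} contains Infs"

# Usage against the dummy shapes below (batch 100, seq_len 10, 999 score classes):
# check_tensor("scores", scores, "cuda:0", (100, 999))
# check_tensor("targets", targets, "cuda:0", (100,))
```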
Would it be possible for you to try reproducing this error on your end? It would be really appreciated.
I also reached out to the PyTorch team, but they haven't answered yet, so for now any advice is appreciated…
Thanks again for your support.
import torch
import torch.nn as nn
import numpy as np
import math
import time
import traceback
from tqdm import tqdm
#device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda:0")
print(f"[Info] Using device: {device}", flush=True)
# Dummy Config class
class DummyOpt:
    def __init__(self):
        self.hiddenSize = 100
        self.step = 1
        self.batchSize = 100
        self.nonhybrid = False
        self.lr = 0.001
        self.l2 = 1e-5
        self.lr_dc_step = 3
        self.lr_dc = 0.1
# GNN model definition (can be skipped if imported above)
class GNN(nn.Module):
    def __init__(self, hidden_size, step=1):
        super(GNN, self).__init__()
        self.step = step
        self.hidden_size = hidden_size
        self.edge_proj = nn.Linear(hidden_size, hidden_size)
        self.update = nn.Linear(hidden_size * 2, hidden_size)

    def GNNCell(self, A, hidden):
        N = A.shape[1]
        A_in = A[:, :, :N]   # [B, N, N]
        A_out = A[:, :, N:]  # [B, N, N]
        edge_in = torch.matmul(A_in, self.edge_proj(hidden))    # [B, N, H]
        edge_out = torch.matmul(A_out, self.edge_proj(hidden))  # [B, N, H]
        edge_msg = (edge_in + edge_out) / 2
        combined = torch.cat([hidden, edge_msg], dim=-1)  # [B, N, 2H]
        out = self.update(combined)
        return torch.relu(out)

    def forward(self, A, hidden):
        for _ in range(self.step):
            hidden = self.GNNCell(A, hidden)
        return hidden
class SessionGraph(nn.Module):
    def __init__(self, opt, n_node):
        super(SessionGraph, self).__init__()
        self.hidden_size = opt.hiddenSize
        self.batch_size = opt.batchSize
        self.nonhybrid = opt.nonhybrid
        self.embedding = nn.Embedding(n_node, self.hidden_size)
        self.gnn = GNN(self.hidden_size, step=opt.step)
        self.linear_one = nn.Linear(self.hidden_size, self.hidden_size)
        self.linear_two = nn.Linear(self.hidden_size, self.hidden_size)
        self.linear_three = nn.Linear(self.hidden_size, 1)
        self.linear_transform = nn.Linear(self.hidden_size * 2, self.hidden_size)

    def compute_scores(self, hidden, mask):
        ht = hidden[torch.arange(mask.shape[0]), torch.sum(mask, 1) - 1]
        q1 = self.linear_one(ht).unsqueeze(1)
        q2 = self.linear_two(hidden)
        alpha = self.linear_three(torch.sigmoid(q1 + q2))
        a = torch.sum(alpha * hidden * mask.unsqueeze(-1).float(), dim=1)
        if not self.nonhybrid:
            a = self.linear_transform(torch.cat([a, ht], dim=1))
        b = self.embedding.weight[1:]  # exclude padding idx
        scores = torch.matmul(a, b.transpose(1, 0))
        return scores

    def forward(self, inputs, A):
        hidden = self.embedding(inputs)
        hidden = self.gnn(A, hidden)
        return hidden
# Generate dummy input
def generate_dummy_data(batch_size, seq_len, n_node):
    alias_inputs = np.tile(np.arange(seq_len), (batch_size, 1))  # (batch, seq)
    # A = np.random.rand(batch_size, seq_len, seq_len * 2)  # shape: 100 * 10 * 20 (20000)
    A = np.random.rand(batch_size, seq_len, seq_len * 2).astype(np.float32)
    items = np.random.randint(1, n_node, size=(batch_size, seq_len))
    mask = (items != 0).astype(int)
    targets = np.random.randint(1, n_node, size=(batch_size,))
    return alias_inputs, A, items, mask, targets
# Main loop
def run_dummy_loop():
    opt = DummyOpt()
    n_node = 1000
    model = SessionGraph(opt, n_node).to(device)
    model.train()
    num_iterations = 5000
    seq_len = 10
    for i in range(num_iterations):
        try:
            ##########################################
            # Create variables on CPU
            alias_inputs, A, items, mask, targets = generate_dummy_data(opt.batchSize, seq_len, n_node)
            ##########################################
            # Move variables from CPU to GPU
            alias_inputs = torch.tensor(alias_inputs, dtype=torch.long, device=device)
            # A = torch.tensor(A, dtype=torch.float32, device=device)
            A = torch.tensor(A, device=device)  # dtype can be omitted now that A is already float32
            items = torch.tensor(items, dtype=torch.long, device=device)
            mask = torch.tensor(mask, dtype=torch.long, device=device)
            targets = torch.tensor(targets, dtype=torch.long, device=device)
            # Check for NaNs or Infs
            assert not torch.isnan(targets).any(), "Targets contain NaNs"
            assert not torch.isinf(targets).any(), "Targets contain Infs"
            ##########################################
            # GPU-side computation (GNN message passing)
            hidden = model(items, A)
            # Reorder sequence using loop + indexing
            # seq_hidden = torch.stack([hidden[i][alias_inputs[i]] for i in range(alias_inputs.shape[0])])
            # Reorder sequence using gather -> no error initially, but got an illegal instruction at loop 97
            alias_idx = alias_inputs.unsqueeze(-1).expand(-1, -1, hidden.size(2))  # (batch, seq_len, hidden_size)
            seq_hidden = torch.gather(hidden, dim=1, index=alias_idx)
            ##########################################
            # GPU-side computation (attention and scoring)
            scores = model.compute_scores(seq_hidden, mask)
            torch.cuda.synchronize()
            ##########################################
            # GPU-side computation (prediction and loss update)
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(scores, targets - 1)
            assert not torch.isnan(loss).any(), "Loss contains NaNs"
            assert not torch.isnan(loss), "Loss is NaN"
            if scores.shape[0] != targets.shape[0]:
                print(f"[{i}] scores: {scores.shape}, targets: {targets.shape}", flush=True)
            if scores.device != targets.device:
                print(f"[{i}] score device: {scores.device}", flush=True)
                print(f"[{i}] target device: {targets.device}", flush=True)
            # assert scores.shape[0] == targets.shape[0], "Mismatch between scores and targets"
            # assert scores.device == targets.device, "scores and targets must be on the same device"
            loss.backward()
            ##########################################
            # Move prediction values back to CPU
            # Test NumPy conversion
            result_np = loss.detach().cpu().numpy()  # changed from loss.item(), but still causes the error
            _ = np.log(np.clip(result_np, 1e-8, None))
            # loss_value = max(loss.item(), 1e-8)
            # _ = math.log(loss_value)
            # loss_value = torch.clamp(loss, min=1e-8)
            # _ = torch.log(loss_value)
            torch.cuda.synchronize()
            if i % 1000 == 0:
                print(f"[{i}] Loss: {result_np:.6f}", flush=True)
            # if i % 1000 == 0:
            #     print(f"[{i}] loss: {loss.item():.6f}, hidden max: {hidden.max().item():.4f}, has NaN: {torch.isnan(hidden).any()}", flush=True)
            #     print(f"[{i}] scores shape: {scores.shape}, targets shape: {targets.shape}", flush=True)
            #     print(f"[{i}] loss dtype: {loss.dtype}, device: {loss.device}, value: {loss.item():.6f}", flush=True)
            if i % 1000 == 0:
                loss_value = loss.detach()  # no item()
                hidden_max = hidden.max().detach()
                has_nan = torch.isnan(hidden).any().detach()
                print(f"[{i}] loss: {loss_value:.6f}, hidden max: {hidden_max:.4f}, has NaN: {bool(has_nan)}", flush=True)
        except Exception:
            print(f"\n🔥 Exception at iteration {i}: {traceback.format_exc()}", flush=True)
            break
for i in tqdm(range(0, 2000), desc='progress'):
    print(f'loop {i}th')
    run_dummy_loop()
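In case it helps anyone trying to reproduce this: the script above can also be run with synchronous kernel launches (a standard CUDA/PyTorch debugging switch, not something I have shown in my logs here), so the device-side error is reported at the launch that actually fails rather than at a later synchronize:

```python
# Set before the first CUDA call in the process, e.g. at the very top of the script,
# so the CUDA runtime picks it up when the context is created.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch  # kernel launches are now synchronous; errors point at the real call site
```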