Pytorch 1.7 nan results

Hello,

I’m trying to learn how to use PyTorch and have been following along with this video series https://www.youtube.com/watch?v=9j-_dOze4IM but I’m getting different results when I run these examples on my AGX than when I run them on a standard Ubuntu VM on my normal desktop. On my AGX I’m using PyTorch 1.7 from the post “PyTorch for Jetson - version 1.7.0 now available”, and on my VM I used pip3 install torch torchvision.

The problem is when I try to run the code in the video linked above, on my AGX I get the following results:

nvidia@xavier:~/src/cuda_projects/pytorch$ python3 pytorchHello2.py 
Net(
  (fc1): Linear(in_features=784, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=10, bias=True)
)
tensor(3.3226e+08, grad_fn=<NllLossBackward>)
tensor(1.7130e+10, grad_fn=<NllLossBackward>)
tensor(-2.7093e+09, grad_fn=<NllLossBackward>)
Accuracy:  0.112

nvidia@xavier:~/src/cuda_projects/pytorch$ python3 pytorchHello2.py 
Net(
  (fc1): Linear(in_features=784, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=10, bias=True)
)
tensor(nan, grad_fn=<NllLossBackward>)
tensor(nan, grad_fn=<NllLossBackward>)
tensor(nan, grad_fn=<NllLossBackward>)
Accuracy:  0.099

The majority of the runs give nan results like this, with an accuracy of 0.099.

However on my Ubuntu VM with the same code, I get the expected results:

tensor(0.3005, grad_fn=<NllLossBackward>)
tensor(0.0167, grad_fn=<NllLossBackward>)
tensor(0.0097, grad_fn=<NllLossBackward>)
Accuracy:  0.968

I’m not sure what I’m doing wrong, or perhaps I missed something during my setup? I would appreciate any suggestions on how to fix this issue with Pytorch on the AGX.

Python code that I’m using on both machines:

import torch
import torchvision
from torchvision import transforms, datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Settings shared by both MNIST splits: fetch the files if needed and turn
# every image into a tensor as it is read.
_mnist_options = dict(
  download = True,
  transform = transforms.Compose([transforms.ToTensor()]),
)

train = datasets.MNIST("", train = True, **_mnist_options)

test = datasets.MNIST("", train = False, **_mnist_options)

# Both loaders hand out shuffled mini-batches of 10 (image, label) pairs.
trainset = torch.utils.data.DataLoader(train, batch_size = 10, shuffle = True)
testset = torch.utils.data.DataLoader(test, batch_size = 10, shuffle = True)

class Net(nn.Module):
  """Fully connected classifier: 784 inputs, three 64-unit ReLU layers, 10 outputs.

  The output of ``forward`` is a log-probability per class (log-softmax over
  dim 1), which is the form ``F.nll_loss`` consumes.
  """

  def __init__(self):
    super().__init__()

    pixels_per_image = 28 * 28  # one input feature per pixel of a 28x28 image
    hidden_width = 64
    digit_classes = 10  # one output per digit

    # Each layer's input width is the previous layer's output width.
    self.fc1 = nn.Linear(pixels_per_image, hidden_width)
    self.fc2 = nn.Linear(hidden_width, hidden_width)  # fed by fc1
    self.fc3 = nn.Linear(hidden_width, hidden_width)  # fed by fc2
    self.fc4 = nn.Linear(hidden_width, digit_classes)  # fed by fc3

  def forward(self, x):
    """Return per-class log-probabilities for a batch of flattened images.

    ``x`` must have 784 features in its last dimension; log-softmax is taken
    over dim 1, so a 2-D ``(batch, 784)`` input is assumed.
    """
    activations = x
    for hidden_layer in (self.fc1, self.fc2, self.fc3):
      activations = F.relu(hidden_layer(activations))

    # The last layer stays linear; log_softmax normalises its raw scores.
    scores = self.fc4(activations)
    return F.log_softmax(scores, dim=1)

# Create the model and print its layer summary.
net = Net()
print(net)

optimizer = optim.Adam(net.parameters(), lr=0.001)

EPOCHS = 3

for epoch in range(EPOCHS):
  for X, y in trainset:
    # X is one batch of images, y the matching labels.
    net.zero_grad()  # drop gradients accumulated by the previous batch
    flattened = X.view(-1, 28*28)  # one row of 784 pixel values per image
    log_probs = net(flattened)
    loss = F.nll_loss(log_probs, y)
    loss.backward()
    optimizer.step()
  # Shows the loss of the final batch of the epoch, not an epoch average.
  print(loss)

# Running tally of correctly classified test images and images seen.
correct = 0
total = 0

# Gradients are not needed to measure accuracy, so autograd is switched off.
with torch.no_grad():
  for data in testset:
    X, y = data
    output = net(X.view(-1, 28*28))
    # Score the whole batch in one tensor operation instead of comparing
    # one sample at a time in Python: the predicted class is the index of
    # the largest log-probability in each row.
    predicted = torch.argmax(output, dim=1)
    correct += (predicted == y).sum().item()
    total += output.size(0)

print("Accuracy: ", round(correct/total, 3))

# X still holds the last batch read by the evaluation loop. Predict the class
# of its first nine images one at a time and print them three per line.
predictions = [torch.argmax(net(X[i].view(-1, 28*28))[0]) for i in range(9)]
for row_start in (0, 3, 6):
  print(*predictions[row_start:row_start + 3])

fig = plt.figure(figsize=(28, 28))

# Draw the same nine images in a 3x3 grid, in the order they were printed.
for position, image in enumerate(X[:9], start=1):
  fig.add_subplot(3, 3, position)
  plt.imshow(image.view(28,28))
plt.show()

Cheers
-T

Hi,

May I know which JetPack version you are using?
Please note that the PyTorch v1.7.0 package is built for JetPack 4.4 or JetPack 4.4.1.

Thanks

nvidia@xavier:~$ cat /etc/nv_tegra_release
# R32 (release), REVISION: 4.4, GCID: 23942405, BOARD: t186ref, EABI: aarch64, DATE: Fri Oct 16 19:37:08 UTC 2020

I did some more Pytorch examples from the same YouTube channel, and those seem to be working fine. So now I’m really confused, perhaps a Python issue?

I did notice that in some of his examples he was getting multi-digit float values while the same code on my AGX was just showing a 0. or 1.

Hi,

We are going to reproduce this and give you more information later.
Thanks.

1 Like

I made a few modifications to the example code from the YouTube video, just to put everything on the gpu instead of the cpu and it works fine. Not really sure what to make of the nan issue on the cpu. Here is the output when running on the GPU.

nvidia@xavier:~/src/cuda_projects/pytorch$ python3 pytorchHello2.py
Net(
  (fc1): Linear(in_features=784, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=10, bias=True)
)
tensor(0.0204, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.6798, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.1874, device='cuda:0', grad_fn=<NllLossBackward>)

Updated code below. I’m sure there are probably better ways to do this, but I’m very new to ML and GPUs so this seemed like the path of least resistance.

import torch
import torchvision
from torchvision import transforms, datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Settings shared by both MNIST splits: fetch the files if needed and turn
# every image into a tensor as it is read.
_mnist_options = dict(
  download = True,
  transform = transforms.Compose([transforms.ToTensor()]),
)

train = datasets.MNIST("", train = True, **_mnist_options)

test = datasets.MNIST("", train = False, **_mnist_options)

# Mini-batches of 10; only the training data is shuffled.
trainset = torch.utils.data.DataLoader(train, batch_size = 10, shuffle = True)
testset = torch.utils.data.DataLoader(test, batch_size = 10, shuffle = False)

# Device handles used to move the model and tensors between processors.
gpu = torch.device("cuda:0")
cpu = torch.device("cpu:0")

class Net(nn.Module):
  """Fully connected classifier: 784 inputs, three 64-unit ReLU layers, 10 outputs.

  The output of ``forward`` is a log-probability per class (log-softmax over
  dim 1), which is the form ``F.nll_loss`` consumes.
  """

  def __init__(self):
    super().__init__()

    pixels_per_image = 28 * 28  # one input feature per pixel of a 28x28 image
    hidden_width = 64
    digit_classes = 10  # one output per digit

    # Each layer's input width is the previous layer's output width.
    self.fc1 = nn.Linear(pixels_per_image, hidden_width)
    self.fc2 = nn.Linear(hidden_width, hidden_width)  # fed by fc1
    self.fc3 = nn.Linear(hidden_width, hidden_width)  # fed by fc2
    self.fc4 = nn.Linear(hidden_width, digit_classes)  # fed by fc3

  def forward(self, x):
    """Return per-class log-probabilities for a batch of flattened images.

    ``x`` must have 784 features in its last dimension; log-softmax is taken
    over dim 1, so a 2-D ``(batch, 784)`` input is assumed.
    """
    activations = x
    for hidden_layer in (self.fc1, self.fc2, self.fc3):
      activations = F.relu(hidden_layer(activations))

    # The last layer stays linear; log_softmax normalises its raw scores.
    scores = self.fc4(activations)
    return F.log_softmax(scores, dim=1)

# Create the model, move its parameters to the GPU, and print its layers.
net = Net().to(gpu)
print(net)

optimizer = optim.Adam(net.parameters(), lr=0.001)

EPOCHS = 3

for epoch in range(EPOCHS):
  for X, y in trainset:
    # X is one batch of images, y the matching labels; both come out of the
    # loader on the CPU and are copied to the GPU before use.
    net.zero_grad()  # drop gradients accumulated by the previous batch
    inputs = X.view(-1, 28*28).to(gpu)
    log_probs = net(inputs)
    targets = y.to(gpu)
    loss = F.nll_loss(log_probs, targets)
    loss.backward()
    optimizer.step()
  # Shows the loss of the final batch of the epoch, not an epoch average.
  print(loss)

# Running tally of correctly classified test images and images seen.
correct = 0
total = 0

# Gradients are not needed to measure accuracy, so autograd is switched off.
with torch.no_grad():
  for data in testset:
    X, y = data
    output = net(X.view(-1,784).to(gpu))
    print(output)
    # Score the whole batch on the GPU and bring back a single number.
    # Comparing each prediction to its CPU label inside an `if` would force
    # a device-to-host synchronisation for every test image.
    predicted = torch.argmax(output, dim=1)
    correct += (predicted == y.to(gpu)).sum().item()
    total += output.size(0)

print("Accuracy: ", round(correct/total, 3))

# X still holds the last batch read by the evaluation loop (a CPU tensor).
# Predict the class of its first nine images one at a time on the GPU and
# print them three per line.
predictions = [torch.argmax(net(X[i].view(-1, 28*28).to(gpu))[0]) for i in range(9)]
for row_start in (0, 3, 6):
  print(*predictions[row_start:row_start + 3])

fig = plt.figure(figsize=(28, 28))

# Draw the same nine images in a 3x3 grid, in the order they were printed.
for position, image in enumerate(X[:9], start=1):
  fig.add_subplot(3, 3, position)
  plt.imshow(image.view(28,28).to(cpu))
plt.show()

Hi,

Thanks for your feedback.

A possible reason is that Jetson doesn’t support concurrent access to a memory buffer.
So if the CPU and GPU try to access the same buffer at the same time (e.g., adjacent layers running on different processors), the returned data will be undefined.

Thanks.