Hello,
I’m trying to learn how to use PyTorch and have been following along with the video series “Training Model - Deep Learning and Neural Networks with Python and Pytorch p.4” (YouTube), but I’m getting different results when I run these examples on my AGX compared to a standard Ubuntu VM on my normal desktop. On my AGX I’m using PyTorch 1.7 from the post “PyTorch for Jetson”, and on my VM I installed it with pip3 install torch torchvision.
The problem is when I try to run the code in the video linked above, on my AGX I get the following results:
nvidia@xavier:~/src/cuda_projects/pytorch$ python3 pytorchHello2.py
Net(
(fc1): Linear(in_features=784, out_features=64, bias=True)
(fc2): Linear(in_features=64, out_features=64, bias=True)
(fc3): Linear(in_features=64, out_features=64, bias=True)
(fc4): Linear(in_features=64, out_features=10, bias=True)
)
tensor(3.3226e+08, grad_fn=<NllLossBackward>)
tensor(1.7130e+10, grad_fn=<NllLossBackward>)
tensor(-2.7093e+09, grad_fn=<NllLossBackward>)
Accuracy: 0.112
nvidia@xavier:~/src/cuda_projects/pytorch$ python3 pytorchHello2.py
Net(
(fc1): Linear(in_features=784, out_features=64, bias=True)
(fc2): Linear(in_features=64, out_features=64, bias=True)
(fc3): Linear(in_features=64, out_features=64, bias=True)
(fc4): Linear(in_features=64, out_features=10, bias=True)
)
tensor(nan, grad_fn=<NllLossBackward>)
tensor(nan, grad_fn=<NllLossBackward>)
tensor(nan, grad_fn=<NllLossBackward>)
Accuracy: 0.099
The majority of the runs give nan losses, with an accuracy of about 0.099.
However, on my Ubuntu VM with the same code, I get the expected results:
tensor(0.3005, grad_fn=<NllLossBackward>)
tensor(0.0167, grad_fn=<NllLossBackward>)
tensor(0.0097, grad_fn=<NllLossBackward>)
Accuracy: 0.968
I’m not sure what I’m doing wrong, or perhaps I missed something during my setup? I would appreciate any suggestions on how to fix this issue with PyTorch on the AGX.
Python code that I’m using on both machines:
import torch
import torchvision
from torchvision import transforms, datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
# MNIST training split; each sample is passed through ToTensor.
train = datasets.MNIST(
    root="",
    train=True,
    download=True,
    transform=transforms.Compose([transforms.ToTensor()]),
)
# MNIST test split, prepared the same way as the training split.
test = datasets.MNIST(
    root="",
    train=False,
    download=True,
    transform=transforms.Compose([transforms.ToTensor()]),
)

# Both loaders yield shuffled batches of 10 samples.
trainset = torch.utils.data.DataLoader(dataset=train, batch_size=10, shuffle=True)
testset = torch.utils.data.DataLoader(dataset=test, batch_size=10, shuffle=True)
class Net(nn.Module):
    """Fully connected classifier for 28x28 images with 10 output classes.

    Three hidden layers of 64 units with ReLU activations are followed by a
    10-way output layer. ``forward`` returns log-probabilities, so the result
    is meant to be paired with ``F.nll_loss``.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28*28, 64)  # 28*28 = number of pixels in each image in dataset
        self.fc2 = nn.Linear(64, 64)  # input=64 based on output of fc1
        self.fc3 = nn.Linear(64, 64)  # input=64 based on output of fc2
        self.fc4 = nn.Linear(64, 10)  # input=64 based on output of fc3, output=10 because we have 10 digits in our data

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images.

        Args:
            x: A batch whose first dimension is the batch size and whose
                remaining dimensions hold 28*28 values per sample, e.g.
                ``(N, 784)`` or ``(N, 1, 28, 28)``.

        Returns:
            A ``(N, 10)`` tensor of log-softmax scores.
        """
        # Collapse everything after the batch dimension so image-shaped input
        # works too; input that is already (N, 784) passes through unchanged.
        x = x.flatten(start_dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)  # raw class scores; no ReLU on the output layer
        return F.log_softmax(x, dim=1)
# Build the model and print its layer layout.
net = Net()
print(net)

# Adam optimizer over every parameter of the network.
optimizer = optim.Adam(net.parameters(), lr=0.001)

EPOCHS = 3  # number of full passes over the training loader

for epoch in range(EPOCHS):
    for data in trainset:
        # data is a batch of featuresets and labels
        X, y = data
        net.zero_grad()  # clear gradients left over from the previous batch
        output = net(X.view(-1, 28*28))  # flatten each image to 784 values
        loss = F.nll_loss(output, y)  # the network already outputs log-probabilities
        loss.backward()
        optimizer.step()
    # Printed once per epoch: this is the loss of the epoch's last batch.
    print(loss)
# Count correct predictions over the whole test loader.
correct = 0
total = 0

# Evaluation only: no gradients are needed.
with torch.no_grad():
    for X, y in testset:
        output = net(X.view(-1, 28*28))
        # The predicted class is the index of the largest log-probability in
        # each row; compare the whole batch against the labels at once.
        predicted = output.argmax(dim=1)
        correct += int((predicted == y).sum())
        total += output.size(0)

print("Accuracy: ", round(correct/total, 3))
# Predict the first nine images of the most recent batch held in X, using a
# single forward pass without gradient tracking.
with torch.no_grad():
    predictions = net(X[:9].view(-1, 28*28)).argmax(dim=1)

# Print the predictions three per line, matching the 3x3 image grid below.
for row_start in range(0, 9, 3):
    print(*predictions[row_start:row_start + 3])

# Show the same nine images in a 3x3 grid.
fig = plt.figure(figsize=(28, 28))
for i in range(9):
    fig.add_subplot(3, 3, i+1)
    plt.imshow(X[i].view(28,28))
plt.show()
Cheers
-T