I want to run a Docker image that uses the GPU on my Jetson, but I found that the GPU is not being used.

My device is a Jetson Orin Nano running JetPack 6.2.1.

My Docker version:
root@nvidia-desktop:/home/nvidia# docker version
Client: Docker Engine - Community
 Version:           28.3.2
 API version:       1.51
 Go version:        go1.24.5
 Git commit:        578ccf6
 Built:             Wed Jul 9 16:13:42 2025
 OS/Arch:           linux/arm64
 Context:           default

Server: Docker Engine - Community
 Engine:
  Version:          28.3.2
  API version:      1.51 (minimum version 1.24)
  Go version:       go1.24.5
  Git commit:       e77ff99
  Built:            Wed Jul 9 16:13:42 2025
  OS/Arch:          linux/arm64
  Experimental:     true
 containerd:
  Version:          1.7.27
  GitCommit:        05044ec0a9a75232cad458027ca83437aae3f4da
 nvidia:
  Version:          1.2.5
  GitCommit:        v1.2.5-0-g59923ef
 docker-init:
  Version:          0.19.0
  GitCommit:        de40ad0

NVIDIA Container Toolkit version:

root@nvidia-desktop:/home/nvidia# nvidia-container-runtime --version
NVIDIA Container Runtime version 1.17.8
commit: f202b80a9b9d0db00d9b1d73c0128c8962c55f4d
spec: 1.2.1

runc version 1.2.5
commit: v1.2.5-0-g59923ef
spec: 1.2.0
go: go1.23.7
libseccomp: 2.5.3
root@nvidia-desktop:/home/nvidia# docker info | grep Runtimes
Runtimes: io.containerd.runc.v2 nvidia runc
root@nvidia-desktop:/home/nvidia#
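
As a quick sanity check (assuming the daemon has picked up the "default-runtime" entry from the daemon.json below), docker info also reports a Default Runtime field, which should show nvidia:

# check which runtime the daemon uses when --runtime is not given
docker info | grep -i "default runtime"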

Docker daemon.json:
root@nvidia-desktop:/home/nvidia# cat /etc/docker/daemon.json
{
  "builder": {
    "gc": {
      "defaultKeepStorage": "20GB",
      "enabled": true
    }
  },
  "data-root": "/var/lib/docker",
  "exec-opts": [
    "native.cgroupdriver=systemd"
  ],
  "experimental": true,
  "features": {
    "buildkit": true
  },
  "log-driver": "json-file",
  "log-opts": {
    "max-file": "60",
    "max-size": "500m"
  },
  "registry-mirrors": [
    "https://ghcr.ikubernetes.xyz",
    "https://cloudsmith.ikubernetes.xyz",
    "https://docker.ikubernetes.xyz",
    "https://quey.ikubernetes.xyz",
    "https://gcr.ikubernetes.xyz",
    "https://k8s-gcr.ikubernetes.xyz",
    "https://docker.registry.cyou",
    "https://docker-cf.registry.cyou",
    "https://dockercf.jsdelivr.fyi",
    "https://docker.jsdelivr.fyi",
    "https://dockertest.jsdelivr.fyi",
    "https://mirror.aliyuncs.com",
    "https://dockerproxy.com",
    "https://mirror.baidubce.com",
    "https://docker.m.daocloud.io",
    "https://docker.nju.edu.cn",
    "https://docker.mirrors.sjtug.sjtu.edu.cn",
    "https://docker.mirrors.ustc.edu.cn",
    "https://mirror.iscas.ac.cn",
    "https://docker.rainbond.cc",
    "https://k8s..ikubernetes.xyz"
  ],
  "runtimes": {
    "nvidia": {
      "args": [],
      "path": "nvidia-container-runtime"
    }
  },
  "default-runtime": "nvidia"
}
root@nvidia-desktop:/home/nvidia#
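
Whenever daemon.json is changed, the daemon has to be restarted for the settings to take effect. As a side note, the nvidia runtime entry can also be (re)generated by the toolkit itself; this is just a sketch of the commands I would use:

# optionally let the toolkit write the nvidia runtime entry into /etc/docker/daemon.json
sudo nvidia-ctk runtime configure --runtime=docker
# restart Docker so the daemon.json changes take effect
sudo systemctl restart docker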

Testing the Jetson GPU from Docker
PyTorch training code, mnist.py:

from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--no-mps', action='store_true', default=False,
                        help='disables macOS GPU training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    use_mps = not args.no_mps and torch.backends.mps.is_available()

    torch.manual_seed(args.seed)

    if use_cuda:
        device = torch.device("cuda")
    elif use_mps:
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
        ])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == '__main__':
    main()
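
For reference, a small check script like the one below shows what PyTorch itself detects, which makes it easy to compare the host with the container (a minimal sketch; the name cuda_check.py is just an example):

# cuda_check.py - print what PyTorch reports about CUDA
import torch

print("torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device count:", torch.cuda.device_count())
    print("device name:", torch.cuda.get_device_name(0))
else:
    # On Jetson this usually means the torch wheel was built without CUDA
    # or the container does not match the host's JetPack/L4T version.
    print("falling back to CPU")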

Create a Docker image using a Dockerfile, based on the NVIDIA L4T PyTorch base image:
FROM nvcr.io/nvidia/l4t-pytorch:r35.2.1-pth2.0-py3

COPY pytorch-mnist.py /home/

docker build -t mnist:1.0 .
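
Before starting a full training run, the image can be checked directly (same mnist:1.0 tag as above; a sketch, assuming torch is importable in the base image):

docker run --rm --runtime nvidia mnist:1.0 \
  python3 -c "import torch; print(torch.__version__, torch.cuda.is_available())"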

Run a container from the image, then execute the training script inside it:

docker run -it --runtime nvidia mnist:1.0 /bin/bash

python3 /home/pytorch-mnist.py

I found that the GPU was not being used; only the CPU was used.
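
For reference, GPU load can be watched from the host while the container is training (tegrastats ships with JetPack; the GR3D_FREQ field is the GPU utilization):

# watch GPU utilization on the Jetson host
sudo tegrastats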

When I run mnist.py directly on Jetson, I find that it can use the GPU.

I don’t know where I made the mistake. Can you help me?

Hi,

It looks like you are using the r35.2.1 image (from JetPack 5).
Which JetPack version do you use for setting up the device?

Thanks.