Very low CUDA performance for PyTorch in Windows and WSL2

Host Env:
CPU: Intel Core i7-7700HQ @ 2.8 GHz
RAM: 16GB
GPU: NVIDIA GeForce GTX 1050 Ti
OS: 64-bit Windows 10 Home 2004 (build 20241.1005)
CUDA: 11.2
conda: 4.8.4
Python: 3.7.5
PyTorch: 1.5.1
torchtext: 0.6.0
cudatoolkit: 10.1
cuDNN: 7

WSL2 Env:
OS: Ubuntu 18.04
conda: 4.8.4
Python: 3.7.5
PyTorch: 1.5.1
torchtext: 0.6.0
cudatoolkit: 10.1
cuDNN: 7

Docker Env:
Docker: 19.03.13
conda: 4.8.4
Python: 3.7.5
PyTorch: 1.5.0
torchtext: 0.6.0
cudatoolkit: 10.1
cuDNN: 7

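For reference, the versions that PyTorch itself uses can be confirmed inside each environment with a short snippet like the one below (a minimal sketch; note that the conda cudatoolkit 10.1 is the runtime PyTorch links against, independently of the system-wide CUDA 11.2 install):

import torch

# Report what the installed PyTorch build actually sees in this environment.
print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("bundled CUDA runtime:", torch.version.cuda)     # e.g. '10.1'
print("cuDNN:", torch.backends.cudnn.version())        # an integer like 7603
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))
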
I tested two PyTorch LeNet-5 training scripts in the three environments, one using the CPU and the other using the GPU. I am confused that the GPU performance is so poor, and that training on Windows is much slower than in the virtualized Linux and in Docker. I can't attach screenshots, so please check the time logs below:

Windows output:

  • training on cuda
    epoch 1, loss 1.7694, train acc 0.353, test acc 0.581, time 7.8 sec
    epoch 2, loss 0.4776, train acc 0.628, test acc 0.687, time 6.7 sec
    epoch 3, loss 0.2609, train acc 0.711, test acc 0.717, time 6.7 sec
    epoch 4, loss 0.1738, train acc 0.738, test acc 0.743, time 6.6 sec
    epoch 5, loss 0.1276, train acc 0.754, test acc 0.756, time 6.7 sec

  • training on cpu
    epoch 1, loss 1.8857, train acc 0.304, test acc 0.586, time 20.8 sec
    epoch 2, loss 0.4707, train acc 0.634, test acc 0.676, time 20.2 sec
    epoch 3, loss 0.2549, train acc 0.719, test acc 0.719, time 20.7 sec
    epoch 4, loss 0.1698, train acc 0.743, test acc 0.741, time 21.4 sec
    epoch 5, loss 0.1257, train acc 0.758, test acc 0.755, time 22.0 sec

WSL2 Ubuntu output:

  • training on cuda
    epoch 1, loss 1.8913, train acc 0.305, test acc 0.574, time 10.7 sec
    epoch 2, loss 0.4851, train acc 0.619, test acc 0.665, time 10.3 sec
    epoch 3, loss 0.2610, train acc 0.710, test acc 0.729, time 9.9 sec
    epoch 4, loss 0.1728, train acc 0.737, test acc 0.743, time 10.3 sec
    epoch 5, loss 0.1285, train acc 0.751, test acc 0.749, time 10.2 sec

  • training on cpu
    epoch 1, loss 1.8504, train acc 0.322, test acc 0.583, time 6.5 sec
    epoch 2, loss 0.4723, train acc 0.635, test acc 0.676, time 6.3 sec
    epoch 3, loss 0.2590, train acc 0.713, test acc 0.723, time 5.6 sec
    epoch 4, loss 0.1717, train acc 0.739, test acc 0.740, time 5.6 sec
    epoch 5, loss 0.1268, train acc 0.754, test acc 0.750, time 5.6 sec

WSL2 Ubuntu Docker output:

  • training on cuda
    epoch 1, loss 1.8288, train acc 0.325, test acc 0.588, time 11.0 sec
    epoch 2, loss 0.4789, train acc 0.622, test acc 0.674, time 10.7 sec
    epoch 3, loss 0.2598, train acc 0.713, test acc 0.727, time 10.6 sec
    epoch 4, loss 0.1707, train acc 0.739, test acc 0.747, time 10.9 sec
    epoch 5, loss 0.1251, train acc 0.757, test acc 0.760, time 10.7 sec

  • training on cpu
    epoch 1, loss 1.8938, train acc 0.302, test acc 0.561, time 5.8 sec
    epoch 2, loss 0.4748, train acc 0.641, test acc 0.682, time 5.8 sec
    epoch 3, loss 0.2510, train acc 0.718, test acc 0.728, time 5.7 sec
    epoch 4, loss 0.1673, train acc 0.742, test acc 0.731, time 5.8 sec
    epoch 5, loss 0.1236, train acc 0.758, test acc 0.757, time 6.1 sec

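One caveat about these timings: each epoch includes Fashion-MNIST data loading and host-to-device copies, and CUDA kernels launch asynchronously, so the per-epoch wall time does not isolate GPU compute. A minimal sketch (a dummy batch plus the LeNet class from the script below; the helper name and iteration counts are my own assumptions) that times only the forward/backward/step on the GPU would look like this:

import time
import torch

def time_gpu_step(net, n_iters=50, batch_size=256):
    # Assumes a CUDA device is available; uses a dummy Fashion-MNIST-shaped
    # batch so the DataLoader is taken out of the measurement entirely.
    device = torch.device('cuda')
    net = net.to(device)
    X = torch.randn(batch_size, 1, 28, 28, device=device)
    y = torch.randint(0, 10, (batch_size,), device=device)
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
    for _ in range(5):                   # warm-up: cuDNN setup and first-launch overhead
        optimizer.zero_grad()
        loss_fn(net(X), y).backward()
        optimizer.step()
    torch.cuda.synchronize()             # drain queued kernels before starting the clock
    start = time.time()
    for _ in range(n_iters):
        optimizer.zero_grad()
        loss_fn(net(X), y).backward()
        optimizer.step()
    torch.cuda.synchronize()             # flush asynchronous work before stopping the clock
    return (time.time() - start) / n_iters

If this per-step time turns out to be similar across Windows, WSL2, and Docker, the differences in the epoch logs are probably coming from data loading rather than from CUDA itself.
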
The two Python scripts used for the test:

  • lenet5.py
import os
import time
import torch
from torch import nn, optim

import sys
sys.path.append("..") 
import d2lzh_pytorch as d2l

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(torch.__version__)
print(device)


class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 6, 5), # in_channels, out_channels, kernel_size
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2), # kernel_size, stride
            nn.Conv2d(6, 16, 5),
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2)
        )
        self.fc = nn.Sequential(
            nn.Linear(16*4*4, 120),
            nn.Sigmoid(),
            nn.Linear(120, 84),
            nn.Sigmoid(),
            nn.Linear(84, 10)
        )

    def forward(self, img):
        feature = self.conv(img)
        output = self.fc(feature.view(img.shape[0], -1))
        return output

    
net = LeNet()
print(net)


batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)


def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        device = list(net.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval() 
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train()
            else: 
                if('is_training' in net.__code__.co_varnames): 
                    
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item() 
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item() 
            n += y.shape[0]
    return acc_sum / n


def train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    loss = torch.nn.CrossEntropyLoss()
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))


if __name__ == "__main__":
    lr, num_epochs = 0.001, 5
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
  • lenet5-cpu.py
import os
import time
import torch
from torch import nn, optim

import sys
sys.path.append("..") 
import d2lzh_pytorch as d2l

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cpu')

print(torch.__version__)
print(device)


class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 6, 5), # in_channels, out_channels, kernel_size
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2), # kernel_size, stride
            nn.Conv2d(6, 16, 5),
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2)
        )
        self.fc = nn.Sequential(
            nn.Linear(16*4*4, 120),
            nn.Sigmoid(),
            nn.Linear(120, 84),
            nn.Sigmoid(),
            nn.Linear(84, 10)
        )

    def forward(self, img):
        feature = self.conv(img)
        output = self.fc(feature.view(img.shape[0], -1))
        return output

    
net = LeNet()
print(net)


batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)


def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        device = list(net.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval() 
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train() 
            else: 
                if('is_training' in net.__code__.co_varnames): 
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item() 
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item() 
            n += y.shape[0]
    return acc_sum / n


def train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    loss = torch.nn.CrossEntropyLoss()
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))


if __name__ == "__main__":
    lr, num_epochs = 0.001, 5
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

The only difference between the two scripts is on line 11: the device is designated as CPU in lenet5-cpu.py.
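
Since LeNet-5 is tiny, the per-epoch time in both scripts is likely dominated by the Fashion-MNIST DataLoader and the X.to(device) copies rather than by the GPU itself. As a hedged experiment (building the loaders with torchvision directly, because I have not checked which DataLoader options d2l.load_data_fashion_mnist exposes), worker processes and pinned memory could be enabled like this:

import torch
import torchvision
import torchvision.transforms as transforms

def load_fashion_mnist(batch_size=256, num_workers=4):
    # Same dataset the d2l helper wraps, but with explicit DataLoader tuning.
    transform = transforms.ToTensor()
    train_set = torchvision.datasets.FashionMNIST(
        root='./data', train=True, download=True, transform=transform)
    test_set = torchvision.datasets.FashionMNIST(
        root='./data', train=False, download=True, transform=transform)
    train_iter = torch.utils.data.DataLoader(
        train_set, batch_size=batch_size, shuffle=True,
        num_workers=num_workers,  # parallel loading; worker processes start noticeably slower on Windows
        pin_memory=True)          # pinned host memory speeds up .to('cuda') copies
    test_iter = torch.utils.data.DataLoader(
        test_set, batch_size=batch_size, shuffle=False,
        num_workers=num_workers, pin_memory=True)
    return train_iter, test_iter

With pinned memory the copies in train_ch5 could also be issued as X.to(device, non_blocking=True). How much any of this helps depends on how much of an epoch is really spent in data loading in each environment.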