Hi!
I’m trying to make my network utilize the Tensor Cores of my A100. To debug this, I’ve created a simple model with a single convolutional layer:
import torch.nn as nn
import torch
import nvidia_dlprof_pytorch_nvtx
from triton.testing import do_bench
import contextlib
class TestNet(nn.Module):
    """Minimal network with a single 3x3 convolution (64 -> 512 channels)."""

    def __init__(self):
        super().__init__()
        # padding=1 with a 3x3 kernel keeps the spatial size of the input.
        self.conv1 = nn.Conv2d(64, 512, kernel_size=(3, 3), padding=1)

    def forward(self, x):
        """Apply the convolution to ``x`` and return the result."""
        return self.conv1(x)
def run_step(opt, model):
    """Run one forward/backward/optimizer step on a random FP16 batch.

    Args:
        opt: Optimizer; ``zero_grad()`` is called before the forward pass and
            ``step()`` after the backward pass.
        model: Module to run; it is cast to half precision before the forward
            pass.

    Reads the module-level ``device`` assigned in the ``__main__`` section.
    """
    # Pure-FP16 path: ``torch.cuda.amp.autocast()`` is left disabled here; the
    # model and the input are cast to float16 explicitly instead.
    opt.zero_grad()
    # Allocate the batch directly on the target device. Creating it on the CPU
    # and calling ``.to(device)`` adds a host-to-device copy to every step and
    # makes ``x`` a non-leaf whose gradient is sent back to the CPU tensor,
    # both of which show up in the profile.
    x = torch.randn(64, 64, 128, 128, dtype=torch.float16, device=device,
                    requires_grad=True)
    # Repeated on every call; once the parameters are float16 this has
    # nothing left to convert.
    model = model.half()
    out = model(x)
    loss = out.sum()
    loss.backward()
    opt.step()
if __name__ == '__main__':
    # Kept at module level on purpose: run_step() reads ``device`` as a global.
    device = 'cuda:0'

    # Mode switches: NVTX annotation for DLProf, triton benchmarking, and the
    # PyTorch profiler can each be toggled independently.
    do_dlprof = True
    do_benchmark = False
    do_profiler = False
    # Placeholder: set this to the trace output directory before enabling
    # ``do_profiler`` (it is only read when the profiler is constructed).
    profiler_path = ...

    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    if do_dlprof:
        nvidia_dlprof_pytorch_nvtx.init()

    # Seed before building the model so that the initial weights, not only the
    # random inputs, are reproducible between runs.
    torch.manual_seed(123)

    # Move the model to the GPU first and construct the optimizer afterwards,
    # so the optimizer is built from the parameters as they live on the device.
    model = TestNet().to(device)
    opt = torch.optim.Adam(model.parameters())

    dlprof_ctx = torch.autograd.profiler.emit_nvtx(enabled=do_dlprof)
    with dlprof_ctx:
        if do_profiler:
            # warmup=5 + active=10 matches the 15 iterations of the loop below.
            profiler_ctx = torch.profiler.profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA,
                ],
                schedule=torch.profiler.schedule(
                    skip_first=0, wait=0, warmup=5, active=10, repeat=1),
                on_trace_ready=torch.profiler.tensorboard_trace_handler(
                    dir_name=profiler_path),
                record_shapes=True,
                profile_memory=False,
                with_stack=True,
            )
        else:
            # nullcontext yields None, so ``p`` is only used when profiling.
            profiler_ctx = contextlib.nullcontext()

        with profiler_ctx as p:
            for i in range(15):
                if do_benchmark:
                    # Report the measurement; previously the value returned by
                    # do_bench was discarded, so benchmark mode printed nothing.
                    elapsed = do_bench(lambda: run_step(opt, model),
                                       warmup=10, rep=10)
                    print(f'do_bench[{i}]: {elapsed}')
                else:
                    run_step(opt, model)
                if do_profiler:
                    p.step()
I’ve already tried:
- making the dimensions of the input tensor and the in/out channel counts divisible by 8
- using mixed precision / half precision
- switching to TF32
But no matter what I do, the DLProf profiler reports that Tensor Cores are not used for the convolution operation.
At the same time, the PyTorch profiler reports that Tensor Cores were used.
If it helps, the name of the convolution kernel that was used is ‘cutlass_tensorop_f16_s16816fprop_optimized_f16_128x128_32x3_nhwc’.
Why are the results of the two profilers inconsistent, and are the Tensor Cores actually being used?