Description
I’ve converted an hrnet-w18 landmark detection model to onnx then TensorRT format and to test I’m trying to inference a dummy pyTorch tensor of all 1’s
im = torch.ones(1,3,256,256).half().to(device)
but for some reason the input array as stored in bindings[‘input.1’].data is not an array of ones and is instead a random assortment of values sometimes containing NaN. As a result, the output bindings[‘3450’].data is also often (but not always) NaN. I’m extremely confused about how a fixed input can result in undetermined and erroneous outputs as well as how to fix it.
Sample debugging messages showing the actual input to the model and the output:
Input
Binding(name='input.1', dtype=<class 'numpy.float16'>, shape=(1, 3, 256, 256), data=tensor([[[[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.5034e-06,
0.0000e+00, 0.0000e+00],
[ 0.0000e+00, 0.0000e+00, -1.2500e-01, ..., 1.4901e-06,
0.0000e+00, 0.0000e+00],
[ 0.0000e+00, 0.0000e+00, -1.5360e+03, ..., 8.3447e-06,
0.0000e+00, 0.0000e+00],
...,
[-8.8882e-03, -5.8057e-01, 5.1611e-01, ..., -2.6525e+02,
1.1024e-02, -1.9170e+00],
[-3.4618e-04, -3.3888e+04, 8.4400e+03, ..., 4.9706e-03,
1.5656e+04, 8.2375e+01],
[ 1.2891e+00, 1.6976e+04, 1.1289e+01, ..., -3.2663e-05,
-2.0625e+00, 7.4625e+01]],
[[ 1.2939e-01, 1.5242e+01, -7.2960e+03, ..., -6.6188e+01,
nan, -7.0870e-05],
[ 1.2947e-02, 3.2723e-05, 6.7600e+03, ..., -3.2723e-05,
-7.8040e+03, -6.5193e-03],
[ 1.1078e-02, -1.0040e-02, -1.1456e+02, ..., 4.8157e-02,
-2.5120e+04, -4.1062e+01],
...,
[-1.3893e-02, 2.8620e+03, 2.9600e+02, ..., -7.6133e+00,
2.7817e-02, 5.0163e-03],
[-1.5748e-04, -1.9033e+00, 9.0625e+01, ..., 7.8918e-02,
-1.6928e+04, 8.9062e-01],
[-1.0640e+03, -4.4507e-01, 5.5054e-02, ..., -9.0950e+02,
8.4758e-05, 4.3583e-04]],
[[ 1.2684e-04, -6.0005e-03, 3.6346e-02, ..., 9.7875e+01,
5.4368e+04, -9.3317e-04],
[-1.4962e+02, -7.7546e-05, 1.0147e-03, ..., 7.8250e+02,
6.8000e+03, -7.0572e-05],
[-2.7786e-02, nan, -1.0339e-01, ..., 2.9907e-03,
-4.6680e+00, -2.8419e-04],
...,
[ 1.2244e+02, 1.8960e+04, -3.1200e+03, ..., 4.4727e-01,
-5.3312e+01, 5.6055e+00],
[-5.8528e+04, nan, 1.3130e+03, ..., 1.7226e-04,
-2.9102e-01, -8.5100e+02],
[ 1.4648e+04, 5.8624e+04, 2.1120e+03, ..., 3.7094e+01,
-9.8375e+01, -3.2723e-05]]]], device='cuda:0', dtype=torch.float16), ptr=140600680644608)
Output
Binding(name='3450', dtype=<class 'numpy.float16'>, shape=(1, 4, 64, 64), data=tensor([[[[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
...,
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan]],
[[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
...,
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan]],
[[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
...,
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan]],
[[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
...,
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan]]]], device='cuda:0',
dtype=torch.float16), ptr=140600681037824)
Environment
GPU: RTX 3070
Pytorch version: 1.12.1
CUDA: V11.1.105
TensorRT: 8.4.2
Relevant Files
Files (onnx, trt) can be found at: https://drive.google.com/drive/folders/1lnHGmMMcei0lfhoHSlgp4oKWEHADooqg?usp=sharing
Steps To Reproduce
The cell I’m running on Colab that produces this bug is shown below:
import sys
import torch
from PIL import Image
import cv2
import random
import time
import numpy as np
import tensorrt as trt
from PIL import Image
from pathlib import Path
from collections import OrderedDict,namedtuple
import torchvision.transforms as transforms
# inference with new TRT model
w = './model.trt'
device = torch.device('cuda:0')
# Infer TensorRT Engine
Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
logger = trt.Logger(trt.Logger.INFO)
trt.init_libnvinfer_plugins(logger, namespace="")
with open(w, 'rb') as f, trt.Runtime(logger) as runtime:
model = runtime.deserialize_cuda_engine(f.read())
bindings = OrderedDict()
for index in range(model.num_bindings):
name = model.get_binding_name(index)
dtype = trt.nptype(model.get_binding_dtype(index))
shape = tuple(model.get_binding_shape(index))
data = torch.from_numpy(np.empty(shape, dtype=np.dtype(dtype))).to(device)
bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr()))
binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items())
context = model.create_execution_context()
transform = transforms.Compose([transforms.ToTensor()])
im = torch.ones(1,3,256,256).half().to(device)
start = time.perf_counter()
binding_addrs['images'] = int(im.data_ptr())
context.execute_v2(list(binding_addrs.values()))
print("--------------------------------------------------------")
print(bindings['input.1'])
print(bindings['3450'])