# Conversion to TensorRT leads to terrible accuracy

I am trying to convert the inference part of the pytracking tomp101 algorithm to tensorrt

I’ve converted it to ONNX, and inference seems to be fine — the bounding box catches on correctly. HOWEVER, comparing the output tensors of the original model and the ONNX model using the code below shows that they differ by quite a lot (the tensor values differ, yet it still tracks objects just fine :D ):

{sample_x [dtype=float32, shape=(1, 1024, 18, 18)],
train_samples [dtype=float32, shape=(1, 1024, 18, 18)],
target_labels [dtype=float32, shape=(1, 1, 18, 18)],
train_ltrb [dtype=float32, shape=(1, 4, 18, 18)]}
[I] trt-runner-N0-01/18/24-04:45:38
---- Inference Output(s) ----
{bbreg_test_feat_enc [dtype=float32, shape=(1, 1, 256, 18, 18)],
bbreg_weights [dtype=float32, shape=(1, 256, 1, 1)],
target_scores [dtype=float32, shape=(1, 1, 18, 18)]}

# Compare the original PyTorch model's outputs against the ONNX Runtime
# session's outputs for the same inputs.
r1 = original_model(inputs)
r2 = session.run(inputs)

# Accumulate the mean absolute difference for each of the three outputs.
# NOTE(review): avg11/avg12/avg13 must be initialized (e.g. to 0) before this
# fragment; the /30 below suggests it is run over 30 samples — not shown here.
avg11=avg11+(torch.mean(torch.abs(r1[0] - torch.from_numpy(r2[0]).cuda())))
avg12=avg12+(torch.mean(torch.abs(r1[1] - torch.from_numpy(r2[1]).cuda())))
avg13=avg13+(torch.mean(torch.abs(r1[2] - torch.from_numpy(r2[2]).cuda())))

# Report the average absolute per-output difference over the 30 runs.
print(avg11/30)
print(avg12/30)
print(avg13/30)

BUT

when the model is converted to TensorRT, the accuracy drops significantly — inference results are terrible.

Does anybody have any suggestions on how to improve it? Should I modify the ONNX model with ONNX GraphSurgeon? Is there some Polygraphy tool that I could use?

Is there a trtexec conversion option that preserves accuracy?

THANK YOU

## Environment

TensorRT Version: 8.6

NVIDIA GPU: GTX 1660 Ti

NVIDIA Driver Version: 546.01

CUDA Version: 12.1

CUDNN Version: 8.9.7

Operating System:

Python Version (if applicable): 3.10.13

PyTorch Version (if applicable): 2.1.2+cu121

Baremetal or Container (if so, version): no environment

## Relevant Files

Hi @ttomukas740 ,

Thanks

Hello, I apologize to you for waiting so long for an answer, I honestly didn’t see your answer.
I am using Polygraphy for inference, and my fix was to move the tensors to the CPU before running inference with Polygraphy. When passing tensors on a CUDA device to Polygraphy inference, we can sometimes get accuracy issues.

Here is the part of code that had accuracy issues when passing tensors with device cuda for inference:

``````def MLP(channels, do_bn=True):
# Build a per-location MLP as a stack of 1x1 Conv1d layers over the channel
# dimension; every layer except the last is followed by an (optional)
# BatchNorm1d and a ReLU.
# NOTE(review): indentation was stripped from this paste — bodies shown flat.
n = len(channels)
layers = []
for i in range(1, n):
layers.append(
nn.Conv1d(channels[i - 1], channels[i], kernel_size=1, bias=True))
# No norm/activation after the final layer.
if i < (n-1):
if do_bn:
layers.append(nn.BatchNorm1d(channels[i]))
layers.append(nn.ReLU())
return nn.Sequential(*layers)

class FilterPredictor(nn.Module):
# ToMP-style filter predictor: runs train/test features through a transformer
# to produce classification and bbox-regression filters plus encoded features.
# NOTE(review): this paste lost all indentation and appears to have dropped
# some lines — see the NOTE(review) markers below.
def __init__(self, transformer, feature_sz, use_test_frame_encoding=True):
# transformer: module exposing a .d_model attribute (embedding width).
# feature_sz: spatial feature size, forwarded to the positional encoding.
# use_test_frame_encoding: if True, add a learned test-frame token embedding.
print("feature_sz filter")
print(feature_sz)
super().__init__()
self.transformer = transformer
self.feature_sz = feature_sz
self.use_test_frame_encoding = use_test_frame_encoding

# MLP that encodes the 4-channel LTRB box map into d_model channels.
self.box_encoding = MLP([4, self.transformer.d_model//4, self.transformer.d_model, self.transformer.d_model])

# Learned foreground token; multiplied with the train label map in
# predict_cls_bbreg_filters_parallel.
self.query_embed_fg = nn.Embedding(1, self.transformer.d_model)

if self.use_test_frame_encoding:
self.query_embed_test = nn.Embedding(1, self.transformer.d_model)

# Decoder query shares its weights with the foreground token.
self.query_embed_fg_decoder = self.query_embed_fg

self.pos_encoding = PositionEmbeddingSine(num_pos_feats=self.transformer.d_model//2, sine_type='lin_sine',
avoid_aliazing=True, max_spatial_resolution=feature_sz)

def forward(self, train_feat, test_feat, num_gth_frames, train_label, train_ltrb_target):
# NOTE(review): num_gth_frames is ignored; a hard-coded 1 is passed instead.
return self.predict_cls_bbreg_filters_parallel(train_feat, test_feat, train_label,1, train_ltrb_target)

def get_positional_encoding(self, feat):
# Build a sine positional encoding matching feat's (frames, seqs, H, W) layout.
nframes, nseq, _, h, w = feat.shape
mask = torch.zeros((nframes * nseq, h, w), dtype=torch.bool, device=feat.device)
tt = torch.tensor((), dtype=torch.bool, device=feat.device)
# NOTE(review): this second assignment overwrites the identical mask above —
# only one of the two constructions is needed.
mask = tt.new_zeros((nframes * nseq, h, w))

# NOTE(review): 'pos' is never assigned in this paste — a line such as
# 'pos = self.pos_encoding(mask)' appears to have been lost here.
return pos.reshape(nframes, nseq, -1, h, w)

def predict_cls_bbreg_filters_parallel(self, train_feat, test_feat, train_label, num_gth_frames, train_ltrb_target):
# train_label size guess: Nf_tr, Ns, H, W.
# Normalize 4-D inputs to 5-D (frames, seqs, C, H, W).
if train_feat.dim() == 4:
train_feat = train_feat.unsqueeze(1)
if test_feat.dim() == 4:
test_feat = test_feat.unsqueeze(1)
if train_ltrb_target.dim() == 4:
train_ltrb_target = train_ltrb_target.unsqueeze(1)

h, w = test_feat.shape[-2:]
H, W = train_feat.shape[-2:]

# Duplicate everything along the sequence dim so the cls and bbreg filters
# can be predicted in a single parallel transformer pass (presumably — the
# two copies are split back apart at the end of this method).
train_feat_stack = torch.cat([train_feat, train_feat], dim=1)
test_feat_stack = torch.cat([test_feat, test_feat], dim=1)
train_label_stack = torch.cat([train_label, train_label], dim=1)
train_ltrb_target_stack = torch.cat([train_ltrb_target, train_ltrb_target], dim=1)

test_pos = self.get_positional_encoding(test_feat)  # Nf_te, Ns, C, H, W #ok
train_pos = self.get_positional_encoding(train_feat)  # Nf_tr, Ns, C, H, W #ok

# Flatten frame/spatial dims into transformer sequence-first order.
test_feat_seq = test_feat_stack.permute(1, 2, 0, 3, 4).flatten(2).permute(2, 0, 1)  # Nf_te*H*W, Ns, C
train_feat_seq = train_feat_stack.permute(1, 2, 0, 3, 4).flatten(2).permute(2, 0, 1)  # Nf_tr*H*W, Ns, C
train_label_seq = train_label_stack.permute(1, 0, 2, 3).flatten(1).permute(1, 0).unsqueeze(2)  # Nf_tr*H*W,Ns,1
train_ltrb_target_seq_T = train_ltrb_target_stack.permute(1, 2, 0, 3, 4).flatten(2)  # Ns,4,Nf_tr*H*W

test_pos = test_pos.permute(1, 2, 0, 3, 4).flatten(2).permute(2, 0, 1)
train_pos = train_pos.permute(1, 2, 0, 3, 4).flatten(2).permute(2, 0, 1)

# Foreground embedding scaled by the (flattened) train label map.
fg_token = self.query_embed_fg.weight.reshape(1, 1, -1)
train_label_enc = fg_token * train_label_seq

train_ltrb_target_enc = self.box_encoding(train_ltrb_target_seq_T)

train_ltrb_target_enc = train_ltrb_target_enc.permute(2, 0, 1)  # Nf_tr*H*H,Ns,C

if self.use_test_frame_encoding:
# Broadcast the learned test-frame token across every test location.
test_token = self.query_embed_test.weight.reshape(1, 1, -1)
test_label_enc = torch.ones_like(test_feat_seq) * test_token # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
feat = torch.cat([train_feat_seq + train_label_enc + train_ltrb_target_enc, test_feat_seq + test_label_enc], dim=0)
else:
feat = torch.cat([train_feat_seq + train_label_enc + train_ltrb_target_enc, test_feat_seq], dim=0)

pos = torch.cat([train_pos, test_pos], dim=0)

# NOTE(review): the transformer invocation is truncated in this paste — a
# call like 'output_embed, enc_mem = self.transformer(feat, mask=...,'
# producing 'output_embed' and 'enc_mem' (used below) is missing; only its
# trailing keyword arguments survived on the next two lines.
query_embed=self.query_embed_fg_decoder.weight,
pos_embed=pos)

# Take the last h*w encoder tokens (the test frame) back to feature layout.
enc_opt = enc_mem[-h * w:].transpose(0, 1).permute(0, 2, 1)
enc_opt = enc_opt.reshape(test_feat_stack.shape)
dec_opt = output_embed.squeeze(0).transpose(1, 2)
dec_opt = dec_opt.reshape(test_feat_stack.shape[1], -1, 1, 1)

# First duplicated copy -> classification branch, second -> bbox regression.
cls_enc_opt = enc_opt[:, 0].unsqueeze(1)
bbreg_enc_opt = enc_opt[:, 1].unsqueeze(1)
cls_dec_opt = dec_opt[0].unsqueeze(0)
bbreg_dec_opt = dec_opt[1].unsqueeze(0)

return cls_dec_opt, bbreg_dec_opt, cls_enc_opt, bbreg_enc_opt
``````

If additional code is needed — for example for the transformer or PositionEmbeddingSine — let me know.
I’m not sure if this is a bug; maybe we really cannot pass tensors that are on CUDA to Polygraphy’s inference.