Conversion to TensorRT leads to terrible accuracy

I am trying to convert the inference part of the pytracking tomp101 algorithm to tensorrt

I’ve converted it to ONNX, and inference seems fine — the bounding box locks on correctly — ALTHOUGH comparing the output tensors of the original model and the ONNX model using the code below shows they differ by quite a lot (the tensor values differ, yet it still tracks objects just fine :D ):

{sample_x [dtype=float32, shape=(1, 1024, 18, 18)],
train_samples [dtype=float32, shape=(1, 1024, 18, 18)],
target_labels [dtype=float32, shape=(1, 1, 18, 18)],
train_ltrb [dtype=float32, shape=(1, 4, 18, 18)]}
[I] trt-runner-N0-01/18/24-04:45:38
---- Inference Output(s) ----
{bbreg_test_feat_enc [dtype=float32, shape=(1, 1, 256, 18, 18)],
bbreg_weights [dtype=float32, shape=(1, 256, 1, 1)],
target_scores [dtype=float32, shape=(1, 1, 18, 18)]}

r1 = original_model(inputs)
r2 = session.run(inputs)

avg11=avg11+(torch.mean(torch.abs(r1[0] - torch.from_numpy(r2[0]).cuda())))
avg12=avg12+(torch.mean(torch.abs(r1[1] - torch.from_numpy(r2[1]).cuda())))
avg13=avg13+(torch.mean(torch.abs(r1[2] - torch.from_numpy(r2[2]).cuda())))

print(avg11/30)
print(avg12/30)
print(avg13/30)

BUT

when the model is converted to TensorRT the accuracy drops!!! inference is terrible.

Does anybody have any suggestions on how to improve it? Should I maybe modify the ONNX model with graph surgeon? Is there some Polygraphy tool that I could use?

Maybe there is a trtexec method of converting that preserves accuracy?

THANK YOU

Environment

TensorRT Version: 8.6

NVIDIA GPU: GTX 1660 Ti

NVIDIA Driver Version: 546.01

CUDA Version: 12.1

CUDNN Version: 8.9.7

Operating System:

Python Version (if applicable): 3.10.13

PyTorch Version (if applicable): 2.1.2+cu121

Baremetal or Container (if so, version): no environment

Relevant Files

Model link: modified_latest3_sanitized2.onnx - Google Drive

Hi @ttomukas740 ,
Can you please help us with repro inference script to analyze the issue.

Thanks

Hello, I apologize to you for waiting so long for an answer, I honestly didn’t see your answer.
I am using Polygraphy for inference, and my fix was to move the tensors to the CPU before running inference with Polygraphy. When passing tensors on a CUDA device to Polygraphy inference, we can sometimes get accuracy issues.

Here is the part of code that had accuracy issues when passing tensors with device cuda for inference:

def MLP(channels, do_bn=True):
    """Build a pointwise (kernel_size=1) Conv1d MLP over channel dims.

    Args:
        channels: sequence of channel sizes; one Conv1d is created per
            consecutive pair.
        do_bn: if True, insert BatchNorm1d after every conv except the last.

    Returns:
        nn.Sequential of Conv1d [+ BatchNorm1d] [+ ReLU] stages; the final
        conv is left without normalization or activation.
    """
    last_stage = len(channels) - 1
    modules = []
    # Pair up (in_channels, out_channels) for each stage.
    for stage, (c_in, c_out) in enumerate(zip(channels[:-1], channels[1:]), start=1):
        modules.append(nn.Conv1d(c_in, c_out, kernel_size=1, bias=True))
        if stage < last_stage:
            if do_bn:
                modules.append(nn.BatchNorm1d(c_out))
            modules.append(nn.ReLU())
    return nn.Sequential(*modules)

class FilterPredictor(nn.Module):
    """Transformer-based predictor of classification and bbox-regression filters.

    Encodes train-frame features (with label and LTRB-target encodings) together
    with test-frame features, runs them through a transformer, and returns the
    decoder/encoder outputs split into classification and bbox-regression parts.

    Args:
        transformer: transformer module exposing ``d_model`` and a
            ``__call__(feat, mask=..., query_embed=..., pos_embed=...)``
            returning ``(output_embed, enc_mem)``.
        feature_sz: spatial feature resolution, forwarded to the positional
            encoding as its max spatial resolution.
        use_test_frame_encoding: if True, add a learned token to every
            test-frame feature vector.
    """

    def __init__(self, transformer, feature_sz, use_test_frame_encoding=True):
        # Fix: removed leftover debug prints that ran before super().__init__().
        super().__init__()
        self.transformer = transformer
        self.feature_sz = feature_sz
        self.use_test_frame_encoding = use_test_frame_encoding

        # MLP lifting the 4-channel LTRB target map to d_model channels.
        self.box_encoding = MLP([4, self.transformer.d_model//4, self.transformer.d_model, self.transformer.d_model])

        # Learned foreground token, multiplied into the train-label encoding.
        self.query_embed_fg = nn.Embedding(1, self.transformer.d_model)

        if self.use_test_frame_encoding:
            # Learned token added to every test-frame feature vector.
            self.query_embed_test = nn.Embedding(1, self.transformer.d_model)

        # The decoder query reuses the foreground embedding.
        self.query_embed_fg_decoder = self.query_embed_fg

        self.pos_encoding = PositionEmbeddingSine(num_pos_feats=self.transformer.d_model//2, sine_type='lin_sine',
                                                  avoid_aliazing=True, max_spatial_resolution=feature_sz)

    def forward(self, train_feat, test_feat, num_gth_frames, train_label, train_ltrb_target):
        """Delegate to the parallel cls/bbreg filter prediction (num_gth_frames fixed to 1)."""
        return self.predict_cls_bbreg_filters_parallel(train_feat, test_feat, train_label, 1, train_ltrb_target)

    def get_positional_encoding(self, feat):
        """Return sine positional encodings shaped like feat's spatial layout.

        Args:
            feat: tensor of shape (nframes, nseq, C, h, w).

        Returns:
            Positional encoding reshaped to (nframes, nseq, -1, h, w).
        """
        nframes, nseq, _, h, w = feat.shape
        # Fix: the mask was previously built twice (torch.zeros immediately
        # overwritten by tt.new_zeros); a single all-False bool mask suffices.
        mask = torch.zeros((nframes * nseq, h, w), dtype=torch.bool, device=feat.device)
        pos = self.pos_encoding(mask)
        return pos.reshape(nframes, nseq, -1, h, w)

    def predict_cls_bbreg_filters_parallel(self, train_feat, test_feat, train_label, num_gth_frames, train_ltrb_target):
        """Predict cls/bbreg filters and encoded test features in one transformer pass.

        The inputs are duplicated along the sequence dim (dim=1) so that one
        sequence produces the classification branch and the other the
        bbox-regression branch.

        Args:
            train_feat: train features; (Nf_tr, Ns, C, H, W) or (Nf_tr, C, H, W).
            test_feat: test features; (Nf_te, Ns, C, h, w) or (Nf_te, C, h, w).
            train_label: train-frame labels — assumed (Nf_tr, Ns, H, W); TODO confirm.
            num_gth_frames: number of ground-truth train frames; frames beyond
                this count are masked out for the bbreg sequence.
            train_ltrb_target: LTRB regression targets; (Nf_tr, Ns, 4, H, W)
                or (Nf_tr, 4, H, W).

        Returns:
            Tuple (cls_dec_opt, bbreg_dec_opt, cls_enc_opt, bbreg_enc_opt).
        """
        # Normalize 4-D inputs to 5-D by adding a sequence dim.
        if train_feat.dim() == 4:
            train_feat = train_feat.unsqueeze(1)
        if test_feat.dim() == 4:
            test_feat = test_feat.unsqueeze(1)
        if train_ltrb_target.dim() == 4:
            train_ltrb_target = train_ltrb_target.unsqueeze(1)

        h, w = test_feat.shape[-2:]
        H, W = train_feat.shape[-2:]

        # Duplicate along dim=1: sequence 0 -> classification, sequence 1 -> bbreg.
        train_feat_stack = torch.cat([train_feat, train_feat], dim=1)
        test_feat_stack = torch.cat([test_feat, test_feat], dim=1)
        train_label_stack = torch.cat([train_label, train_label], dim=1)
        train_ltrb_target_stack = torch.cat([train_ltrb_target, train_ltrb_target], dim=1)

        test_pos = self.get_positional_encoding(test_feat)  # Nf_te, Ns, C, H, W
        train_pos = self.get_positional_encoding(train_feat)  # Nf_tr, Ns, C, H, W

        # Flatten spatial/frame dims into a token sequence: (tokens, Ns, C).
        test_feat_seq = test_feat_stack.permute(1, 2, 0, 3, 4).flatten(2).permute(2, 0, 1)  # Nf_te*H*W, Ns, C
        train_feat_seq = train_feat_stack.permute(1, 2, 0, 3, 4).flatten(2).permute(2, 0, 1)  # Nf_tr*H*W, Ns, C
        train_label_seq = train_label_stack.permute(1, 0, 2, 3).flatten(1).permute(1, 0).unsqueeze(2)  # Nf_tr*H*W,Ns,1
        train_ltrb_target_seq_T = train_ltrb_target_stack.permute(1, 2, 0, 3, 4).flatten(2)  # Ns,4,Nf_tr*H*W

        test_pos = test_pos.permute(1, 2, 0, 3, 4).flatten(2).permute(2, 0, 1)
        train_pos = train_pos.permute(1, 2, 0, 3, 4).flatten(2).permute(2, 0, 1)

        # Scale the foreground token by the (soft) train labels.
        fg_token = self.query_embed_fg.weight.reshape(1, 1, -1)
        train_label_enc = fg_token * train_label_seq

        # Encode LTRB targets to d_model channels, then to (tokens, Ns, C).
        train_ltrb_target_enc = self.box_encoding(train_ltrb_target_seq_T)
        train_ltrb_target_enc = train_ltrb_target_enc.permute(2, 0, 1)  # Nf_tr*H*W,Ns,C

        if self.use_test_frame_encoding:
            # Broadcast the learned test token over every test-frame feature.
            test_token = self.query_embed_test.weight.reshape(1, 1, -1)
            test_label_enc = torch.ones_like(test_feat_seq) * test_token
            feat = torch.cat([train_feat_seq + train_label_enc + train_ltrb_target_enc, test_feat_seq + test_label_enc], dim=0)
        else:
            feat = torch.cat([train_feat_seq + train_label_enc + train_ltrb_target_enc, test_feat_seq], dim=0)

        pos = torch.cat([train_pos, test_pos], dim=0)

        # Fix: build the padding mask as bool on the right device directly
        # (was: float zeros on CPU, cast with .bool() twice, then .to(device)).
        # For the bbreg sequence (index 1), mask out train tokens beyond the
        # ground-truth frames while keeping the trailing test tokens visible.
        src_key_padding_mask = torch.zeros(feat.shape[1], feat.shape[0], dtype=torch.bool, device=feat.device)
        src_key_padding_mask[1, num_gth_frames*H*W:-h*w] = True

        output_embed, enc_mem = self.transformer(feat, mask=src_key_padding_mask,
                                             query_embed=self.query_embed_fg_decoder.weight,
                                             pos_embed=pos)

        # Take the encoder memory for the test tokens and restore spatial layout.
        enc_opt = enc_mem[-h * w:].transpose(0, 1).permute(0, 2, 1)
        enc_opt = enc_opt.reshape(test_feat_stack.shape)
        # Decoder output -> per-sequence filter weights of shape (Ns, C, 1, 1).
        dec_opt = output_embed.squeeze(0).transpose(1, 2)
        dec_opt = dec_opt.reshape(test_feat_stack.shape[1], -1, 1, 1)

        # Split the duplicated sequences back into cls (index 0) and bbreg (index 1).
        cls_enc_opt = enc_opt[:, 0].unsqueeze(1)
        bbreg_enc_opt = enc_opt[:, 1].unsqueeze(1)
        cls_dec_opt = dec_opt[0].unsqueeze(0)
        bbreg_dec_opt = dec_opt[1].unsqueeze(0)

        return cls_dec_opt, bbreg_dec_opt, cls_enc_opt, bbreg_enc_opt

If additional code is needed, for example for the transformer or PositionEmbeddingSine, let me know.
I’m not sure if this is a bug; maybe we really cannot pass tensors that are on CUDA to Polygraphy’s inference.