Could not find any implementation for node failure of TensorRT 8.5 when running on GPU Jetson Xavier NX

f.requiem11 · December 6, 2024, 12:08am

Description

When I try to convert the SuperPoint model from ONNX to a TensorRT engine using trtexec, I get the following error:

[optimizer.cpp::computeCosts::3728] Error Code 10: Internal Error (Could not find any implementation for node {ForeignNode[/Flatten...(Unnamed Layer* 139) [Shuffle]]}

The conversion works in TensorRT 8.6, but our target device is a Jetson Xavier NX, and the latest JetPack release supported on the Xavier NX ships TensorRT 8.5, so upgrading TensorRT is not an option for now.
I also tried simplifying the ONNX model, but that did not help either.

Environment

TensorRT Version: 8.5

NVIDIA GPU: Jetson Xavier NX

CUDA Version: 11.4

Operating System: Jetpack 5.1.4

Python Version (if applicable): 3.8

This is model’s code:

import torch
from torch import nn

class MaxPool(nn.Module):
    """Stride-1 max pooling over a square (2 * nms_radius + 1) window.

    The input gets a singleton dimension inserted at index 1 before pooling,
    and that dimension is squeezed away again afterwards, so the result has
    the same number of dimensions as the input.
    """

    def __init__(self, nms_radius: int):
        super().__init__()
        window = 2 * nms_radius + 1
        # Stride 1 with padding == radius leaves the pooled size unchanged.
        self.block = nn.MaxPool2d(
            kernel_size=window, stride=1, padding=nms_radius)

    def forward(self, x):
        pooled = self.block(torch.unsqueeze(x, 1))
        return pooled.squeeze(dim=1)


def simple_nms(scores, nms_radius: int):
    """Zero out every score that is not selected as a local maximum.

    A score is selected when it equals the maximum of its
    (2 * nms_radius + 1) window. Two further passes then select maxima among
    the scores lying outside the windows of already-selected ones. The
    result has the same shape as `scores`, with unselected entries set to
    zero.
    """
    assert (nms_radius >= 0)

    # Window maximum is delegated to the MaxPool module.
    pool = MaxPool(nms_radius)
    blank = torch.zeros_like(scores)

    keep = scores == pool(scores)
    for _ in range(2):
        # Anything inside the window of a kept maximum is suppressed.
        suppressed = pool(keep.float()) > 0
        remaining = torch.where(suppressed, blank, scores)
        # Maxima of what is left, restricted to the unsuppressed area.
        fresh = (remaining == pool(remaining)) & (~suppressed)
        keep = keep | fresh
    return torch.where(keep, scores, blank)


def remove_borders(keypoints, scores, border: int, height: int, width: int):
    """Drop keypoints that lie within `border` of the image edge.

    Column 0 of `keypoints` is checked against `height` and column 1 against
    `width`. Returns the surviving keypoints together with their scores.
    """
    rows = keypoints[:, 0]
    cols = keypoints[:, 1]
    inside_rows = (rows >= border) & (rows < (height - border))
    inside_cols = (cols >= border) & (cols < (width - border))
    keep = inside_rows & inside_cols
    return keypoints[keep], scores[keep]


def top_k_keypoints(keypoints, scores, k: int):
    """Keep the `k` highest-scoring keypoints.

    When there are no more than `k` keypoints the inputs are returned as-is.
    Otherwise the selected keypoints and scores are returned in the order
    produced by `torch.topk`.
    """
    if len(keypoints) <= k:
        return keypoints, scores
    best_scores, best_idx = torch.topk(scores, k, dim=0)
    return keypoints[best_idx], best_scores


def sample_descriptors(keypoints, descriptors, s: int = 8):
    """Interpolate descriptors at keypoint locations.

    Args:
        keypoints: tensor that can be viewed as (b, 1, -1, 2); the last
            dimension is (x, y), scaled by the width and height terms
            respectively.
        descriptors: dense descriptor map of shape (b, c, h, w).
        s: scale factor between keypoint coordinates and the descriptor
            grid.

    Returns:
        Tensor of shape (b, n, c) holding one L2-normalised descriptor per
        keypoint.
    """
    b, c, h, w = descriptors.shape
    keypoints = keypoints - s / 2 + 0.5
    keypoints /= torch.tensor([(w * s - s / 2 - 0.5), (h * s - s / 2 - 0.5)],
                              ).to(keypoints)[None]
    keypoints = keypoints * 2 - 1  # normalize to (-1, 1)

    # `align_corners` exists in grid_sample from torch 1.3 onwards. Compare
    # the parsed (major, minor) pair: looking at a single character of the
    # version string misreads releases such as "1.10.0" or "2.0.1" and
    # silently drops the argument.
    try:
        major, minor = (int(part) for part in torch.__version__.split('.')[:2])
        has_align_corners = (major, minor) >= (1, 3)
    except ValueError:
        # Unparseable version string: assume a modern build.
        has_align_corners = True
    args = {'align_corners': True} if has_align_corners else {}

    descriptors = torch.nn.functional.grid_sample(
        descriptors, keypoints.view(b, 1, -1, 2), mode='bilinear', **args)
    descriptors = torch.nn.functional.normalize(
        descriptors.reshape(b, c, -1), p=2, dim=1)
    descriptors = descriptors.permute(0, 2, 1)
    return descriptors


# Settings read by SuperPoint.
default_config = dict(
    descriptor_dim=256,          # output channels of the descriptor head
    nms_radius=4,                # radius handed to simple_nms
    keypoint_threshold=0.0005,   # a score must exceed this to be a keypoint
    max_keypoints=1024,          # top-k cap; skipped when negative
    remove_borders=4,            # border width handed to remove_borders
)


class SuperPoint(nn.Module):
    """SuperPoint Convolutional Detector and Descriptor

    SuperPoint: Self-Supervised Interest Point Detection and
    Description. Daniel DeTone, Tomasz Malisiewicz, and Andrew
    Rabinovich. In CVPRW, 2019. https://arxiv.org/abs/1712.07629

    All settings are read from the module-level ``default_config``. The
    constructor does not load any weights.
    """

    def __init__(self):
        super().__init__()

        def conv3x3(c_in, c_out):
            return nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1)

        def conv1x1(c_in, c_out):
            return nn.Conv2d(c_in, c_out, kernel_size=1, stride=1, padding=0)

        self.relu = nn.ReLU(inplace=True)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        c1, c2, c3, c4, c5 = 64, 64, 128, 128, 256

        # Shared encoder: four pairs of 3x3 convolutions.
        self.conv1a = conv3x3(1, c1)
        self.conv1b = conv3x3(c1, c1)
        self.conv2a = conv3x3(c1, c2)
        self.conv2b = conv3x3(c2, c2)
        self.conv3a = conv3x3(c2, c3)
        self.conv3b = conv3x3(c3, c3)
        self.conv4a = conv3x3(c3, c4)
        self.conv4b = conv3x3(c4, c4)

        # Keypoint head: 65 output channels; forward() drops the last one
        # after the softmax and unfolds the other 64 into 8x8 cells.
        self.convPa = conv3x3(c4, c5)
        self.convPb = conv1x1(c5, 65)

        # Descriptor head.
        self.convDa = conv3x3(c4, c5)
        self.convDb = conv1x1(c5, default_config['descriptor_dim'])

    def forward(self, data):
        """Compute keypoints, scores, descriptors for image

        Returns a dict with the keys 'keypoints', 'scores' and
        'descriptors', each holding one entry per batch element.
        """
        cfg = default_config

        # Shared Encoder: 2x2 pooling sits between consecutive conv pairs.
        encoder = (
            (self.conv1a, self.conv1b),
            (self.conv2a, self.conv2b),
            (self.conv3a, self.conv3b),
            (self.conv4a, self.conv4b),
        )
        x = data
        for stage, (conv_a, conv_b) in enumerate(encoder):
            if stage > 0:
                x = self.pool(x)
            x = self.relu(conv_a(x))
            x = self.relu(conv_b(x))

        # Dense keypoint scores: softmax over channels, drop the last
        # channel, then unfold each 64-vector into an 8x8 patch.
        logits = self.convPb(self.relu(self.convPa(x)))
        probs = torch.nn.functional.softmax(logits, 1)[:, :-1]
        b, _, h, w = probs.shape
        cells = probs.permute(0, 2, 3, 1).reshape(b, h, w, 8, 8)
        heatmap = cells.permute(0, 1, 3, 2, 4).reshape(b, h * 8, w * 8)
        heatmap = simple_nms(heatmap, cfg['nms_radius'])

        # Extract keypoints: positions whose score exceeds the threshold,
        # and the scores found at those positions.
        keypoints = [
            torch.nonzero(score_map > cfg['keypoint_threshold'])
            for score_map in heatmap]
        scores = [
            score_map[tuple(kpts.t())]
            for score_map, kpts in zip(heatmap, keypoints)]

        # Discard keypoints near the image borders
        keypoints, scores = zip(*(
            remove_borders(kpts, vals, cfg['remove_borders'], h * 8, w * 8)
            for kpts, vals in zip(keypoints, scores)))

        # Keep the k keypoints with highest score
        if cfg['max_keypoints'] >= 0:
            keypoints, scores = zip(*(
                top_k_keypoints(kpts, vals, cfg['max_keypoints'])
                for kpts, vals in zip(keypoints, scores)))

        # Convert (h, w) to (x, y)
        keypoints = [torch.flip(kpts, [1]).float() for kpts in keypoints]

        # Dense descriptors, L2-normalised over the channel dimension.
        dense = self.convDb(self.relu(self.convDa(x)))
        dense = torch.nn.functional.normalize(dense, p=2, dim=1)

        # Sample the dense map at each image's keypoints.
        descriptors = [
            sample_descriptors(kpts[None], desc_map[None], 8)[0]
            for kpts, desc_map in zip(keypoints, dense)]

        return {
            'keypoints': keypoints,
            'scores': scores,
            'descriptors': descriptors,
        }

I believe the error is caused by this line:
scores = [s[tuple(k.t())] for s, k in zip(scores, keypoints)]

Topic		Replies	Views
Could not find any implementation for node TensorRT cudnn	21	164	December 6, 2024
Tensorrt Inference in Real time Jetson Nano tensorrt , jetson-inference , gstreamer , python	8	1685	April 12, 2023
Error: Could not find any implementation for node {ForeignNode TensorRT	1	2371	February 24, 2023
I do not get any performance improvement after using TensorRT provider for object detection model Jetson Nano tensorrt , onnx	7	1382	July 12, 2022
(Could not find any implementation for node {ForeignNode[Transpose_2713 + (Unnamed Layer* 4032) [Shuffle]...MatMul_2714]}.) TensorRT	7	3144	January 12, 2023
Some PyTorch model with slicing operation fails on inference TensorRT tensorrt , pytorch , onnx , deepstream	2	1418	January 7, 2022
ERORR with ONNX2TRT : Unknown embedded device detected Jetson Xavier NX onnx	18	4517	April 27, 2022
TensorRT model always return NaN output TensorRT	2	247	June 20, 2024
Inference result gets worse when converting pytorch model to TensorRT model TensorRT pytorch	6	1079	January 19, 2022
Unable to build tensorrt engine with DLA enabled on Jetson Xavier NX Jetson Xavier NX tensorrt , cudnn	7	287	May 15, 2024

Could not find any implementation for node failure of TensorRT 8.5 when running on GPU Jetson Xavier NX

Description

Environment

Related topics