Hello,
We are using the Thor chip to deploy some ONNX models, and we ran into an interesting issue:
we cannot use trtexec with FP16 precision to convert a single attention layer under the following conditions:
token dim >= 384, token length >= 720, and number of attention heads >= 4.
However, with an attention mask added to the attention matrix, the model can still be converted.
We use TensorRT version: 10.13.1, CUDA version is 12.8, Drive OS version is 7.0.3.0
We provide the following torch code to reproduce export model:
import torch
import torch.nn as nn
import torch.nn.functional as F
class Attention(nn.Module):
    """Multi-head attention core: QK^T -> (optional additive mask) -> softmax -> V.

    Tensors arrive channel-first as (B, dim, length, 1); the channel axis is
    split into `num_heads` heads of size `head_dim`, attention is applied per
    head, and the heads are folded back into a (B, dim, q_len, 1) output.
    Note there are no learned projections and no 1/sqrt(d) scaling — this is
    a deliberately minimal repro block.
    """

    def __init__(self, dim, num_heads=8):
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads

    def forward(self, q, k, v, mask=None):
        """Run attention.

        q:    (B, dim, q_len, 1) queries
        k, v: (B, dim, kv_len, 1) keys / values
        mask: optional additive mask broadcastable to (B, heads, q_len, kv_len)
        Returns: (B, dim, q_len, 1)
        """
        n_q = q.shape[2]
        n_kv = k.shape[2]

        def split_heads(t, length):
            # (B, dim, length, 1) -> (B, heads, length, head_dim)
            return t.reshape(-1, self.num_heads, self.head_dim, length).transpose(-1, -2)

        queries = split_heads(q, n_q)
        keys = split_heads(k, n_kv)
        values = split_heads(v, n_kv)

        scores = queries @ keys.transpose(-1, -2)  # (B, heads, n_q, n_kv)
        if mask is not None:
            # additive mask applied to the raw score matrix
            scores = scores + mask
        weights = torch.softmax(scores, dim=-1)    # (B, heads, n_q, n_kv)
        mixed = weights @ values                   # (B, heads, n_q, head_dim)
        # merge heads and restore the channel-first (B, dim, n_q, 1) layout
        return mixed.transpose(-1, -2).reshape(-1, self.dim, n_q, 1)
class SimpleAttention(nn.Module):
    """Thin module wrapping one Attention layer, so the exported graph is
    exactly a single attention block."""

    def __init__(self, dim, heads):
        super().__init__()
        self.atten = Attention(dim, heads)

    def forward(self, q, k, v, mask=None):
        """Delegate straight to the wrapped attention layer."""
        return self.atten(q, k, v, mask)
# Repro configuration — the smallest setting that triggers the FP16 failure
# (token dim >= 384, query length >= 720, heads >= 4).
dim = 384
head = 4
q_token_len = 720
kv_token_len = 960

torch_model = SimpleAttention(dim, head)

# Export the same module twice: once with an explicit additive-mask input,
# once without. Each entry pairs the example inputs with the output path.
export_cases = [
    (
        (
            torch.randn(1, dim, q_token_len, 1),
            torch.randn(1, dim, kv_token_len, 1),
            torch.randn(1, dim, kv_token_len, 1),
            torch.randn(1, head, q_token_len, kv_token_len),
        ),
        './atten_mask.onnx',    # this model CAN be converted to a TRT engine under fp16
    ),
    (
        (
            torch.randn(1, dim, q_token_len, 1),
            torch.randn(1, dim, kv_token_len, 1),
            torch.randn(1, dim, kv_token_len, 1),
        ),
        './atten_womask.onnx',  # this model can NOT be converted to a TRT engine under fp16
    ),
]

for example_inputs, onnx_path in export_cases:
    torch.onnx.export(torch_model,
                      example_inputs,
                      onnx_path,
                      verbose=False,
                      opset_version=19,
                      )
We use the following commands to convert the ONNX models to TensorRT engines:
trtexec --onnx=./atten_mask.onnx --saveEngine=./atten_mask.fp16.trt --fp16
trtexec --onnx=./atten_womask.onnx --saveEngine=./atten_womask.fp16.trt --fp16
Thanks for any help!