Hi, @RicardoLu. I also hit this problem. My TRT model inference returns wrong output. It differs from the raw model and the ONNX model (the ONNX model works well).
My environment:
- Docker image:
nvcr.io/nvidia/deepstream:6.3-triton-multiarch
- TensorRT: 10.3.0
- CUDA: 12.6
- Python: 3.10.12
- Polygraphy: 0.49.24
- Transformers: 4.53.1
Here are my steps:
1. I export ONNX model using optimum-cli
# Export the HF checkpoint to ONNX with a feature-extraction head (opset 19);
# the trailing path is the output directory for the exported model.
optimum-cli export onnx --model Qwen/Qwen3-Embedding-0.6B --task feature-extraction --opset 19
models/qwen3_embedding_0.6b_onnx
2. Use trtexec to build trt engine:
# Build 1: dynamic batch (min 1 / opt 4 / max 8) at a fixed sequence length
# of 1024, with FP16 kernels enabled.
# NOTE(review): --fp16 is a plausible cause of the output divergence vs the
# ONNX/FP32 model — rebuild without it (pure FP32) to confirm.
trtexec --onnx=models/qwen3_embedding_0.6b_onnx/model.onnx \
--minShapes=input_ids:1x1024,attention_mask:1x1024,position_ids:1x1024 \
--optShapes=input_ids:4x1024,attention_mask:4x1024,position_ids:4x1024 \
--maxShapes=input_ids:8x1024,attention_mask:8x1024,position_ids:8x1024 \
--fp16 \
--saveEngine=qwen3_embedding_0.6b.engine
# Build 2: same shapes, but --best lets TensorRT choose the fastest
# precision per layer (FP16/INT8 where supported) — even less precise,
# written into the Triton model_repository layout.
trtexec --onnx=models/qwen3_embedding_0.6b_onnx/model.onnx \
--minShapes=input_ids:1x1024,attention_mask:1x1024,position_ids:1x1024 \
--optShapes=input_ids:4x1024,attention_mask:4x1024,position_ids:4x1024 \
--maxShapes=input_ids:8x1024,attention_mask:8x1024,position_ids:8x1024 \
--best \
--saveEngine=model_repository/qwen3_embedding_0.6b/1/qwen3_embedding_0.6b.engine
3. Use a Python script to check the TRT model:
from transformers import AutoTokenizer, AutoModel
from polygraphy.backend.trt import EngineFromBytes, TrtRunner
import time
import numpy as np
import torch
import sys
import os
def run_tensorrt_polygraphy_model(texts, engine_path):
    """Tokenize *texts*, run them through a serialized TensorRT engine via
    Polygraphy, pool and normalize the embeddings, and print diagnostics.

    Returns a dict with embeddings, raw outputs, inputs, similarity scores,
    timing, and output statistics — or None if inference fails.
    """
    print("\n" + "=" * 50)
    print("RUNNING TENSORRT MODEL ")
    print("=" * 50)
    # Tokenizer must match the checkpoint the engine was exported from.
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Embedding-0.6B")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    try:
        # Deserialize the pre-built engine from disk.
        with open(engine_path, 'rb') as engine_file:
            serialized_engine = engine_file.read()
        engine = EngineFromBytes(serialized_engine)
        print(f"TensorRT engine loaded from {engine_path}")
        with TrtRunner(engine) as runner:
            # Pad/truncate every text to the fixed 1024-token window the
            # engine's optimization profiles were built for.
            t0 = time.time()
            inputs = tokenizer(
                texts,
                padding='max_length',
                max_length=1024,
                truncation=True,
                return_tensors="np"
            )
            tokenization_time = time.time() - t0
            print(f"Tokenization time: {tokenization_time:.4f}s")
            print("Input shapes:")
            for key, value in inputs.items():
                print(f" {key}: {value.shape}")
            # Engine bindings expect int64 tensors.
            input_ids = inputs['input_ids'].astype(np.int64)
            attention_mask = inputs['attention_mask'].astype(np.int64)
            # One 0..1023 position row per text.
            # NOTE(review): assumes right-padding — confirm the tokenizer's
            # padding side, otherwise positions are wrong for padded rows.
            position_ids = np.tile(np.arange(1024, dtype=np.int64), (len(texts), 1))
            print(f" position_ids: {position_ids.shape}")
            feed_dict = {
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'position_ids': position_ids
            }
            t1 = time.time()
            outputs = runner.infer(feed_dict)
            inference_time = time.time() - t1
            print(f"Inference time: {inference_time:.4f}s")
            output_names = list(outputs.keys())
            print(f"Available outputs: {output_names}")
            # Prefer a binding named like "last_hidden_state"; a sole
            # output is taken unconditionally.
            chosen = next(
                (name for name in output_names
                 if 'last_hidden_state' in name.lower() or len(output_names) == 1),
                None
            )
            if chosen is not None:
                last_hidden_state = outputs[chosen]
            else:
                # No clear match — fall back to the first output.
                last_hidden_state = outputs[output_names[0]]
                print(f"Using output: {output_names[0]}")
            print(f"TensorRT output shape: {last_hidden_state.shape}")
            print(f"TensorRT output range: [{last_hidden_state.min():.6f}, {last_hidden_state.max():.6f}]")
            print(f"TensorRT output mean: {last_hidden_state.mean():.6f}")
            print(f"TensorRT output std: {last_hidden_state.std():.6f}")
            # Mask-aware pooling + similarity via helpers defined elsewhere
            # in the script.
            embeddings = np.stack(pool_embeddings(last_hidden_state, attention_mask))
            print(f"Embeddings shape after pooling: {embeddings.shape}")
            similarity_scores, normalized_embeddings = calculate_similarity(embeddings)
            norms = np.linalg.norm(normalized_embeddings, axis=1)
            print(f"Embedding norms: {norms.tolist()}")
            print(f"Similarity scores:")
            print(f" TensorRT scores: {similarity_scores}")
            return {
                'embeddings': normalized_embeddings,
                'raw_outputs': last_hidden_state,
                'inputs': dict(inputs),
                'similarity_scores': similarity_scores,
                'inference_time': inference_time,
                'output_stats': {
                    'min': float(last_hidden_state.min()),
                    'max': float(last_hidden_state.max()),
                    'mean': float(last_hidden_state.mean()),
                    'std': float(last_hidden_state.std())
                }
            }
    except Exception as e:
        # Best-effort diagnostic script: report the failure and return None
        # instead of crashing.
        print(f"TensorRT Polygraphy model failed: {e}")
        import traceback
        traceback.print_exc()
        return None
def main():
    """Run the TensorRT embedding sanity check on a fixed set of sentences."""
    sample_texts = [
        "What is the capital of China?",
        "The capital of China is Beijing.",
        "Gravity is a force that attracts two bodies toward each other."
    ]
    # NOTE(review): this absolute path differs from the model_repository/
    # path the engine was saved to in step 2 — confirm it is correct.
    engine_location = "/qwen3_embedding_0.6b/1/qwen3_embedding_0.6b.engine"
    run_tensorrt_polygraphy_model(sample_texts, engine_location)


if __name__ == "__main__":
    main()