TensorRT inference of open_clip ViT-L-14 is slower than the PyTorch model

Description

import torch
import torch.nn as nn
import open_clip
from PIL import Image
# Thin wrapper so that only the visual tower (encode_image) is traced during ONNX export
class FCN_Clip_Vision(nn.Module):
    def __init__(self, model_name="ViT-L-14", pretrained='laion2b_s32b_b82k'):
        super().__init__()
        self.device = 'cpu'
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(model_name, pretrained, device=self.device)
        self.model.eval()

    def forward(self, image_batch):
        return self.model.encode_image(image_batch)
class ClipOnnxModel():
    def __init__(self, model_name="ViT-L-14", pretrained='laion2b_s32b_b82k') -> None:
        self.device = "cpu"
        self.model_name = model_name
        self.pretrained = pretrained
        self.output_visual_onnx = "ViT_L_14_visual.onnx"
        self.output_textual_onnx = "ViT_L_14_textual.onnx"
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(model_name, pretrained, device=self.device)
        self.tokenizer = open_clip.get_tokenizer(model_name)
        self.model.eval()

    def to_onnx(self, image_paths):
        # build a real example input from preprocessed images; its shape is baked into the exported graph
        images = [self.preprocess(Image.open(image_path)) for image_path in image_paths]
        stack_images = torch.stack(images)
        self.vision_model = FCN_Clip_Vision(self.model_name, self.pretrained)
        torch.onnx.export(
            self.vision_model, stack_images, self.output_visual_onnx,
            opset_version=14, export_params=True, do_constant_folding=True,
            input_names=["input"], output_names=["output"],
            # empty dynamic_axes: every dimension (including the batch) stays fixed at export time
            dynamic_axes={'input': {}, 'output': {}}, verbose=False
        )

clip_onnx_model = ClipOnnxModel()
clip_onnx_model.to_onnx(image_paths=image_paths)  # image_paths: list of sample image files (not shown in the post)
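
Before converting, it may be worth a quick sanity check that the exported graph loads and runs in onnxruntime. A minimal sketch (the batch size of 20 is an assumption; because dynamic_axes was left empty, the input shape is fixed to whatever batch was used during export):

import numpy as np
import onnxruntime as ort

# note: the batch dimension must match the export batch, since no dynamic axes were declared
session = ort.InferenceSession("ViT_L_14_visual.onnx", providers=["CPUExecutionProvider"])
dummy = np.random.rand(20, 3, 224, 224).astype(np.float32)  # assumed export batch of 20
(onnx_out,) = session.run(["output"], {"input": dummy})
print(onnx_out.shape)  # expected (20, 768) for ViT-L/14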
  • onnx to trt (a quick engine-inspection sketch follows the command):
trtexec --onnx=ViT_L_14_visual.onnx --saveEngine=ViT_L_14_visual.trt --optShapes=input:20x3x224x224
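
Since the ONNX was exported without dynamic axes, it may help to confirm what the built engine actually expects (shape and dtype of its I/O tensors). A small sketch using the TensorRT Python API, assuming the engine file name above:

import tensorrt as trt

logger = trt.Logger()
with open("ViT_L_14_visual.trt", "rb") as f, trt.Runtime(logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    # print each I/O tensor's role, shape and dtype as stored in the engine
    print(name, engine.get_tensor_mode(name), engine.get_tensor_shape(name), engine.get_tensor_dtype(name))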
  • infer code:
import os
import time
import tensorrt as trt
import numpy as np
from PIL import Image
import pycuda.driver as cuda
import torch
import open_clip
import pycuda.autoinit  # noqa: F401  (implicitly creates the CUDA context)

# Original open_clip model: provides the preprocess transform and the PyTorch comparison path
class Clip_Vision():
    def __init__(self, model_name="ViT-L-14", pretrained='laion2b_s32b_b82k'):
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(model_name, pretrained, device=self.device)
        self.model.eval()

    def do_proprocess(self, image_paths):
        return [self.preprocess(Image.open(img_path)) for img_path in image_paths]

class ClipTensorrtWarpper():
    def __init__(self, engine_path, target_dtype = np.float32):
        self.clip_vision = Clip_Vision()
        self.trt_logger = trt.Logger()
        assert os.path.exists(engine_path)
        print("Reading engine from file {}".format(engine_path))
        with open(engine_path, "rb") as f, trt.Runtime(self.trt_logger) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
            self.context = self.engine.create_execution_context()
        self.target_dtype = target_dtype
        self.output_dim = 768  # image-embedding size of CLIP ViT-L/14

    def batch_infer(self, image_paths):
        tensor_list = self.clip_vision.do_proprocess(image_paths)
        stacked_tensor = torch.stack(tensor_list)
        np_array = stacked_tensor.numpy()

        contiguous_array = np_array if np_array.flags['C_CONTIGUOUS'] else np.ascontiguousarray(np_array)
        batch_size = len(image_paths)
        s_ts = time.time()
        # note: device buffer allocation and stream creation below fall inside the timed region
        d_input = cuda.mem_alloc(contiguous_array.nbytes)
        h_output = np.empty(self.output_dim * batch_size, dtype=self.target_dtype)
        d_output = cuda.mem_alloc(h_output.nbytes)
        # TensorRT 10 I/O tensor API: expect exactly one input and one output tensor
        tensor_names = [self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)]
        assert len(tensor_names) == 2
        input_shape = stacked_tensor.shape
        self.context.set_input_shape(tensor_names[0], input_shape)
        self.context.set_tensor_address(tensor_names[0], int(d_input))
        self.context.set_tensor_address(tensor_names[1], int(d_output))

        stream = cuda.Stream()
        cuda.memcpy_htod_async(d_input, contiguous_array, stream)  # host -> device copy
        self.context.execute_async_v3(stream.handle)                # enqueue inference
        cuda.memcpy_dtoh_async(h_output, d_output, stream)          # device -> host copy
        stream.synchronize()
        elapsed_time = (time.time() - s_ts) * 1000
        print(f"TensorRT infer {batch_size=} {elapsed_time=}ms")
        return h_output

    def clip_batch_infer(self, image_paths):
        batch_size = len(image_paths)
        tensor_list = self.clip_vision.do_proprocess(image_paths)
        stacked_tensor = torch.stack(tensor_list)
        s_ts = time.time()
        image_batch = stacked_tensor.to(self.clip_vision.device)
        image_features = self.clip_vision.model.encode_image(image_batch)
        elapsed_time = (time.time() - s_ts) * 1000
        print(f"open_clip infer {batch_size=} {elapsed_time=}ms")
        return image_features

def main():
    image_paths = []  # fill with paths to at least 20 local test images (omitted in the original post)
    engine_path = 'ViT_L_14_visual.trt'
    engine = ClipTensorrtWarpper(engine_path)
    for i in range(5):
        results1 = engine.batch_infer(image_paths[:20])
    for i in range(5):
        results2 = engine.clip_batch_infer(image_paths[:20])

if __name__ == "__main__":
    main()
  • elapsed time comparison (a synchronized-timing sketch follows these numbers):
    TensorRT infer batch_size=20 elapsed_time=312.9265308380127ms
    TensorRT infer batch_size=20 elapsed_time=270.9469795227051ms
    TensorRT infer batch_size=20 elapsed_time=271.61502838134766ms
    TensorRT infer batch_size=20 elapsed_time=270.9689140319824ms
    TensorRT infer batch_size=20 elapsed_time=272.4294662475586ms
    open_clip infer batch_size=20 elapsed_time=372.6212978363037ms
    open_clip infer batch_size=20 elapsed_time=72.8156566619873ms
    open_clip infer batch_size=20 elapsed_time=25.998353958129883ms
    open_clip infer batch_size=20 elapsed_time=24.63388442993164ms
    open_clip infer batch_size=20 elapsed_time=23.57339859008789ms
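
One caveat on the numbers above (a hedged note, not a diagnosis): PyTorch launches CUDA work asynchronously, and the first open_clip call is much slower than the rest (372 ms vs. ~25 ms), which suggests one-time warm-up cost. A stricter comparison would warm up both paths and synchronize the GPU around the timed region. A minimal sketch, where cv is a Clip_Vision instance and batch a preprocessed (20, 3, 224, 224) tensor (both names are assumptions, not from the code above):

import time
import torch

batch = batch.to(cv.device)
with torch.no_grad():
    cv.model.encode_image(batch)      # warm-up pass, excluded from the timed region
    torch.cuda.synchronize()
    start = time.time()
    feats = cv.model.encode_image(batch)
    torch.cuda.synchronize()          # wait for queued GPU work before reading the timer
print(f"open_clip encode_image (synchronized): {(time.time() - start) * 1000:.1f} ms")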

Environment

TensorRT Version: 10.2
GPU Type: V100
Nvidia Driver Version: 550.90.12
CUDA Version: 12.4
CUDNN Version: 8
Operating System + Version: Ubuntu 20.04.6 LTS
Python Version (if applicable): 3.10.14
PyTorch Version (if applicable): 2.4.1+cu121


ViT_L_14_visual.trt Performance summary

ViT_L_14_visual.onnx Performance summary

Which precision are your PyTorch model and your ONNX, respectively? TensorRT will only obey the precision specified in the ONNX if you compile the engine with --stronglyTyped.
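
For reference, the precision stored in the exported ONNX can be checked by looking at its initializer (weight) dtypes; a minimal sketch using the onnx package (the open_clip model created on CPU above is normally fp32 unless it was cast explicitly):

from collections import Counter
import onnx

model = onnx.load("ViT_L_14_visual.onnx")
# count initializer (weight) dtypes by name, e.g. FLOAT (fp32) or FLOAT16
dtype_counts = Counter(onnx.TensorProto.DataType.Name(init.data_type) for init in model.graph.initializer)
print(dtype_counts)

If the ONNX precision should be followed exactly, the engine can then be rebuilt with the --stronglyTyped flag mentioned above.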