Description
- Project address: https://github.com/mlfoundations/open_clip (an open source implementation of CLIP).
- model_name="ViT-L-14", pretrained="laion2b_s32b_b82k"
- to_onnx code:
import os
import torch
import open_clip
import onnx
import torch.nn as nn
import onnxruntime as ort
from PIL import Image


class FCN_Clip_Vision(nn.Module):
    """Wraps only the visual tower so encode_image() can be traced for ONNX export."""

    def __init__(self, model_name="ViT-L-14", pretrained='laion2b_s32b_b82k'):
        super(FCN_Clip_Vision, self).__init__()
        self.device = 'cpu'
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
            model_name, pretrained, device=self.device)
        self.model.eval()

    def forward(self, image_batch):
        return self.model.encode_image(image_batch)


class ClipOnnxModel():
    def __init__(self, model_name="ViT-L-14", pretrained='laion2b_s32b_b82k') -> None:
        self.device = "cpu"
        self.model_name = model_name
        self.pretrained = pretrained
        self.output_visual_onnx = "ViT_L_14_visual.onnx"
        self.output_textual_onnx = "ViT_L_14_textual.onnx"
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
            model_name, pretrained, device=self.device)
        self.tokenizer = open_clip.get_tokenizer(model_name)
        self.model.eval()

    def to_onnx(self, image_paths):
        images = [self.preprocess(Image.open(image_path)) for image_path in image_paths]
        stack_images = torch.stack(images)
        self.vision_model = FCN_Clip_Vision(self.model_name, self.pretrained)
        torch.onnx.export(
            self.vision_model, stack_images, self.output_visual_onnx,
            opset_version=14, export_params=True, do_constant_folding=True,
            input_names=["input"], output_names=["output"],
            # Empty dynamic_axes: the exported input/output shapes are static,
            # i.e. the batch dimension is fixed to len(image_paths) at export time.
            dynamic_axes={'input': {}, 'output': {}}, verbose=False
        )
# Note: image_paths (a list of local image file paths) is not defined in the
# snippet above; it is assumed to be available at this point.
clip_onnx_model = ClipOnnxModel()
clip_onnx_model.to_onnx(image_paths=image_paths)
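
As an optional sanity check of the export, the snippet below compares onnxruntime output against the PyTorch model for the same preprocessed batch. It is only a sketch: check_onnx_export is a hypothetical helper that is not part of the original script, and it assumes the same image_paths used for the export (the exported graph has a fixed batch size).

import numpy as np

def check_onnx_export(clip_onnx_model, image_paths, onnx_path="ViT_L_14_visual.onnx"):
    # Structural validation of the exported graph.
    onnx.checker.check_model(onnx.load(onnx_path))
    # Preprocess the same images and run them through both backends.
    batch = torch.stack([clip_onnx_model.preprocess(Image.open(p)) for p in image_paths])
    sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
    ort_out = sess.run(["output"], {"input": batch.numpy()})[0]
    with torch.no_grad():
        torch_out = clip_onnx_model.model.encode_image(batch).numpy()
    # The two outputs should agree to within normal float32 tolerance.
    print("max abs diff:", float(np.abs(ort_out - torch_out).max()))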
- ONNX to TensorRT engine (trtexec):
trtexec --onnx=ViT_L_14_visual.onnx --saveEngine=ViT_L_14_visual.trt --optShapes=input:20x3x224x224
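
One thing worth noting: because the export above uses empty dynamic_axes, the ONNX input shape is static, so --optShapes does not actually give the engine a dynamic batch dimension. For reference, here is a sketch of building the engine with the TensorRT Python API instead of trtexec, assuming the ONNX is re-exported with a dynamic batch axis (e.g. dynamic_axes={'input': {0: 'batch'}, 'output': {0: 'batch'}}); build_engine is a hypothetical helper, not part of the original workflow.

import tensorrt as trt

def build_engine(onnx_path="ViT_L_14_visual.onnx", engine_path="ViT_L_14_visual.trt",
                 max_batch=20, use_fp16=False):
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(0)  # TensorRT 10: explicit batch is the only mode
    parser = trt.OnnxParser(network, logger)
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            raise RuntimeError("failed to parse ONNX model")
    config = builder.create_builder_config()
    if use_fp16:
        config.set_flag(trt.BuilderFlag.FP16)  # optional half-precision build
    # Optimization profile for the dynamic batch dimension (min/opt/max shapes).
    profile = builder.create_optimization_profile()
    profile.set_shape("input", (1, 3, 224, 224), (max_batch, 3, 224, 224), (max_batch, 3, 224, 224))
    config.add_optimization_profile(profile)
    serialized = builder.build_serialized_network(network, config)
    with open(engine_path, "wb") as f:
        f.write(serialized)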
- inference code:
import os
import time
import tensorrt as trt
import numpy as np
from PIL import Image
import pycuda.driver as cuda
import torch
import open_clip
import pycuda.autoinit  # creates a CUDA context on import


class Clip_Vision():
    def __init__(self, model_name="ViT-L-14", pretrained='laion2b_s32b_b82k'):
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
            model_name, pretrained, device=self.device)
        self.model.eval()

    def do_preprocess(self, image_paths):
        return [self.preprocess(Image.open(img_path)) for img_path in image_paths]


class ClipTensorrtWrapper():
    def __init__(self, engine_path, target_dtype=np.float32):
        self.clip_vision = Clip_Vision()
        self.trt_logger = trt.Logger()
        assert os.path.exists(engine_path)
        print("Reading engine from file {}".format(engine_path))
        with open(engine_path, "rb") as f, trt.Runtime(self.trt_logger) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()
        self.target_dtype = target_dtype
        self.output_dim = 768  # ViT-L/14 image embedding size

    def batch_infer(self, image_paths):
        tensor_list = self.clip_vision.do_preprocess(image_paths)
        stacked_tensor = torch.stack(tensor_list)
        np_array = stacked_tensor.numpy()
        # Make sure the host buffer is C-contiguous before the H2D copy.
        contiguous_array = np_array if np_array.flags['C_CONTIGUOUS'] else np.ascontiguousarray(np_array)
        batch_size = len(image_paths)
        s_ts = time.time()
        # Device buffers are allocated on every call; the timing below therefore
        # includes the allocations and the H2D/D2H copies.
        d_input = cuda.mem_alloc(contiguous_array.nbytes)
        h_output = np.empty(self.output_dim * batch_size, dtype=self.target_dtype)
        d_output = cuda.mem_alloc(h_output.nbytes)
        tensor_names = [self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)]
        assert len(tensor_names) == 2
        input_shape = stacked_tensor.shape
        self.context.set_input_shape(tensor_names[0], input_shape)
        self.context.set_tensor_address(tensor_names[0], int(d_input))
        self.context.set_tensor_address(tensor_names[1], int(d_output))
        stream = cuda.Stream()
        cuda.memcpy_htod_async(d_input, contiguous_array, stream)
        self.context.execute_async_v3(stream.handle)
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        stream.synchronize()
        elapsed_time = (time.time() - s_ts) * 1000
        print(f"TensorRT infer {batch_size=} {elapsed_time=}ms")
        return h_output

    def clip_batch_infer(self, image_paths):
        batch_size = len(image_paths)
        tensor_list = self.clip_vision.do_preprocess(image_paths)
        stacked_tensor = torch.stack(tensor_list)
        s_ts = time.time()
        image_batch = stacked_tensor.to(self.clip_vision.device)
        image_features = self.clip_vision.model.encode_image(image_batch)
        elapsed_time = (time.time() - s_ts) * 1000
        print(f"open_clip infer {batch_size=} {elapsed_time=}ms")
        return image_features


def main():
    image_paths = []  # populated with local image paths in the original run (20 images per batch below)
    engine_path = 'ViT_L_14_visual.trt'
    engine = ClipTensorrtWrapper(engine_path)
    for i in range(5):
        results1 = engine.batch_infer(image_paths[:20])
    for i in range(5):
        results2 = engine.clip_batch_infer(image_paths[:20])


if __name__ == "__main__":
    main()
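
Separately from the timing, a small consistency check can confirm that the engine and the PyTorch model produce the same embeddings. This is only a sketch: compare_outputs is a hypothetical helper, and it assumes batch_infer returns a flat float32 array of shape (batch * 768,) as in the code above.

def compare_outputs(wrapper, image_paths):
    # TensorRT output comes back flat; reshape to (batch, 768).
    trt_out = wrapper.batch_infer(image_paths).reshape(len(image_paths), -1)
    torch_out = wrapper.clip_batch_infer(image_paths).detach().cpu().numpy()
    # Per-image cosine similarity; values close to 1.0 indicate matching embeddings.
    a = trt_out / np.linalg.norm(trt_out, axis=1, keepdims=True)
    b = torch_out / np.linalg.norm(torch_out, axis=1, keepdims=True)
    print("per-image cosine similarity:", (a * b).sum(axis=1))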
- elapsed time comparison: after the first (warm-up) iteration, the TensorRT engine stays around 270 ms per batch of 20 images, while the open_clip PyTorch path drops to roughly 25 ms, i.e. the TensorRT path is about 10x slower here:
TensorRT infer batch_size=20 elapsed_time=312.9265308380127ms
TensorRT infer batch_size=20 elapsed_time=270.9469795227051ms
TensorRT infer batch_size=20 elapsed_time=271.61502838134766ms
TensorRT infer batch_size=20 elapsed_time=270.9689140319824ms
TensorRT infer batch_size=20 elapsed_time=272.4294662475586ms
open_clip infer batch_size=20 elapsed_time=372.6212978363037ms
open_clip infer batch_size=20 elapsed_time=72.8156566619873ms
open_clip infer batch_size=20 elapsed_time=25.998353958129883ms
open_clip infer batch_size=20 elapsed_time=24.63388442993164ms
open_clip infer batch_size=20 elapsed_time=23.57339859008789ms
Environment
TensorRT Version: 10.2
GPU Type: V100
Nvidia Driver Version: 550.90.12
CUDA Version: 12.4
CUDNN Version: 8
Operating System + Version: Ubuntu 20.04.6 LTS
Python Version (if applicable): 3.10.14
PyTorch Version (if applicable): 2.4.1+cu121