Description
I’ve been grappling with TensorRT for dynamic batch size inference and have used explicit batch sizes, and also optimization profiles. However, despite my efforts, I’m still encountering difficulties. NVIDIA’s documentation are quite complex, detailed, and challenging to comprehend. Could someone provide a clearer explanation or perhaps a step-by-step guide on how to effectively implement dynamic batch size inference using TensorRT? Any insights or tips would be greatly appreciated. I have been dealing with Tensorrt for a very long time and it is really troublesome working with dynamic batch sizes( works fine when I use onnx without dynamic axis, and convert it to Tensorrt , but for dynamic batch size I am stuck. I can’t After doing all the following I get the following errores :
[05/05/2024-11:58:38] [TRT] [E] 3: [executionContext.cpp::resolveSlots::2501] Error Code 3: API Usage Error (Parameter check failed at: runtime/api/executionContext.cpp::resolveSlots::2501, condition: allInputDimensionsSpecified(routine)
)
[05/05/2024-11:58:38] [TRT] [E] 3: [executionContext.cpp::resolveSlots::2501] Error Code 3: API Usage Error (Parameter check failed at: runtime/api/executionContext.cpp::resolveSlots::2501, condition: allInputDimensionsSpecified(routine)
)
[05/05/2024-11:58:38] [TRT] [E] 3: [executionContext.cpp::resolveSlots::2501] Error Code 3: API Usage Error (Parameter check failed at: runtime/api/executionContext.cpp::resolveSlots::2501, condition: allInputDimensionsSpecified(routine)
Hugging face link for the pytorch model I am trying to convert to Tensorrt: m3hrdadfi/wav2vec2-large-xlsr-persian-v3 · Hugging Face
And here is the code I am using to try to turn a pytorch model to onnx:
def fixedBatchSize_load_onnx( model_path ,inputs_,halved,device):# for loading the model in halved and without halved precission
model = Wav2Vec2ForCTC.from_pretrained(model_path )
model.eval()
dtype = torch.float16 if halved else torch.float32
inputs = inputs_.input_values
masks = inputs_.attention_mask
onnxModelDir = os.path.join("models" , "onnx", f"{Wav2Vec2ForCTC.__name__}.onnx")
onnxHalfModelDir = os.path.join("models" , "onnx", f"{Wav2Vec2ForCTC.__name__}-halved.onnx")
if not os.path.exists(onnxModelDir) or not os.path.exists(onnxHalfModelDir):
head, _ = os.path.split(onnxModelDir)
os.makedirs(head,exist_ok = True)
# dummy_input = torch.unsqueeze(inputs , dim=0)
torch.onnx.export(
model,
(inputs, inputs),
onnxModelDir,
dynamic_axes={"input": {0: "batch_size"},
"masks": {0: "batch_size"},
"output": {0: "batch_size"}},
input_names=["input", "masks"],
output_names=["output"],
opset_version=11 )
And here is the code I am using for trying to convert that onnx model and create tensorrt engine:
def createTensorrtModel(onnxModelDir,tensorRT_model_path):
# Step 3: Convert ONNX model to TensorRT
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(TRT_LOGGER)
network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
profile = builder.create_optimization_profile();
profile.set_shape("input", (1, 240320), (3 , 240320), (10,240320)) # for dynamic batch-size for inputs
profile.set_shape("masks", (1, 240320), (3 , 240320), (10,240320)) # for dynamic batch-size for masks
with builder.create_network(network_flags) as network, builder.create_builder_config() as config, trt.OnnxParser(network, TRT_LOGGER) as parser:
# config.max_workspace_size = 1 << 30 # Set workspace size
# config.set_flag(trt.BuilderFlag.EXPLICIT_BATCH)
config.add_optimization_profile(profile)
with open(onnxModelDir, "rb") as f:
if not parser.parse(f.read()):
for error in range(parser.num_errors):
print(parser.get_error(error))
else:
engine = builder.build_engine(network, config)
# serialized_engine = builder.build_serialized_network(network, config)
# Step 4: Save TensorRT engine to file
with open(tensorRT_model_path, "wb") as f:
f.write(engine.serialize())
And here is how I prepare the engine and handle bindings, context, and engine :
def load_tensorrt(modelPath ,inputs_, device="cuda:0", halved=False):
tensorrtModelPath = r"Wav2Vec2ForCTC.trt"
input_ = inputs_["input_values"]
masks_ = inputs_["attention_mask"]
input_ = input_.half() if halved else input_
masks_ = masks_.half() if halved else masks_
input_ =input_.to("cuda:0")
masks_ =masks_.to("cuda:0")
if device != 'cuda':
raise Exception("for TensorRT you must only use GPUs")
if halved :
tensorrtModelDir = os.path.join("models" , "tensorrt", f"{Wav2Vec2ForCTC.__name__}.trt")
onnxModelDir = os.path.join("models" , "onnx", f"{Wav2Vec2ForCTC.__name__}.onnx")
else :
onnxModelDir = os.path.join("models" , "onnx", f"{Wav2Vec2ForCTC.__name__}.onnx")
tensorrtModelDir = os.path.join("models" , "tensorrt", f"{Wav2Vec2ForCTC.__name__}.trt")
if not os.path.exists(onnxModelDir):
model = Wav2Vec2ForCTC.from_pretrained(modelPath )
model.eval()
os.makedirs(os.path.join("models" , "onnx"), exist_ok= True)
_= fixedBatchSize_load_onnx(modelPath ,inputs_,halved,device)
if not os.path.exists(tensorrtModelDir):
model = Wav2Vec2ForCTC.from_pretrained(modelPath )
model.eval()
os.makedirs(os.path.join("models" , "tensorrt"), exist_ok= True)
_= createTensorrtModel(onnxModelDir,tensorrtModelDir)
with open(tensorrtModelDir, 'rb') as f, trt.Runtime(trt.Logger(trt.Logger.WARNING)) as runtime:
engine = runtime.deserialize_cuda_engine(f.read()) # encoder
print ("successfully Loaded the TensorRT model")
# Step 4: Inference with TensorRT model
context = engine.create_execution_context()
bindings= []
# inspector = engine.create_engine_inspector()
try :
for binding in engine:
binding_idx = engine.get_binding_index(binding)
bindingName = engine.get_tensor_name(binding_idx)
size = trt.volume(context.get_tensor_shape(bindingName))
size = abs(size)
dtype = trt.nptype(engine.get_tensor_dtype(bindingName))
tensorMode = engine.get_tensor_mode(bindingName).value
if tensorMode == 1:
if binding == "input":
bindings.append(int(input_.data_ptr()))
elif binding == "masks":
bindings.append(int(masks_.data_ptr()))
elif tensorMode ==2:
if binding == "output":
# if size<0:
# size = abs(size)
output_ = torch.zeros(size,device="cuda").unsqueeze(0)
bindings.append(int(output_.data_ptr()))
return engine, context, bindings,output_
except Exception as e:
print("An error occurred:", e)
return None
and for the final step , for inferencing :
mode == "tensorrt":
engine, context, bindings,output_ = load_tensorrt(model_path ,inputs, device, halved)
stream = torch.cuda.Stream(device="cuda")
context.set_optimization_profile_async(0, stream.cuda_stream)# for dnamic batch-size
for _ in range(nwarmup):
context.execute_async_v2(bindings=bindings, stream_handle=stream.cuda_stream)
stream.synchronize()# Transfer prediction output from the GPU.
for _ in range(numberOfTimes):
mdoel_start_time = time.time()
context.execute_async_v2(bindings=bindings, stream_handle=stream.cuda_stream)
stream.synchronize()# Transfer prediction output from the GPU.
model_time_track.append(time.time() - mdoel_start_time)
predicted_sentences,decoder_time_track = decoder_timing(decoder_time_track, output_)
result['outputs'] = predicted_sentences
> system description :
> Python 3.8.10
> NAME=“Ubuntu”
> VERSION=“20.04.5 LTS (Focal Fossa)”
> NVIDIA GeForce RTX 3090
> Cuda compilation tools, release 11.8, V11.8.89
> Build cuda_11.8.r11.8/compiler.31833905_0
***> NVIDIA-SMI 520.61.05 Driver Version: 520.61.05 CUDA Version: 11.8 ***
> docker container version : nvcr.io/nvidia/pytorch:22.12-py3
requirements.txt (5.9 KB)