Dynamic batch size for tensorrt Engine

Description

I’ve been grappling with TensorRT for dynamic batch size inference and have used explicit batch sizes, and also optimization profiles. However, despite my efforts, I’m still encountering difficulties. NVIDIA’s documentation are quite complex, detailed, and challenging to comprehend. Could someone provide a clearer explanation or perhaps a step-by-step guide on how to effectively implement dynamic batch size inference using TensorRT? Any insights or tips would be greatly appreciated. I have been dealing with Tensorrt for a very long time and it is really troublesome working with dynamic batch sizes( works fine when I use onnx without dynamic axis, and convert it to Tensorrt , but for dynamic batch size I am stuck. I can’t After doing all the following I get the following errores :

[05/05/2024-11:58:38] [TRT] [E] 3: [executionContext.cpp::resolveSlots::2501] Error Code 3: API Usage Error (Parameter check failed at: runtime/api/executionContext.cpp::resolveSlots::2501, condition: allInputDimensionsSpecified(routine)
)
[05/05/2024-11:58:38] [TRT] [E] 3: [executionContext.cpp::resolveSlots::2501] Error Code 3: API Usage Error (Parameter check failed at: runtime/api/executionContext.cpp::resolveSlots::2501, condition: allInputDimensionsSpecified(routine)
)
[05/05/2024-11:58:38] [TRT] [E] 3: [executionContext.cpp::resolveSlots::2501] Error Code 3: API Usage Error (Parameter check failed at: runtime/api/executionContext.cpp::resolveSlots::2501, condition: allInputDimensionsSpecified(routine)

Hugging face link for the pytorch model I am trying to convert to Tensorrt: m3hrdadfi/wav2vec2-large-xlsr-persian-v3 · Hugging Face

And here is the code I am using to try to turn a pytorch model to onnx:

def fixedBatchSize_load_onnx( model_path ,inputs_,halved,device):# for loading the model in halved and without halved precission
    model = Wav2Vec2ForCTC.from_pretrained(model_path )
    model.eval()
    dtype = torch.float16 if halved else torch.float32
    inputs = inputs_.input_values
    masks = inputs_.attention_mask
    onnxModelDir = os.path.join("models" , "onnx",  f"{Wav2Vec2ForCTC.__name__}.onnx")
    onnxHalfModelDir = os.path.join("models" , "onnx",  f"{Wav2Vec2ForCTC.__name__}-halved.onnx")
    if not os.path.exists(onnxModelDir) or not os.path.exists(onnxHalfModelDir):
        head, _ = os.path.split(onnxModelDir)
        os.makedirs(head,exist_ok = True)
        # dummy_input = torch.unsqueeze(inputs , dim=0) 
        torch.onnx.export(
            model,
            (inputs, inputs),
            onnxModelDir,
            dynamic_axes={"input": {0: "batch_size"},
                "masks": {0: "batch_size"},
                "output": {0: "batch_size"}},
            input_names=["input", "masks"],
            output_names=["output"],
            opset_version=11 )

And here is the code I am using for trying to convert that onnx model and create tensorrt engine:

def createTensorrtModel(onnxModelDir,tensorRT_model_path):    
    # Step 3: Convert ONNX model to TensorRT
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(TRT_LOGGER)
    network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    profile = builder.create_optimization_profile();
    profile.set_shape("input", (1, 240320), (3 , 240320), (10,240320)) # for dynamic batch-size for inputs 
    profile.set_shape("masks", (1, 240320), (3 , 240320), (10,240320)) # for dynamic batch-size for masks 
    with builder.create_network(network_flags) as network, builder.create_builder_config() as config, trt.OnnxParser(network, TRT_LOGGER) as parser:
        # config.max_workspace_size = 1 << 30  # Set workspace size
        # config.set_flag(trt.BuilderFlag.EXPLICIT_BATCH)
        config.add_optimization_profile(profile)
        with open(onnxModelDir, "rb") as f:
            if not parser.parse(f.read()):
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
            else:
                engine = builder.build_engine(network, config)
                # serialized_engine = builder.build_serialized_network(network, config)
    # Step 4: Save TensorRT engine to file
    with open(tensorRT_model_path, "wb") as f:
        f.write(engine.serialize())

And here is how I prepare the engine and handle bindings, context, and engine :

def load_tensorrt(modelPath ,inputs_,  device="cuda:0", halved=False):
        tensorrtModelPath = r"Wav2Vec2ForCTC.trt"
        input_ = inputs_["input_values"]
        masks_ = inputs_["attention_mask"]
        input_ = input_.half() if halved else input_ 
        masks_ = masks_.half() if halved else masks_ 
        input_ =input_.to("cuda:0")
        masks_ =masks_.to("cuda:0")
        if device != 'cuda':
            raise Exception("for TensorRT you must only use GPUs")

            
        if halved :
            tensorrtModelDir = os.path.join("models" , "tensorrt",  f"{Wav2Vec2ForCTC.__name__}.trt")
            onnxModelDir = os.path.join("models" , "onnx",  f"{Wav2Vec2ForCTC.__name__}.onnx")
        else :
            onnxModelDir = os.path.join("models" , "onnx",  f"{Wav2Vec2ForCTC.__name__}.onnx")
            tensorrtModelDir = os.path.join("models" , "tensorrt",  f"{Wav2Vec2ForCTC.__name__}.trt")
            
            
        if not os.path.exists(onnxModelDir):
            model = Wav2Vec2ForCTC.from_pretrained(modelPath )
            model.eval()
            os.makedirs(os.path.join("models" , "onnx"), exist_ok= True)
            _= fixedBatchSize_load_onnx(modelPath ,inputs_,halved,device)
            
        if not os.path.exists(tensorrtModelDir):
            model = Wav2Vec2ForCTC.from_pretrained(modelPath )
            model.eval()
            os.makedirs(os.path.join("models" , "tensorrt"), exist_ok= True)
            _= createTensorrtModel(onnxModelDir,tensorrtModelDir)
            
        with open(tensorrtModelDir, 'rb') as f, trt.Runtime(trt.Logger(trt.Logger.WARNING)) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())    # encoder
        print ("successfully Loaded the TensorRT model")
        # Step 4: Inference with TensorRT model
        context =  engine.create_execution_context() 
        bindings= [] 
        # inspector = engine.create_engine_inspector()
        try :
            for binding in engine:
                binding_idx = engine.get_binding_index(binding)
                bindingName = engine.get_tensor_name(binding_idx)
                size = trt.volume(context.get_tensor_shape(bindingName))
                size = abs(size)
                dtype = trt.nptype(engine.get_tensor_dtype(bindingName))
                tensorMode = engine.get_tensor_mode(bindingName).value
                if tensorMode == 1:
                    if binding == "input":
                        bindings.append(int(input_.data_ptr()))
                    elif binding == "masks":
                        bindings.append(int(masks_.data_ptr()))
                elif tensorMode ==2:
                    if binding == "output":
                        # if size<0:
                            # size = abs(size)
                        output_ = torch.zeros(size,device="cuda").unsqueeze(0)
                        bindings.append(int(output_.data_ptr()))
            return engine, context, bindings,output_
        except Exception as e:
            print("An error occurred:", e)
            return None

and for the final step , for inferencing :

    mode == "tensorrt":
        engine, context, bindings,output_ = load_tensorrt(model_path ,inputs,  device, halved)
        stream = torch.cuda.Stream(device="cuda") 
        context.set_optimization_profile_async(0, stream.cuda_stream)# for dnamic batch-size
        for _ in range(nwarmup):
            context.execute_async_v2(bindings=bindings, stream_handle=stream.cuda_stream)
            stream.synchronize()# Transfer prediction output from the GPU.
        for _ in range(numberOfTimes): 
            mdoel_start_time = time.time()
            context.execute_async_v2(bindings=bindings, stream_handle=stream.cuda_stream)
            stream.synchronize()# Transfer prediction output from the GPU.
            model_time_track.append(time.time() - mdoel_start_time)
            predicted_sentences,decoder_time_track = decoder_timing(decoder_time_track, output_)

    result['outputs'] = predicted_sentences

> system description :
> Python 3.8.10
> NAME=“Ubuntu”
> VERSION=“20.04.5 LTS (Focal Fossa)”
> NVIDIA GeForce RTX 3090
> Cuda compilation tools, release 11.8, V11.8.89
> Build cuda_11.8.r11.8/compiler.31833905_0
***> NVIDIA-SMI 520.61.05 Driver Version: 520.61.05 CUDA Version: 11.8 ***
> docker container version : nvcr.io/nvidia/pytorch:22.12-py3

requirements.txt (5.9 KB)