API Usage Error (Parameter check failed at: runtime/api/executionContext.cpp::resolveSlots::1480, condition: allInputDimensionsSpecified(routine)


first I converted torch model to onnx using the following code in terminal :

python -m transformers.onnx --model=monologg/bert-base-cased-goemotions-original bertbase-cased-goemotion-original/ --feature causal-lm

then using the following python code I built an engine :

import pycuda.driver as cuda
import pycuda.autoinit
import onnx
import argparse
import tensorrt as trt
import os
import sys
def build_engine(model_file, shapes, max_ws=512*1024*1024, fp16=True):
    TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
    builder = trt.Builder(TRT_LOGGER)
    #builder.fp16_mode = fp16

    config = builder.create_builder_config()
    # config.max_workspace_size = max_ws
    config.max_workspace_size = 1 << 31
    if fp16:
        # config.flags |= 1 << int(trt.BuilderFlag.FP16)
    profile = builder.create_optimization_profile()
    for s in shapes:
        profile.set_shape(s['name'], min=s['min'], opt=s['opt'], max=s['max'])
    #Added for further optimization
    config.set_tactic_sources(1 << int(trt.TacticSource.CUBLAS))
    explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(explicit_batch)

    with trt.OnnxParser(network, TRT_LOGGER) as parser:
        with open(model_file, 'rb') as model:
            parsed = parser.parse(model.read())

        last_layer = network.get_layer(network.num_layers - 1)
        # Check if last layer recognizes it's output
        if not last_layer.get_output(0):
            # If not, then mark the output using TensorRT API
        for i in range(parser.num_errors):
            print("TensorRT ONNX parser error:", parser.get_error(i))
        engine = builder.build_serialized_network(network, config=config)

        return engine

#from trt_utils import build_engine
model_name = "bert-base-cased-goemotions-ekman_model"

fp16 = False
engine_prec = "_fp16" if fp16 else "_fp32"
out_folder = "/3" if fp16 else "/1"

model = "/yazdani/bert-base-cased-goemotion-original/model.onnx"
output =  "/yazdani/bert-base-cased-goemotion-original/trt"

shape=[{"name": "input_ids", "min": (1,3), "opt": (1,20), "max": (1,80)},
        {"name": "attention_mask", "min": (1,3), "opt": (1,20), "max": (1,80)},
       {"name": "token_type_ids", "min": (1,3), "opt": (1,20), "max": (1,80)}
if model != "":
    print("Building model ...")
    model_engine = build_engine(model, shapes = shape ,fp16=fp16)
    if model_engine is not None:
        engine_path = os.path.join(output, "model"+engine_prec+".plan")
        with open(engine_path, 'wb') as f:
            # f.write(model_engine.serialize())
        print("Failed to build engine from", model)

finally I wanted to get inference from the engine using the following script :

import tensorrt as trt
from transformers import BertTokenizer
import torch

import pycuda.driver as cuda
import pycuda.autoinit
import time
import data_processing as dp
import numpy as np

import ctypes
import os

ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)

def is_dimension_dynamic(dim):
    return dim is None or dim <= 0

def is_shape_dynamic(shape):
    return any([is_dimension_dynamic(dim) for dim in shape])

def run_trt_engine(context, engine, tensors):

    bindings = [None]*engine.num_bindings
    for name,tensor in tensors['inputs'].items():
        idx = engine.get_binding_index(name)
        bindings[idx] = tensor.data_ptr()
        if engine.is_shape_binding(idx) and is_shape_dynamic(context.get_shape(idx)):
            context.set_shape_input(idx, tensor)
        elif is_shape_dynamic(engine.get_binding_shape(idx)):
            context.set_binding_shape(idx, tensor.shape)

    for name,tensor in tensors['outputs'].items():
        idx = engine.get_binding_index(name)
        bindings[idx] = tensor.data_ptr()

    stream = cuda.Stream() 

def load_engine(engine_filepath, trt_logger):
    with open(engine_filepath, "rb") as f, trt.Runtime(trt_logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    return engine

def engine_info(engine_filepath):
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    engine = load_engine(engine_filepath, TRT_LOGGER)

    binding_template = r"""{btype} {{
  name: "{bname}"
  data_type: {dtype}
  dims: {dims}}}"""
    type_mapping = {"DataType.HALF": "TYPE_FP16",
                    "DataType.FLOAT": "TYPE_FP32",
                    "DataType.INT32": "TYPE_INT32",
                    "DataType.BOOL" : "TYPE_BOOL"}

    print("engine name", engine.name)
    print("has_implicit_batch_dimension", engine.has_implicit_batch_dimension)
    start_dim = 0 if engine.has_implicit_batch_dimension else 1
    print("num_optimization_profiles", engine.num_optimization_profiles)
    print("max_batch_size:", engine.max_batch_size)
    print("device_memory_size:", engine.device_memory_size)
    print("num_layers:", engine.num_layers)

    for i in range(engine.num_bindings):
        btype = "input" if engine.binding_is_input(i) else "output"
        bname = engine.get_binding_name(i)
        dtype = engine.get_binding_dtype(i)
        bdims = engine.get_binding_shape(i)
        config_values = {
            "btype": btype,
            "bname": bname,
            "dtype": type_mapping[str(dtype)],
            "dims": list(bdims[start_dim:])
        final_binding_str = binding_template.format_map(config_values)
    return engine

engine_filepath = "/yazdani/bert-base-cased-goemotion-original/trt/model_fp32.plan"
engine = engine_info(engine_filepath)
context = engine.create_execution_context()

batch_size = 1
tokenizer = BertTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-original")
inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")

seq_len = len(inputs.input_ids[0])
bert_output = torch.zeros((batch_size, seq_len,28996)).cpu().detach().numpy()

d_input_ids = cuda.mem_alloc(batch_size * inputs.input_ids.nbytes)
d_token_type_ids = cuda.mem_alloc(batch_size * inputs.token_type_ids.nbytes)
d_attention_mask = cuda.mem_alloc(batch_size * inputs.attention_mask.nbytes)

d_output = cuda.mem_alloc(batch_size * bert_output.nbytes)

bindings = [int(d_input_ids), int(d_token_type_ids), int(d_attention_mask), int(d_output)]

stream = cuda.Stream()

cuda.memcpy_htod_async(d_input_ids, inputs.input_ids, stream)
cuda.memcpy_htod_async(d_token_type_ids, inputs.token_type_ids, stream)
cuda.memcpy_htod_async(d_attention_mask, inputs.attention_mask, stream)

context.execute_async(batch_size, bindings, stream.handle, None)

cuda.memcpy_dtoh_async(bert_output, d_output, stream)

pred = torch.tensor(bert_output)


but I encountered the error below:

[04/19/2022-12:18:02] [TRT] [E] 3: [executionContext.cpp::resolveSlots::1480] Error Code 3: API Usage Error (Parameter check failed at: runtime/api/executionContext.cpp::resolveSlots::1480, condition: allInputDimensionsSpecified(routine)
[04/19/2022-12:18:02] [TRT] [E] 2: [executionContext.cpp::enqueueInternal::366] Error Code 2: Internal Error (Could not resolve slots: )


TensorRT Version: 8.2
GPU Type: V100
Nvidia Driver Version: 495.29.05
CUDA Version: 11.5
Operating System + Version: Linux x86
Python Version (if applicable): 3.8

solved !

Please check the below link, as they might answer your concerns


How did you solve it or what was the issue?

How did you solved it?

for context we should set input shape before cuda memcopy