Description
I’d like to run an engine with a dynamic batch size. For now my goal is just to run the model and test it; performance is not important yet. I based my code on the examples from the official repository (TensorRT/samples/python at main · NVIDIA/TensorRT · GitHub), but when I run it, the call context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) throws the following error:
[TensorRT] ERROR: 3: [executionContext.cpp::resolveSlots::1495] Error Code 3: Internal Error (Parameter check failed at: runtime/api/executionContext.cpp::resolveSlots::1495, condition: allInputDimensionsSpecified(routine)
)
[TensorRT] ERROR: 2: [executionContext.cpp::enqueueInternal::360] Error Code 2: Internal Error (Could not resolve slots: )
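From the error it looks like the engine has dynamic input dimensions that never get resolved on the execution context. My understanding (an assumption on my part, not something I have verified) is that a dynamic-shape engine needs the concrete input shape set on the context before enqueueing, roughly like the sketch below, where the binding index 0 and the (1, 3, 224, 224) shape are guesses based on my preprocessing; my code further down does not do this anywhere.
# Assumed sketch: resolve the dynamic input shape before enqueueing.
context.set_binding_shape(0, (1, 3, 224, 224))
assert context.all_binding_shapes_specified
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)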
Environment
I am running my code inside the official DeepStream 6.0.1 container, which ships with TensorRT 8.0.1.6.
Steps To Reproduce
Here’s my code:
import numpy as np
import requests
from PIL import Image
import tensorrt as trt
import torch
from torchvision import transforms
import torchvision.transforms.functional as F
import pycuda.driver as cuda
import pycuda.autoinit
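# SquarePad is used in the preprocessing pipeline below, but its definition was
# missing from this snippet. This is an assumed minimal implementation: pad the
# shorter side of a CHW tensor so the image is square before resizing.
class SquarePad:
    def __call__(self, image):
        _, h, w = image.shape
        max_side = max(h, w)
        pad_left = (max_side - w) // 2
        pad_top = (max_side - h) // 2
        # F.pad expects padding as [left, top, right, bottom]
        return F.pad(image, [pad_left, pad_top, max_side - w - pad_left, max_side - h - pad_top])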
class HostDeviceMem(object):
""" Copied from https://github.com/NVIDIA/TensorRT/blob/main/samples/python/common.py """
def __init__(self, host_mem, device_mem):
self.host = host_mem
self.device = device_mem
def __str__(self):
return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
def __repr__(self):
return self.__str__()
class MyModel:
def __init__(self, engine_path):
self.engine_path = engine_path
self.logger = trt.Logger(trt.Logger.WARNING)
self.runtime = trt.Runtime(self.logger)
self.engine = self.load_engine(self.runtime, self.engine_path)
self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers(self.engine)
self.context = self.engine.create_execution_context()
# PyTorch preprocessing
IMAGE_SIZE = 224
NORMALIZE_MEAN = torch.tensor([0.485, 0.456, 0.406])
NORMALIZE_STD = torch.tensor([0.226, 0.226, 0.266])
self.preprocessing_transforms = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD), # todo: is it between -1 and 1?
SquarePad(),
transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
])
self.input_dtype = np.float32
@staticmethod
def download_image(image_url: str) -> Image.Image:
return Image.open(requests.get(image_url, stream=True).raw)
@staticmethod
def load_engine(trt_runtime, engine_path):
""" Copied from https://github.com/NVIDIA/TensorRT/blob/main/samples/python/common.py """
trt.init_libnvinfer_plugins(None, "")
with open(engine_path, 'rb') as f:
engine_data = f.read()
engine = trt_runtime.deserialize_cuda_engine(engine_data)
return engine
@staticmethod
def allocate_buffers(engine):
""" Copied from https://github.com/NVIDIA/TensorRT/blob/main/samples/python/common.py """
inputs = []
outputs = []
bindings = []
stream = cuda.Stream()
for binding in engine:
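            # get_binding_shape returns -1 for dynamic dimensions, so the volume can be
            # negative for a dynamic-shape engine; abs() is used below to still get a
            # positive buffer size.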
size = abs(trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size)
dtype = trt.nptype(engine.get_binding_dtype(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
return inputs, outputs, bindings, stream
@staticmethod
def do_inference_v2(context, bindings, inputs, outputs, stream):
""" Copied from https://github.com/NVIDIA/TensorRT/blob/main/samples/python/common.py """
# Transfer input data to the GPU.
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
# Run inference.
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
# Transfer predictions back from the GPU.
[cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
# Synchronize the stream
stream.synchronize()
# Return only the host outputs.
return [out.host for out in outputs]
def infer(self, image: Image.Image):
image = self._preprocessing(image)
batch = np.expand_dims(image, 0)
output = self._trt_infer(x=batch, batch_size=1)
return output
def _preprocessing(self, image: Image.Image):
image = self.preprocessing_transforms(image)
image = np.array(image)
return image
def _trt_infer(self, x: np.array, batch_size: int) -> np.array:
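        # Note: batch_size is currently unused; the buffers allocated in
        # allocate_buffers are used as-is.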
x = x.astype(self.input_dtype)
np.copyto(self.inputs[0].host, x.ravel())
return self.do_inference_v2(self.context, self.bindings, self.inputs, self.outputs, self.stream)
if __name__ == "__main__":
model = MyModel(engine_path="model.engine")
image_urls = [
"https://www.kbb.com/wp-content/uploads/2020/10/2020-ford-expedition-rear.jpg?w=300&crop=1&strip=all"
]
for image_url in image_urls:
image = model.download_image(image_url)
output = model.infer(image)
I can send the serialized engine privately if that helps with debugging the issue.