Dear NVIDIA community,
Could you please help me fix the issue below? Thanks in advance!
run.py:53: DeprecationWarning: Use get_tensor_shape instead.
size = trt.volume(engine.get_binding_shape(binding)) * batch_size
run.py:54: DeprecationWarning: Use get_tensor_dtype instead.
dtype = trt.nptype(engine.get_binding_dtype(binding))
run.py:61: DeprecationWarning: Use get_tensor_mode instead.
if engine.binding_is_input(binding):
Traceback (most recent call last):
File "run.py", line 71, in <module>
cuda.memcpy_htod_async(inp['device'], inp['host'], stream)
pycuda._driver.LogicError: cuMemcpyHtoDAsync failed: invalid argument
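From the warnings it looks like I am on a TensorRT version (8.5+) where the binding-based calls are deprecated. For reference, this is my understanding of how my allocation loop would look with the name-based tensor API that the warnings point to (just a sketch based on the deprecation messages and the docs, not code I have verified):

# Hypothetical rewrite of the buffer-allocation loop with the name-based API
for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    size = trt.volume(engine.get_tensor_shape(name)) * batch_size
    dtype = trt.nptype(engine.get_tensor_dtype(name))
    host_mem = cuda.pagelocked_empty(size, dtype)
    device_mem = cuda.mem_alloc(host_mem.nbytes)
    bindings.append(int(device_mem))
    if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
        inputs.append({'host': host_mem, 'device': device_mem})
    else:
        outputs.append({'host': host_mem, 'device': device_mem})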
The full code:
import torch
import numpy as np
import pickle
import torchvision
from PIL import Image as PIL_Image
from utils.language_utils import tokens2description
import time
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
img_size = 384
with open('./demo_material/demo_coco_tokens.pickle', 'rb') as f:
    coco_tokens = pickle.load(f)
sos_idx = coco_tokens['word2idx_dict'][coco_tokens['sos_str']]
eos_idx = coco_tokens['word2idx_dict'][coco_tokens['eos_str']]
# Pre-Processing
def preprocess_image(image_path):
    transf_1 = torchvision.transforms.Compose(
        [torchvision.transforms.Resize((img_size, img_size))])
    transf_2 = torchvision.transforms.Compose(
        [torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                          std=[0.229, 0.224, 0.225])])
    pil_image = PIL_Image.open(image_path)
    if pil_image.mode != 'RGB':
        pil_image = PIL_Image.new("RGB", pil_image.size)
    preprocess_pil_image = transf_1(pil_image)
    image = torchvision.transforms.ToTensor()(preprocess_pil_image)
    image = transf_2(image)
    return image.unsqueeze(0)
# Build TensorRT engine
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt_runtime = trt.Runtime(TRT_LOGGER)
def build_engine(model_path):
    with open(model_path, "rb") as f:
        engine_data = f.read()
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine
# we test the generalization of the graph by testing on two images
image_1 = preprocess_image('./demo_material/napoleon.jpg')
# generate optimized graph
print("Testing first image on TensorRT")
engine = build_engine('./trt_fp16.engine')
context = engine.create_execution_context()
batch_size = 1
inputs, outputs, bindings, stream = [], [], [], cuda.Stream()
for binding in engine:
    size = trt.volume(engine.get_binding_shape(binding)) * batch_size
    dtype = trt.nptype(engine.get_binding_dtype(binding))
    # Allocate host and device buffers
    host_mem = cuda.pagelocked_empty(size, dtype)
    device_mem = cuda.mem_alloc(host_mem.nbytes)
    # Append the device buffer to device bindings.
    bindings.append(int(device_mem))
    # Append to the appropriate list.
    if engine.binding_is_input(binding):
        inputs.append({'host': host_mem, 'device': device_mem})
    else:
        outputs.append({'host': host_mem, 'device': device_mem})
# Set input values
inputs[0]['host'] = image_1.numpy().ravel()
inputs[1]['host'] = np.array([0])
inputs[2]['host'] = np.array([sos_idx])
# Transfer input data to the GPU.
for inp in inputs:
    cuda.memcpy_htod_async(inp['device'], inp['host'], stream)
# Execute model
start = time.time()
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
# Transfer predictions back from the GPU.
for out in outputs:
    cuda.memcpy_dtoh_async(out['host'], out['device'], stream)
# Synchronize the stream
stream.synchronize()
output_caption = tokens2description(outputs[0]['host'].tolist(), coco_tokens['idx2word_list'], sos_idx, eos_idx)
print(f"inference time = {time.time - start}")
print(output_caption)
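My own guess is that the problem is in the three lines that set the input values: they rebind 'host' to plain NumPy arrays, discarding the page-locked buffers allocated earlier, and np.array([0]) / np.array([sos_idx]) default to int64, so the byte count passed to cuMemcpyHtoDAsync may no longer match the device allocation (and the arrays are no longer page-locked, which I understand async copies need). Would something like this be the right fix (a sketch that copies into the existing buffers, casting to whatever dtype the engine reported)?

# Hypothetical fix: fill the existing page-locked buffers in place
np.copyto(inputs[0]['host'], image_1.numpy().ravel())
np.copyto(inputs[1]['host'], np.array([0], dtype=inputs[1]['host'].dtype))
np.copyto(inputs[2]['host'], np.array([sos_idx], dtype=inputs[2]['host'].dtype))

Or is the invalid-argument error pointing at something else entirely?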