TensorRT 8.5.2-1+cuda11.8: pycuda._driver.LogicError: cuMemcpyHtoDAsync failed: invalid argument

un.py:53: DeprecationWarning: Use get_tensor_shape instead.
  size = trt.volume(engine.get_binding_shape(binding)) * batch_size
run.py:54: DeprecationWarning: Use get_tensor_dtype instead.
  dtype = trt.nptype(engine.get_binding_dtype(binding))
run.py:61: DeprecationWarning: Use get_tensor_mode instead.
  if engine.binding_is_input(binding):
Traceback (most recent call last):
  File "run.py", line 71, in <module>
    cuda.memcpy_htod_async(inp['device'], inp['host'], stream)
pycuda._driver.LogicError: cuMemcpyHtoDAsync failed: invalid argument

The full code:

import torch
import numpy as np    
import pickle
import torchvision
from PIL import Image as PIL_Image
from utils.language_utils import tokens2description
import time
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

img_size = 384

with open('./demo_material/demo_coco_tokens.pickle', 'rb') as f:
    coco_tokens = pickle.load(f)
    sos_idx = coco_tokens['word2idx_dict'][coco_tokens['sos_str']]
    eos_idx = coco_tokens['word2idx_dict'][coco_tokens['eos_str']]

# Pre-Processing
def preprocess_image(image_path):
    transf_1 = torchvision.transforms.Compose([torchvision.transforms.Resize((img_size, img_size))])
    transf_2 = torchvision.transforms.Compose([torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                                                std=[0.229, 0.224, 0.225])])

    pil_image = PIL_Image.open(image_path)
    if pil_image.mode != 'RGB':
        pil_image = PIL_Image.new("RGB", pil_image.size)
    preprocess_pil_image = transf_1(pil_image)
    image = torchvision.transforms.ToTensor()(preprocess_pil_image)
    image = transf_2(image)
    return image.unsqueeze(0)

# Build TensorRT engine
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt_runtime = trt.Runtime(TRT_LOGGER)

def build_engine(model_path):
    with open(model_path, "rb") as f:
        engine_data = f.read()
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine

# we test the generalization of the graph by testing on two images
image_1 = preprocess_image('./demo_material/napoleon.jpg')
# generate optimized graph
print("Testing first image on TensorRT")
engine = build_engine('./trt_fp16.engine')
context = engine.create_execution_context()
batch_size = 1
inputs, outputs, bindings, stream = [], [], [], cuda.Stream()
for binding in engine:
    size = trt.volume(engine.get_binding_shape(binding)) * batch_size
    dtype = trt.nptype(engine.get_binding_dtype(binding))
    # Allocate host and device buffers
    host_mem = cuda.pagelocked_empty(size, dtype)
    device_mem = cuda.mem_alloc(host_mem.nbytes)
    # Append the device buffer to device bindings.
    # Append to the appropriate list.
    if engine.binding_is_input(binding):
        inputs.append({'host': host_mem, 'device': device_mem})
        outputs.append({'host': host_mem, 'device': device_mem})
# Set input values
inputs[0]['host'] = image_1.numpy().ravel()
inputs[1]['host'] = np.array([0])
inputs[2]['host'] = np.array([sos_idx])
# Transfer input data to the GPU.
for inp in inputs:
    cuda.memcpy_htod_async(inp['device'], inp['host'], stream)
# Execute model
start = time.time()
context.execute_async_v2(batch_size=batch_size,bindings=bindings, stream_handle=stream.handle)
# Transfer predictions back from the GPU.
for out in outputs:
    cuda.memcpy_dtoh_async(out['host'], out['device'], stream)
# Synchronize the stream
output_caption = tokens2description(outputs[0]['host'].tolist(), coco_tokens['idx2word_list'], sos_idx, eos_idx)
print(f"inference time = {time.time - start}")


This looks like a Jetson issue. Please refer to the below samples in case useful.

For any further assistance, we will move this post to to Jetson related forum.


Hi @NVES, Thank you for your response. The issue was resolved for the fp32 model, but I’m still not getting accurate results for the fp16 and int8 models. Is it possible that my model is not compatible with fp formats lower than fp32?models was converted using trtexec utlity.