Description
I am getting an "invalid resource handle" error from TensorRT after removing the ONNX Runtime code from my pipeline.
Environment
I am working on a Jetson Xavier AGX.
Relevant Files
import os
import numpy as np
import tensorrt as trt
import onnxruntime
import pycuda.driver as cuda
import pycuda.autoinit
TRT_LOGGER = trt.Logger()
ses = onnxruntime.InferenceSession('Processing/models/***.onnx')
engine_file_path = "Processing/models/***.trt"
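For reference, the ONNX Runtime inference looked roughly like this before the switch (simplified; pre/post-processing is omitted and x_tensor is a placeholder for the preprocessed input):

input_name = ses.get_inputs()[0].name
# x_tensor: preprocessed image as a contiguous numpy array (placeholder, real preprocessing omitted)
ort_outputs = ses.run(None, {input_name: x_tensor})
pr_mask = np.array(ort_outputs[0]).squeeze().round()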
I used the code above to run inference while migrating from ONNX Runtime to TensorRT (it sits at the head of the inference code below).
Inference was working fine with the TensorRT model.
When I refactored the code and removed the InferenceSession, I started getting: [TensorRT] ERROR: …/rtSafe/cuda/reformat.cu (925) - Cuda Error in NCHWToNCHHW2: 400 (invalid resource handle)
When I put the InferenceSession line back, the code works fine. At the moment the inference runs in a thread, but I also tried it outside the thread and the problem persists.
I saw that this is a fairly common error, but I didn't find a solution that fits my case.
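To make the change concrete, the failure toggles with this single line (simplified):

# Works: the (now unused) ONNX Runtime session is still created at startup.
ses = onnxruntime.InferenceSession('Processing/models/***.onnx')

# Fails with the reformat.cu error above: the same line removed/commented out,
# with the TensorRT code below left unchanged.
# ses = onnxruntime.InferenceSession('Processing/models/***.onnx')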
Inference code:
TRT_LOGGER = trt.Logger()

def get_engine(onnx_file_path, engine_file_path=engine_file_path):
    """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it."""
    if os.path.exists(engine_file_path):
        # If a serialized engine exists, use it instead of building an engine.
        print("Reading engine from file {}".format(engine_file_path))
        with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

engine = get_engine(onnx_file_path, engine_file_path)
context = engine.create_execution_context()
trt_outputs = []
inputs, outputs, bindings, stream = allocate_buffers(engine)

def ***_segmentation(image, queue):
    x_tensor = np.expand_dims(image, axis=0)
    x_tensor = np.ascontiguousarray(x_tensor)
    # Do inference
    # Set host input to the image. The do_inference function will copy the input to the GPU before executing.
    inputs[0].host = x_tensor
    trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
    pr_mask = np.array(trt_outputs[0]).squeeze().round()
    queue.put(pr_mask)
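For completeness, this is roughly how the function is called at the moment (simplified; the real image pipeline is truncated and the names are placeholders):

import threading
import queue as std_queue

result_queue = std_queue.Queue()
# Placeholder input; the real preprocessing code is truncated.
image = np.zeros((3, 256, 256), dtype=np.float32)
worker = threading.Thread(target=***_segmentation, args=(image, result_queue))
worker.start()
worker.join()
pr_mask = result_queue.get()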
Can you give me some tips on why it works when the InferenceSession is present, and to what extent that could help me debug this?
PS: some parts of the code are truncated.