We cannot try a different JetPack version, since the device is already in production. I have already shared the model; please run it with random input and you will easily see the gap between the two outputs. This is on an AGX Xavier. The code I am using is below.
import numpy as np
import onnxruntime as ort
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # creates the CUDA context

def get_onnx_output(path, a):
    # Run the ONNX model with onnxruntime and return all outputs
    ort_sess = ort.InferenceSession(path)
    input_name = ort_sess.get_inputs()[0].name
    output_names = [i.name for i in ort_sess.get_outputs()]
    output = ort_sess.run(output_names, {input_name: a})
    return output
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
class TrtModel:
    def __init__(self, engine_path, input_shape=[512, 512], batch_size=1, max_batch_size=1, dtype=np.float32):
        self.input_shape = input_shape
        self.engine_path = engine_path
        self.batch_size = batch_size
        self.dtype = dtype
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.runtime = trt.Runtime(self.logger)
        self.engine = self.load_engine(self.runtime, self.engine_path)
        self.max_batch_size = max_batch_size
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()
        self.context = self.engine.create_execution_context()
        # self.context.set_binding_shape(batch_size, (1, 512, 512))
    @staticmethod
    def load_engine(trt_runtime, engine_path):
        # Deserialize a TensorRT engine from disk
        trt.init_libnvinfer_plugins(None, "")
        with open(engine_path, 'rb') as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine
    def allocate_buffers(self):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()

        for binding in self.engine:
            size = trt.volume(self.engine.get_binding_shape(binding))
            if self.engine.get_binding_shape(binding)[0] == -1:
                # Dynamic batch dimension: trt.volume() returns a negative
                # product, so flip the sign and scale by the intended batch size
                size = -1 * size * self.batch_size
            host_mem = cuda.pagelocked_empty(size, self.dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            if self.engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))

        return inputs, outputs, bindings, stream
    def __call__(self, x: np.ndarray, batch_size=1):
        x = x.astype(self.dtype)
        np.copyto(self.inputs[0].host, x.ravel())
        # Copy inputs to the device, run inference, then copy outputs back
        for inp in self.inputs:
            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
        self.context.execute_async(
            batch_size=batch_size, bindings=self.bindings, stream_handle=self.stream.handle)
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
        self.stream.synchronize()
        return [out.host.reshape(batch_size, -1) for out in self.outputs]
class TensorRTInfer:
    """
    Implements inference for the TensorRT engine.
    """

    def __init__(self, engine_path, batch_size=4):
        """
        :param engine_path: The path to the serialized engine to load from disk.
        """
        # Load TRT engine
        self.logger = trt.Logger(trt.Logger.ERROR)
        with open(engine_path, "rb") as f, trt.Runtime(self.logger) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()
        assert self.engine
        assert self.context

        # Setup I/O bindings
        self.inputs = []
        self.outputs = []
        self.allocations = []
        for i in range(self.engine.num_bindings):
            is_input = self.engine.binding_is_input(i)
            name = self.engine.get_binding_name(i)
            dtype = self.engine.get_binding_dtype(i)
            shape = self.engine.get_binding_shape(i)
            self.batch_size = batch_size
            # Byte size of this binding; a dynamic (-1) dim makes the product
            # negative, so take the absolute value before allocating
            size = np.dtype(trt.nptype(dtype)).itemsize
            for s in shape:
                size *= s
            if size < 0:
                size = size * -1
            allocation = cuda.mem_alloc(size)
            binding = {
                'index': i,
                'name': name,
                'dtype': np.dtype(trt.nptype(dtype)),
                'shape': list(shape),
                'allocation': allocation,
            }
            self.allocations.append(allocation)
            if is_input:
                self.inputs.append(binding)
            else:
                self.outputs.append(binding)

        assert self.batch_size > 0
        assert len(self.inputs) > 0
        assert len(self.outputs) > 0
        assert len(self.allocations) > 0
    def input_spec(self):
        # Shape and dtype of the first input binding
        return self.inputs[0]['shape'], self.inputs[0]['dtype']

    def output_spec(self):
        # Shape and dtype of the first output binding
        return self.outputs[0]['shape'], self.outputs[0]['dtype']
    def infer(self, batch, top=1):
        # Prepare the output data
        output = np.zeros(*self.output_spec())
        # Process I/O and execute the network
        cuda.memcpy_htod(self.inputs[0]['allocation'], np.ascontiguousarray(batch))
        self.context.execute_v2(self.allocations)
        cuda.memcpy_dtoh(output, self.outputs[0]['allocation'])
        return output
a = np.random.random((1, 1, 512, 512)).astype(np.float32)
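A minimal sketch of how the two backends can be compared on this random input; the file paths below are placeholders, and it assumes the first ONNX output corresponds to the first TensorRT output:

onnx_path = "model.onnx"      # placeholder, replace with the shared model
engine_path = "model.engine"  # placeholder, replace with the engine built on the Xavier

# Same random input through both backends
onnx_out = get_onnx_output(onnx_path, a)[0]
trt_out = TrtModel(engine_path)(a)[0].reshape(onnx_out.shape)

# A large max-abs difference here is the gap in question
print("max abs diff:", np.abs(onnx_out.astype(np.float32) - trt_out).max())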