I want to run inference with PyCUDA and TensorRT, but it does not work correctly.
If I set times=1, not all of the results match the ground truth. But if I set times=10, I get the correct results from the model.
I also tested the same model (same ONNX → TensorRT conversion) on a 2080 GPU, and there I get the correct result.
How can I get the correct result from a single inference run?
The code looks like this:
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
import time
# Module-wide TensorRT logger; it is handed to trt.Runtime when an engine
# file is deserialized.
TRT_LOGGER = trt.Logger()
# from tensorrt.common
class HostDeviceMem:
    """Pair a host-side buffer with the device allocation that mirrors it.

    ``host``/``device`` are the stored attributes. ``cpu``/``gpu`` are
    aliases implemented as properties so that they always refer to the same
    objects, even after ``host`` or ``device`` has been reassigned.

    Args:
        host_mem: The host-side buffer.
        device_mem: The matching device-side allocation.
    """

    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    @property
    def cpu(self):
        """Alias of ``host``."""
        return self.host

    @cpu.setter
    def cpu(self, value):
        self.host = value

    @property
    def gpu(self):
        """Alias of ``device``."""
        return self.device

    @gpu.setter
    def gpu(self, value):
        self.device = value

    def __str__(self):
        return f"Host: \n{self.host}\nDevice:\n{self.device}"

    def __repr__(self):
        return str(self)
class TRTEngine:
    """Run a serialized TensorRT engine through PyCUDA-managed buffers.

    One host/device buffer pair is allocated per engine binding at
    construction time and reused for every call. Calling the instance copies
    the given arrays into the input buffers, executes the engine, and returns
    the host-side output buffers.

    NOTE: the returned arrays are the engine's own output buffers; they are
    overwritten by the next call. Copy them if they must outlive it.
    """

    def __init__(self, engine_file_path, mode) -> None:
        """Deserialize the engine and allocate its I/O buffers.

        Args:
            engine_file_path: Path of the serialized TensorRT engine file.
            mode: ``"async"`` selects page-locked host buffers and
                stream-ordered execution; any other value selects the
                blocking path.

        Raises:
            RuntimeError: If the engine file could not be deserialized.
        """
        with open(engine_file_path, "rb") as f, trt.Runtime(
            TRT_LOGGER
        ) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        if self.engine is None:
            raise RuntimeError(
                f"failed to deserialize TensorRT engine: {engine_file_path}"
            )
        # allocate_buffers() reads self.mode, so it has to be set first.
        self.mode = mode
        self.context = self.engine.create_execution_context()
        (
            self.inputs,
            self.outputs,
            self.bindings,
            self.stream,
        ) = self.allocate_buffers(self.engine)

    def allocate_buffers(self, engine):
        """Allocate a host and a device buffer for every binding of *engine*.

        Args:
            engine: The deserialized engine whose bindings are iterated.

        Returns:
            A tuple ``(inputs, outputs, bindings, stream)``: two lists of
            ``HostDeviceMem``, the list of device addresses in binding
            order, and a new CUDA stream.

        Raises:
            ValueError: If a binding has a dynamic (negative) dimension, for
                which no fixed-size buffer can be allocated here.
        """
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in engine:
            shape = engine.get_binding_shape(binding)
            # Check each dimension: a product of two -1 dims would be positive.
            if any(dim < 0 for dim in shape):
                raise ValueError(
                    f"binding {binding!r} has dynamic shape {tuple(shape)}; "
                    "fixed-size buffers cannot be allocated for it"
                )
            size = trt.volume(shape)
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            if self.mode == "async":
                # The *_async copies need page-locked host memory, see
                # https://documen.tician.de/pycuda/driver.html?highlight=htod#pycuda.driver.memcpy_htod
                host_mem = cuda.pagelocked_empty(size, dtype)
            else:
                host_mem = np.zeros(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            if engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
                print(size, dtype)
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream

    def exec_sync(self, times):
        """Execute the engine *times* times with blocking copies and calls.

        Args:
            times: Number of back-to-back executions on the same inputs.

        Returns:
            The list of host output buffers, in binding order.

        Raises:
            RuntimeError: If TensorRT reports that an execution failed.
        """
        # cpu -> gpu
        for inp in self.inputs:
            cuda.memcpy_htod(inp.device, inp.host)
        for _ in range(times):
            # execute_v2 is the explicit-batch counterpart of the
            # execute_async_v2 call used by exec_async().
            if not self.context.execute_v2(bindings=self.bindings):
                raise RuntimeError("TensorRT execute_v2 reported a failure")
        # gpu -> cpu
        for out in self.outputs:
            cuda.memcpy_dtoh(out.host, out.device)
        return [out.host for out in self.outputs]

    # from tensorrt.common
    def exec_async(self, times=100):
        """Execute the engine *times* times on ``self.stream``.

        Copies and executions are enqueued on the stream, which is
        synchronized once before returning.

        Args:
            times: Number of back-to-back executions on the same inputs.

        Returns:
            The list of host output buffers, in binding order.

        Raises:
            RuntimeError: If TensorRT reports that an enqueue failed.
        """
        for inp in self.inputs:
            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
        for _ in range(times):
            enqueued = self.context.execute_async_v2(
                bindings=self.bindings, stream_handle=self.stream.handle
            )
            if not enqueued:
                raise RuntimeError(
                    "TensorRT execute_async_v2 reported a failure"
                )
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
        self.stream.synchronize()
        return [out.host for out in self.outputs]

    def __call__(self, inputs, times=1):
        """Copy *inputs* into the input buffers and run the engine.

        Args:
            inputs: One array per input binding, in binding order. Each is
                flattened and must hold exactly as many elements as the
                corresponding buffer.
            times: Number of executions to run before reading the outputs.

        Returns:
            The list of host output buffers, in binding order.

        Raises:
            ValueError: If the number of inputs or the element count of any
                input does not match the engine's input bindings.
        """
        if len(inputs) != len(self.inputs):
            raise ValueError(
                f"expected {len(self.inputs)} inputs, got {len(inputs)}"
            )
        for index, (buf, arr) in enumerate(zip(self.inputs, inputs)):
            flat = np.ravel(arr)
            # np.copyto would silently broadcast a size-1 array over the
            # whole buffer, so the element count is checked explicitly.
            if flat.size != buf.host.size:
                raise ValueError(
                    f"input {index} has {flat.size} elements, "
                    f"binding expects {buf.host.size}"
                )
            np.copyto(buf.host, flat)
        if self.mode == "async":
            return self.exec_async(times)
        return self.exec_sync(times)
def transfer_pickle_cuda_to_numpy(
    data_dir="/home/dev/gz/trt_test/transmot_batch_data_numsDT_20",
    count=20,
):
    """Rewrite the batch pickles so that every tensor becomes a NumPy array.

    For each ``i`` in ``range(count)`` the file
    ``batch_data_for_transmot_{i}.pickle`` in *data_dir* is loaded. The value
    stored under ``"matching"`` is kept as is; every other value is converted
    with ``.cpu().numpy()``. The result is written next to the source file
    with ``"new"`` appended to its name.

    Args:
        data_dir: Directory holding the source pickles.
        count: Number of consecutively numbered files to convert.
    """
    # pickle5 is a backport that only exists for old interpreters; the
    # standard-library module is used when it is not installed.
    try:
        import pickle5 as pickle
    except ImportError:
        import pickle

    for i in range(count):
        fname = f"{data_dir}/batch_data_for_transmot_{i}.pickle"
        # NOTE: unpickling runs arbitrary code; only load trusted files.
        with open(fname, "rb") as fp:
            loaded = pickle.load(fp)
        converted = {}
        for key, value in loaded.items():
            if key == "matching":
                converted[key] = value
            else:
                converted[key] = value.cpu().numpy()
        # The source file is already closed before the output is written.
        with open(fname + "new", "wb") as fp:
            pickle.dump(converted, fp)
        print("dump", i)
def compare_pth_trt(
    mode,
    times,
    engine_path="./wheeljack_20220411_epoch_260_numsDT_20.trt",
    data_dir="/home/dev/gz/trt_test/transmot_batch_data_numsDT_20",
    count=20,
):
    """Run the TensorRT engine on stored batches and print its matching.

    For each ``i`` in ``range(count)`` the converted pickle
    ``batch_data_for_transmot_{i}.picklenew`` is loaded, the five input
    arrays are cast to float32 and fed to the engine, and the matching built
    from the first output is printed next to the stored ``"matching"`` value
    together with timing figures.

    Args:
        mode: Passed to ``TRTEngine``; ``"async"`` selects the stream path.
        times: Number of engine executions per sample.
        engine_path: Path of the serialized engine file.
        data_dir: Directory holding the converted pickles.
        count: Number of consecutively numbered samples to process.
    """
    # pickle5 is a backport that only exists for old interpreters; the
    # standard-library module is used when it is not installed.
    try:
        import pickle5 as pickle
    except ImportError:
        import pickle

    net_trt = TRTEngine(
        engine_path,
        # ("./transmot_numsDT20.trt")
        mode=mode,
    )
    # Order matters: inputs are handed to the engine in this order.
    keys = (
        "tracklets_norm_xyz",
        "tracklets_norm_xyzs",
        "detection_norm_xyz",
        "tracklets_motion",
        "out_mask",
    )
    for i in range(count):
        sample_path = f"{data_dir}/batch_data_for_transmot_{i}.picklenew"
        # NOTE: unpickling runs arbitrary code; only load trusted files.
        with open(sample_path, "rb") as f:
            data = pickle.load(f)
        inputs = []
        for k in keys:
            inputs.append(data[k].astype("float32"))
            print(k, inputs[-1].shape, inputs[-1].dtype)
        # perf_counter is the monotonic clock intended for interval timing.
        _start = time.perf_counter()
        # inference start
        indices0 = net_trt(inputs, times=times)
        # Entries equal to -1 are dropped from the matching.
        matching = {
            row: col for row, col in enumerate(indices0[0]) if col != -1
        }
        # inference end
        _tot = 1000 * (time.perf_counter() - _start)
        print("diff:", matching, data["matching"])
        print(
            f"finished {times:d} times. total = {_tot:.2f} ms, "
            f"avg = {_tot / times:.2f} ms, qps = {1000 * times / _tot:.3f}."
        )
# Script entry point: runs the comparison on the async path with a single
# engine execution per sample. The pickle conversion step is left disabled.
if __name__ == "__main__":
    # transfer_pickle_cuda_to_numpy()
    compare_pth_trt(mode="async", times=1)
# 1. Whether sync or async is used, the results are only correct after exec has been run several times.
# 2. When run inside ppl, the following error is reported:
# [TensorRT] ERROR: 1: [slice.cu::launchNaiveSliceImpl::148] Error Code 1: Cuda Runtime (invalid resource handle)
# https://stackoverflow.com/questions/58369731/adding-multiple-inference-on-tensorrt-invalid-resource-handle-error