For quick use, we think TensorRT inference is more efficient. Could you check the following code?
def infer(img, context, stream, device_in, device_out, host_in, host_out):
    bindings = [int(device_in), int(device_out)]
    # Copy the flattened image into the page-locked host buffer
    np.copyto(host_in, img.ravel())
    # Async copy to device, run the engine, copy the result back
    cuda.memcpy_htod_async(device_in, host_in, stream)
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_out, device_out, stream)
    # Wait for all queued work on the stream to finish
    stream.synchronize()
    return host_out
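With the buffers prepared in the setup below, the call is simply:
out = infer(img, context, stream, device_in, device_out, host_in, host_out)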
Before the inference itself, the project sets up the engine, context and buffers:
### Load TensorRT engine
trt_logger = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(trt_logger)
with open(ENGINE_PATH, "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())
### Prepare TRT execution context, CUDA stream and necessary buffers
context = engine.create_execution_context()
stream = cuda.Stream()
host_in = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=INPUT_DATA_TYPE)
host_out = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=INPUT_DATA_TYPE)
device_in = cuda.mem_alloc(host_in.nbytes)
device_out = cuda.mem_alloc(host_out.nbytes)
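One quick sanity check (a sketch, assuming the TensorRT 7/8-style bindings API already used above) is to print every binding, to confirm that binding 0 is really the input, binding 1 the output, and that the dtypes match INPUT_DATA_TYPE:
for i in range(engine.num_bindings):
    # name, shape, numpy dtype and direction of each binding
    print('Binding {}: name={}, shape={}, dtype={}, is_input={}'.format(
        i, engine.get_binding_name(i), engine.get_binding_shape(i),
        trt.nptype(engine.get_binding_dtype(i)), engine.binding_is_input(i)))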
And finally we use it for inference, but get a strange result.
The full code is here:
import numpy as np
import imageio
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # creates the CUDA context

ENGINE_PATH = 'final_trt_model.trt' # ADJUST
CLASSES = ['OK', 'NG1', 'NG2', 'NG3'] # ADJUST
CROP_SIZE = (3072, 2048) # ADJUST
INPUT_DATA_TYPE = np.float32 # ADJUST
MEASURE_TIME = True # ADJUST
CALC_VAL_ACCURACY = True # ADJUST
### Load TensorRT engine
trt_logger = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(trt_logger)
with open(ENGINE_PATH, "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())
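# deserialize_cuda_engine returns None on failure, so a quick check helps:
assert engine is not None, 'Failed to deserialize engine from {}'.format(ENGINE_PATH)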
### Prepare TRT execution context, CUDA stream and necessary buffers
context = engine.create_execution_context()
stream = cuda.Stream()
host_in = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=INPUT_DATA_TYPE)
host_out = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=INPUT_DATA_TYPE)
device_in = cuda.mem_alloc(host_in.nbytes)
device_out = cuda.mem_alloc(host_out.nbytes)
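Side note: host_out is allocated with INPUT_DATA_TYPE, but the output binding may use a different dtype; interpreting the buffer with the wrong type would look exactly like a strange result. A safer sketch, using trt.nptype to map the binding's TensorRT dtype to a numpy dtype:
host_out = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)),
                                 dtype=trt.nptype(engine.get_binding_dtype(1)))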
### Load and prepare input image
def prepare_image(img_in, crop_size):
    # img = utils.resize_and_crop(img_in, crop_size)
    img = img_in  # resize/crop currently disabled, crop_size unused
    img = img.astype(INPUT_DATA_TYPE)
    img = img.transpose(2, 0, 1)  # HWC -> CHW
    return img
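Note that prepare_image does no pixel normalization. If the network was trained on scaled inputs (an assumption to verify against the training pipeline), raw 0-255 values would also produce a strange result; in that case something like this would be needed:
img = img / 255.0  # hypothetical: only if training used [0, 1]-scaled inputs
# or, e.g., img = (img / 255.0 - mean) / std with the training mean/std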
INPUT_IMAGE_PATH = 'actest0001.jpg' # ADJUST
img = imageio.imread(INPUT_IMAGE_PATH, pilmode='RGB')
img = prepare_image(img, CROP_SIZE)
### Run inference
def infer(img):
    bindings = [int(device_in), int(device_out)]
    np.copyto(host_in, img.ravel())
    cuda.memcpy_htod_async(device_in, host_in, stream)
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_out, device_out, stream)
    stream.synchronize()
    return host_out
out = infer(img)
print('Input : {}'.format(INPUT_IMAGE_PATH))
print('Output: {}'.format(out))
print('Prediction: {}'.format(CLASSES[np.argmax(out)]))
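If a confidence score is wanted as well (assuming the engine outputs raw logits, i.e. no softmax layer was exported), a small sketch:
probs = np.exp(out - np.max(out))  # numerically stable softmax
probs /= probs.sum()
print('Confidence: {:.3f}'.format(probs[np.argmax(out)]))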
### Measure execution time
if MEASURE_TIME:
    import time
    TIMEIT_N_SKIP = 10 # ADJUST
    TIMEIT_N_RUN = 20 # ADJUST
    infer_time_arr = []
    # Warm-up runs, excluded from the measurement
    for _ in range(TIMEIT_N_SKIP):
        out = infer(img)
    for _ in range(TIMEIT_N_RUN):
        time_start = time.time()
        out = infer(img)
        infer_time_arr.append(time.time() - time_start)
    print('Inference time: {:.3f} +- {:.3f} ms (Avg over {} runs, {} skipped)'.format(
        np.mean(infer_time_arr)*1000.,
        np.std(infer_time_arr)*1000.,
        TIMEIT_N_RUN, TIMEIT_N_SKIP))
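Since infer ends with stream.synchronize(), this wall-clock timing is valid; an alternative is to time on the GPU side with CUDA events (a sketch using PyCUDA's Event API):
start, end = cuda.Event(), cuda.Event()
start.record(stream)
out = infer(img)
end.record(stream)
end.synchronize()
print('GPU time: {:.3f} ms'.format(start.time_till(end)))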