I am using TensorRT to accelerate ResNet18 inference. As described, with a batch size of 1 TensorRT is roughly 2.4 times faster than PyTorch, but with larger batch sizes the speedup is much less significant.
Batch size 1, 1000 inferences:
TensorRT cost time: 0.9361522197723389
PyTorch cost time: 2.2295029163360596
Batch size 10, 1000 inferences:
TensorRT cost time: 3.9054813385009766
PyTorch cost time: 4.895483493804932
Batch size 32, 1000 inferences:
TensorRT cost time: 10.404768943786621
PyTorch cost time: 13.346660375595093
Here is my code:
import torch
import torchvision.models as models

# Batch size baked into the exported ONNX graph and used for every run below.
batch_size = 10
# File the ONNX model is exported to and the TensorRT engine is built from.
onnx_path = 'resnet18_v2.onnx'

# Load the pretrained network on the GPU and switch it to inference mode right
# away: in training mode every forward pass updates the BatchNorm running
# statistics, which would silently alter the weights before they are exported.
resnet18 = models.resnet18(pretrained=True).cuda().eval()

# List the names of all learnable parameters.
for name, _ in resnet18.named_parameters():
    print(name)
def export_onnx_model(model, input_shape, onnx_path, input_names=None, output_names=None, dynamic_axes=None):
    """Export ``model`` to an ONNX file, traced with an all-ones CUDA input.

    Args:
        model: The ``torch.nn.Module`` to export. It must already be on the
            GPU, because the dummy input is created on the GPU.
        input_shape: Shape of the dummy input, unpacked into ``torch.ones``.
        onnx_path: Destination path of the ONNX file.
        input_names: Optional names for the graph inputs.
        output_names: Optional names for the graph outputs.
        dynamic_axes: Optional dynamic-axes specification forwarded unchanged
            to ``torch.onnx.export``.

    The model's training/eval mode is restored before returning.
    """
    inputs = torch.ones(*input_shape).cuda()

    # Do the sanity-check forward pass and the export in eval mode. In
    # training mode the forward pass would update BatchNorm running statistics
    # (and apply dropout), changing the very weights that are being exported.
    was_training = model.training
    model.eval()
    try:
        # The forward pass only verifies that the model runs on this input;
        # no gradients are needed for it.
        with torch.no_grad():
            model(inputs)
        torch.onnx.export(
            model,
            inputs,
            onnx_path,
            verbose=True,
            input_names=input_names,
            output_names=output_names,
            dynamic_axes=dynamic_axes,
        )
    finally:
        # Leave the caller's model in the mode it was handed to us in.
        model.train(was_training)
# Export ResNet18 with a fixed batch dimension. The input and output tensors
# get explicit names so they can be identified in the ONNX graph.
input_names = ['input']
output_names = ['output']
input_shape = (batch_size, 3, 224, 224)
# Not used: a dynamic batch dimension could be requested with
# dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
export_onnx_model(
    resnet18,
    input_shape,
    onnx_path,
    input_names=input_names,
    output_names=output_names,
)
import tensorrt as trt
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import time
# Path of the ONNX model file.
# NOTE(review): not referenced in the visible code; the engine is built from
# `onnx_path` instead -- confirm whether this is still needed.
model_path = "resnet18_v2.onnx"
# Height and width of the square input images used for the random test batch.
input_size = 224
# Logger shared by the TensorRT builder and ONNX parser, created at WARNING severity.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# The Onnx path is used for Onnx models.
def build_engine_onnx_batch(model_file, batch_size):
    """Parse an ONNX file and build a TensorRT engine from it.

    Args:
        model_file: Path of the ONNX model to load.
        batch_size: Value assigned to the builder's ``max_batch_size``.

    Returns:
        The result of ``builder.build_cuda_engine``, or ``None`` when the ONNX
        file cannot be parsed (the parser errors are printed first).
    """
    # The network is created with an explicit batch dimension.
    explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(explicit_batch) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = 1 << 30
        builder.max_batch_size = batch_size

        # Read the serialized model, then let the parser populate the network.
        with open(model_file, 'rb') as model:
            serialized_onnx = model.read()

        if parser.parse(serialized_onnx):
            return builder.build_cuda_engine(network)

        # Parsing failed: report every recorded error and give up.
        print('ERROR: Failed to parse the ONNX file.')
        for error_index in range(parser.num_errors):
            print(parser.get_error(error_index))
        return None
# Build the TensorRT engine once, up front, from the exported ONNX file.
engine = build_engine_onnx_batch(onnx_path, batch_size)
if engine is None:
    # build_engine_onnx_batch returns None when the ONNX file cannot be
    # parsed. Stop here with a clear message instead of failing later with an
    # AttributeError on the first use of the engine.
    raise RuntimeError(
        'Failed to build a TensorRT engine from {!r}'.format(onnx_path)
    )
def alloc_buf(engine):
    """Allocate host and device buffers for bindings 0 and 1 of ``engine``.

    Binding 0 is used for the input buffers and binding 1 for the output
    buffers (assumes the engine has exactly that layout).

    Returns:
        A tuple ``(in_cpu, out_cpu, in_gpu, out_gpu, stream)``: two page-locked
        host arrays, two device allocations of matching byte size, and a new
        CUDA stream.
    """

    def pinned_host_buffer(binding):
        # Flat page-locked array sized and typed after the given binding.
        element_count = trt.volume(engine.get_binding_shape(binding))
        element_type = trt.nptype(engine.get_binding_dtype(binding))
        return cuda.pagelocked_empty(element_count, element_type)

    # Host (CPU) memory.
    in_cpu = pinned_host_buffer(0)
    out_cpu = pinned_host_buffer(1)

    # Device (GPU) memory with the same byte sizes as the host buffers.
    in_gpu = cuda.mem_alloc(in_cpu.nbytes)
    out_gpu = cuda.mem_alloc(out_cpu.nbytes)

    return in_cpu, out_cpu, in_gpu, out_gpu, cuda.Stream()
def inference_naive(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream, batch_size):
    """Run one synchronous inference and return the output host buffer.

    Args:
        engine: The TensorRT engine ``context`` was created from; used to
            decide which execute call matches the engine.
        context: Execution context of ``engine``.
        inputs: Host array copied to ``in_gpu`` before execution.
        out_cpu: Host buffer that receives the result.
        in_gpu: Device allocation for the input binding.
        out_gpu: Device allocation for the output binding.
        stream: Unused by this synchronous version; kept so the signature
            stays stable for callers.
        batch_size: Batch size passed to ``context.execute``; only used for
            engines with an implicit batch dimension.

    Returns:
        ``out_cpu``, filled with the data copied back from ``out_gpu``.
    """
    bindings = [int(in_gpu), int(out_gpu)]

    # Synchronous host -> device copy of the input batch.
    cuda.memcpy_htod(in_gpu, inputs)

    if engine.has_implicit_batch_dimension:
        context.execute(batch_size, bindings)
    else:
        # Engines built from an explicit-batch network carry the batch size in
        # their tensor shapes; the batch-size-taking execute() is meant for
        # implicit-batch engines only, so use execute_v2() here.
        context.execute_v2(bindings)

    # Synchronous device -> host copy of the result.
    cuda.memcpy_dtoh(out_cpu, out_gpu)
    return out_cpu
# Benchmark: time TIMED_RUNS inferences per backend after WARMUP_RUNS untimed
# warm-up inferences, then check that both backends agree on the result.
WARMUP_RUNS = 100
TIMED_RUNS = 1000

inputs = np.random.random((batch_size, 3, input_size, input_size)).astype(np.float32)
context = engine.create_execution_context()
in_cpu, out_cpu, in_gpu, out_gpu, stream = alloc_buf(engine)

# --- TensorRT ---
# inference_naive ends with a synchronous device-to-host copy, so the GPU work
# of each call has finished by the time it returns.
for _ in range(WARMUP_RUNS):
    inference_naive(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream, batch_size)
start = time.time()
for _ in range(TIMED_RUNS):
    res = inference_naive(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream, batch_size)
    res = res.reshape((batch_size, -1))
print("TensorRT cost time: ", time.time() - start)

# --- PyTorch ---
resnet18.eval()
# Inference only: skip autograd bookkeeping so the comparison is fair.
with torch.no_grad():
    for _ in range(WARMUP_RUNS):
        resnet18(torch.Tensor(inputs).cuda())
    # CUDA kernels are launched asynchronously; wait for outstanding work
    # before reading the clock so the timestamps bracket the real GPU time.
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(TIMED_RUNS):
        outputs = resnet18(torch.Tensor(inputs).cuda())
    torch.cuda.synchronize()
    print("PyTorch cost time: ", time.time() - start)

# Both backends processed the same input batch, so their outputs should match.
print(np.allclose(res, outputs.detach().cpu().numpy(), rtol=1e-05, atol=1e-05))
Environment details:
Linux: Ubuntu 18.04
GPU: RTX 2080 Ti
Driver Version: 440.59
CUDA Version: 10.2
CuDNN Version: 7.6
Python Version: 3.6.9
PyTorch Version: 1.4.0
TensorRT Version: 7.0.0.11