ResNet18: Batch size 1 works, but batch sizes 10 and 32 only show minor acceleration

I was using TensorRT to accelerate inference of ResNet18. When the batch size is set to 1, TensorRT is about 3x faster than PyTorch, but when the batch size is set to a larger number, the acceleration is far less significant.

Batch size 1, running 1000 inferences:

TensorRT cost time: 0.9361522197723389
PyTorch cost time: 2.2295029163360596

Batch size 10, running 1000 inferences:

TensorRT cost time: 3.9054813385009766
PyTorch cost time: 4.895483493804932

Batch size 32, running 1000 inferences:

TensorRT cost time: 10.404768943786621
PyTorch cost time: 13.346660375595093

Here is my code:

batch_size = 10
onnx_path = 'resnet18_v2.onnx'

import torchvision.models as models
import torch

resnet18 = models.resnet18(pretrained=True).cuda()

# Sanity check: list the parameter names of the loaded model.
for name, v in resnet18.named_parameters():
    print(name)

def export_onnx_model(model, input_shape, onnx_path, input_names=None, output_names=None, dynamic_axes=None):
    # Run a dummy forward pass, then trace the model to ONNX.
    inputs = torch.ones(*input_shape).cuda()
    model(inputs)
    torch.onnx.export(model, inputs, onnx_path, verbose=True, input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes)

# The batch dimension is baked into the exported graph,
# since dynamic_axes is left commented out.
input_shape = (batch_size, 3, 224, 224)
input_names = ['input']
output_names = ['output']
# dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
export_onnx_model(resnet18, input_shape, onnx_path, input_names, output_names)
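
As a quick sanity check (not part of my benchmark, just a sketch assuming the onnx Python package is installed), the exported file can be validated like this:

import onnx

# Load the exported graph and run the ONNX structural validator.
model_proto = onnx.load(onnx_path)
onnx.checker.check_model(model_proto)
# Print the input type to confirm the fixed batch dimension.
print(model_proto.graph.input[0].type.tensor_type.shape)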

import tensorrt as trt
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda 
import time

model_path = "resnet18_v2.onnx"
input_size = 224

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Build a TensorRT engine from an ONNX model.
def build_engine_onnx_batch(model_file, batch_size):
    with trt.Builder(TRT_LOGGER) as builder, \
        builder.create_network(1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) as network, \
        trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = 1 << 30
        # Note: max_batch_size is ignored for explicit-batch networks;
        # the batch dimension comes from the ONNX graph itself.
        builder.max_batch_size = batch_size
        # Load the ONNX model and parse it to populate the TensorRT network.
        with open(model_file, 'rb') as model:
            if not parser.parse(model.read()):
                print('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None
        return builder.build_cuda_engine(network)

engine = build_engine_onnx_batch(onnx_path, batch_size)
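
Incidentally, to avoid paying the build cost on every run, the engine can be cached on disk with TensorRT's serialize/deserialize API (a sketch; the file name 'resnet18_v2.trt' is my own choice):

# Serialize the built engine to disk once, then reload it with a Runtime.
with open('resnet18_v2.trt', 'wb') as f:
    f.write(engine.serialize())

with open('resnet18_v2.trt', 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())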


def alloc_buf(engine):
    # Host (pagelocked) buffers; with an explicit-batch engine the binding
    # shape already includes the batch dimension.
    h_in_size = trt.volume(engine.get_binding_shape(0))
    h_out_size = trt.volume(engine.get_binding_shape(1))
    h_in_dtype = trt.nptype(engine.get_binding_dtype(0))
    h_out_dtype = trt.nptype(engine.get_binding_dtype(1))
    in_cpu = cuda.pagelocked_empty(h_in_size, h_in_dtype)
    out_cpu = cuda.pagelocked_empty(h_out_size, h_out_dtype)
    # Device buffers of matching byte size.
    in_gpu = cuda.mem_alloc(in_cpu.nbytes)
    out_gpu = cuda.mem_alloc(out_cpu.nbytes)
    stream = cuda.Stream()
    return in_cpu, out_cpu, in_gpu, out_gpu, stream
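
For debugging, it can help to print the engine's bindings to confirm the shapes TensorRT actually compiled in (a small sketch using the standard binding-inspection calls):

# Inspect each binding: name, direction, shape, and dtype.
for i in range(engine.num_bindings):
    print(engine.get_binding_name(i), engine.binding_is_input(i),
          engine.get_binding_shape(i), engine.get_binding_dtype(i))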

def inference_naive(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream, batch_size):
    # Synchronous version: copy in, execute, copy out.
    cuda.memcpy_htod(in_gpu, inputs)
    # execute_v2 is the call for explicit-batch engines; execute(batch_size, ...)
    # belongs to the implicit-batch API. The batch size is baked into the engine.
    context.execute_v2([int(in_gpu), int(out_gpu)])
    cuda.memcpy_dtoh(out_cpu, out_gpu)
    return out_cpu
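
The stream argument above is currently unused; an asynchronous variant (a sketch, not what I benchmarked) would queue the copies and execution on that stream and wait once at the end:

def inference_async(context, inputs, out_cpu, in_gpu, out_gpu, stream):
    # Async version: enqueue copies and execution, then synchronize once.
    cuda.memcpy_htod_async(in_gpu, inputs, stream)
    context.execute_async_v2([int(in_gpu), int(out_gpu)], stream.handle)
    cuda.memcpy_dtoh_async(out_cpu, out_gpu, stream)
    stream.synchronize()
    return out_cpu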

inputs = np.random.random((batch_size, 3, input_size, input_size)).astype(np.float32)
context = engine.create_execution_context()
in_cpu, out_cpu, in_gpu, out_gpu, stream = alloc_buf(engine)
cal = []
# 1100 iterations; the first 100 are treated as warmup and excluded below.
for _ in range(1100):
    res = inference_naive(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream, batch_size)
    # print(res.shape)
    res = res.reshape((batch_size, -1))
    cal.append(time.time())
    # print(res.shape)
print("TensorRT cost time: ", time.time() - cal[100])

resnet18.eval()
cal = []
# Same scheme: 1100 iterations, first 100 excluded as warmup.
for _ in range(1100):
    outputs = resnet18(torch.Tensor(inputs).cuda())
    cal.append(time.time())
    # print(outputs.shape)
print("PyTorch cost time: ", time.time() - cal[100])

# Check that TensorRT and PyTorch produce (approximately) the same outputs.
print(np.allclose(res, outputs.detach().cpu().numpy(), rtol=1e-05, atol=1e-05))

My environment:

Linux: Ubuntu 18.04
GPU: RTX 2080 Ti
Driver Version: 440.59
CUDA Version: 10.2
CuDNN Version: 7.6
Python Version: 3.6.9
PyTorch Version: 1.4.0
TensorRT Version: 7.0.0.11

I am wondering whether this is the common case or whether my implementation is wrong. Any ideas would be appreciated.

Hi,

Could you please share the NVIDIA profiler output as well so we can help better?
You can use the NVIDIA Visual Profiler; please refer to the link below for more details:

I think it might be because all the data movement and the reshape operation are also included in the measurement, not just the inference time.
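
For example, a rough sketch of a fairer PyTorch-side measurement (a suggestion only; the helper name time_forward is mine) that synchronizes CUDA and disables autograd so only the forward passes are counted:

import time
import torch

def time_forward(model, x, iters=1000, warmup=100):
    # Warm up, then time only the forward passes, synchronizing the GPU
    # so asynchronous kernel launches are fully counted.
    model.eval()
    with torch.no_grad():
        for _ in range(warmup):
            model(x)
        torch.cuda.synchronize()
        t0 = time.time()
        for _ in range(iters):
            model(x)
        torch.cuda.synchronize()
    return time.time() - t0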

Thanks