ResNet18: Batch size 1 works, but batch sizes 10 and 32 only show minor acceleration

I was using TensorRT to accelerate inference of ResNet18. When the batch size is set to 1, TensorRT is about 3x faster than PyTorch, but when the batch size is set to a larger number, the acceleration is far less significant.

Batch size 1, running 1000 inferences:

TensorRT cost time: 0.9361522197723389
PyTorch cost time: 2.2295029163360596

Batch size 10, running 1000 inferences:

TensorRT cost time: 3.9054813385009766
PyTorch cost time: 4.895483493804932

Batch size 32, running 1000 inferences:

TensorRT cost time: 10.404768943786621
PyTorch cost time: 13.346660375595093

Here is my code:

batch_size = 10
onnx_path = 'resnet18_v2.onnx'

import torchvision.models as models
import torch

resnet18 = models.resnet18(pretrained=True).cuda()

# Sanity check: list the parameter names of the loaded model.
for name, v in resnet18.named_parameters():
    print(name)

def export_onnx_model(model, input_shape, onnx_path, input_names=None, output_names=None, dynamic_axes=None):
    # Run a dummy forward pass, then trace the model to ONNX.
    inputs = torch.ones(*input_shape).cuda()
    model(inputs)
    torch.onnx.export(model, inputs, onnx_path, verbose=True, input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes)

# The batch dimension is baked into the exported graph,
# since dynamic_axes is left commented out.
input_shape = (batch_size, 3, 224, 224)
input_names = ['input']
output_names = ['output']
# dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
export_onnx_model(resnet18, input_shape, onnx_path, input_names, output_names)
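
As a quick sanity check (not part of my benchmark, just a sketch assuming the onnx Python package is installed), the exported file can be validated like this:

import onnx

# Load the exported graph and run the ONNX structural validator.
model_proto = onnx.load(onnx_path)
onnx.checker.check_model(model_proto)
# Print the input type to confirm the fixed batch dimension.
print(model_proto.graph.input[0].type.tensor_type.shape)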

import tensorrt as trt
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda 
import time

model_path = "resnet18_v2.onnx"
input_size = 224

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Build a TensorRT engine from an ONNX model.
def build_engine_onnx_batch(model_file, batch_size):
    with trt.Builder(TRT_LOGGER) as builder, \
        builder.create_network(1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) as network, \
        trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = 1 << 30
        # Note: max_batch_size is ignored for explicit-batch networks;
        # the batch dimension comes from the ONNX graph itself.
        builder.max_batch_size = batch_size
        # Load the ONNX model and parse it to populate the TensorRT network.
        with open(model_file, 'rb') as model:
            if not parser.parse(model.read()):
                print('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None
        return builder.build_cuda_engine(network)

engine = build_engine_onnx_batch(onnx_path, batch_size)
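
Incidentally, to avoid paying the build cost on every run, the engine can be cached on disk with TensorRT's serialize/deserialize API (a sketch; the file name 'resnet18_v2.trt' is my own choice):

# Serialize the built engine to disk once, then reload it with a Runtime.
with open('resnet18_v2.trt', 'wb') as f:
    f.write(engine.serialize())

with open('resnet18_v2.trt', 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())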


def alloc_buf(engine):
    # Host (pagelocked) buffers; with an explicit-batch engine the binding
    # shape already includes the batch dimension.
    h_in_size = trt.volume(engine.get_binding_shape(0))
    h_out_size = trt.volume(engine.get_binding_shape(1))
    h_in_dtype = trt.nptype(engine.get_binding_dtype(0))
    h_out_dtype = trt.nptype(engine.get_binding_dtype(1))
    in_cpu = cuda.pagelocked_empty(h_in_size, h_in_dtype)
    out_cpu = cuda.pagelocked_empty(h_out_size, h_out_dtype)
    # Device buffers of matching byte size.
    in_gpu = cuda.mem_alloc(in_cpu.nbytes)
    out_gpu = cuda.mem_alloc(out_cpu.nbytes)
    stream = cuda.Stream()
    return in_cpu, out_cpu, in_gpu, out_gpu, stream
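
For debugging, it can help to print the engine's bindings to confirm the shapes TensorRT actually compiled in (a small sketch using the standard binding-inspection calls):

# Inspect each binding: name, direction, shape, and dtype.
for i in range(engine.num_bindings):
    print(engine.get_binding_name(i), engine.binding_is_input(i),
          engine.get_binding_shape(i), engine.get_binding_dtype(i))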

def inference_naive(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream, batch_size):
    # Synchronous version: copy in, execute, copy out.
    cuda.memcpy_htod(in_gpu, inputs)
    # execute_v2 is the call for explicit-batch engines; execute(batch_size, ...)
    # belongs to the implicit-batch API. The batch size is baked into the engine.
    context.execute_v2([int(in_gpu), int(out_gpu)])
    cuda.memcpy_dtoh(out_cpu, out_gpu)
    return out_cpu
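
The stream argument above is currently unused; an asynchronous variant (a sketch, not what I benchmarked) would queue the copies and execution on that stream and wait once at the end:

def inference_async(context, inputs, out_cpu, in_gpu, out_gpu, stream):
    # Async version: enqueue copies and execution, then synchronize once.
    cuda.memcpy_htod_async(in_gpu, inputs, stream)
    context.execute_async_v2([int(in_gpu), int(out_gpu)], stream.handle)
    cuda.memcpy_dtoh_async(out_cpu, out_gpu, stream)
    stream.synchronize()
    return out_cpu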

inputs = np.random.random((batch_size, 3, input_size, input_size)).astype(np.float32)
context = engine.create_execution_context()
in_cpu, out_cpu, in_gpu, out_gpu, stream = alloc_buf(engine)
cal = []
# 1100 iterations; the first 100 are treated as warmup and excluded below.
for _ in range(1100):
    res = inference_naive(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream, batch_size)
    # print(res.shape)
    res = res.reshape((batch_size, -1))
    cal.append(time.time())
    # print(res.shape)
print("TensorRT cost time: ", time.time() - cal[100])

resnet18.eval()
cal = []
# Same scheme: 1100 iterations, first 100 excluded as warmup.
for _ in range(1100):
    outputs = resnet18(torch.Tensor(inputs).cuda())
    cal.append(time.time())
    # print(outputs.shape)
print("PyTorch cost time: ", time.time() - cal[100])

# Check that TensorRT and PyTorch produce (approximately) the same outputs.
print(np.allclose(res, outputs.detach().cpu().numpy(), rtol=1e-05, atol=1e-05))

My environment:

Linux: Ubuntu 18.04
GPU: RTX 2080 Ti
Driver Version: 440.59
CUDA Version: 10.2
CuDNN Version: 7.6
Python Version: 3.6.9
PyTorch Version: 1.4.0
TensorRT Version: 7.0.0.11

I am wondering whether this is the common case or whether my implementation is wrong. Any ideas would be appreciated.

Hi,

Could you please share the NVIDIA profiler output as well so we can help better?
You can use the NVIDIA Visual Profiler; please refer to the link below for more details:

I think it might be because all the data movement and the reshape operation are also included in the measurement, not just the inference time.
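
For example, a rough sketch of a fairer PyTorch-side measurement (a suggestion only; the helper name time_forward is mine) that synchronizes CUDA and disables autograd so only the forward passes are counted:

import time
import torch

def time_forward(model, x, iters=1000, warmup=100):
    # Warm up, then time only the forward passes, synchronizing the GPU
    # so asynchronous kernel launches are fully counted.
    model.eval()
    with torch.no_grad():
        for _ in range(warmup):
            model(x)
        torch.cuda.synchronize()
        t0 = time.time()
        for _ in range(iters):
            model(x)
        torch.cuda.synchronize()
    return time.time() - t0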

Thanks