UFF inference time is larger than PB time when processing VGG-19

GPU: Tesla K80

I cannot upload the pb file. The model was downloaded from http://download.tensorflow.org/models/vgg_19_2016_08_28.tar.gz
and converted to a frozen graph with https://github.com/NVIDIA-AI-IOT/tf_to_trt_image_classification/blob/master/scripts/models_to_frozen_graphs.py.

When the batch size is 30, the pb inference time is less than the uff time, but when the batch size is 1, the uff time is much less than the pb time.
I wonder whether this is because of my own environment or some other reason? Thanks.


are you saying

  • TF inference is faster than TRT if batchsize is large?
  • TRT is faster than TF inference if batchsize is 1?

can you please share a small repro that demonstrates the performance difference?

NVIDIA Enterprise Support

When the batch size is 30, TF inference is faster than TRT.
I wonder whether this is expected or my own mistake?


this is unexpected. please share a small repro that demonstrates the performance difference?

DIR: TensorRT-

uff test code

# This sample uses a UFF MNIST model to create a TensorRT Inference Engine
from random import randint
from PIL import Image
import numpy as np

import pycuda.driver as cuda
# This import causes pycuda to automatically manage CUDA context creation and cleanup.
import pycuda.autoinit

import tensorrt as trt
import time

import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))
import common

# You can set the logger severity higher to suppress messages (or lower to display more messages).
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Number of images per inference call. It is used as the builder's
# max_batch_size, as the number of image copies placed in the input buffer,
# and as the batch_size argument passed to common.do_inference.
batch_size = 30

class ModelData(object):
    """Static description of the VGG-19 UFF model used by this benchmark."""

    # Path of the UFF model file, resolved relative to this script's directory.
    MODEL_FILE = os.path.join(os.path.dirname(__file__), "models/vgg_19.uff")

    # Name of the input tensor that is registered with the UFF parser.
    INPUT_NAME = "input"

    # Input dimensions; unpacked as (c, h, w) when the test image is resized.
    INPUT_SHAPE = (3, 224, 224)

    # Name of the output node (the TensorFlow script in this thread fetches
    # the same node as 'vgg_19/fc8/BiasAdd:0').
    OUTPUT_NAME = "vgg_19/fc8/BiasAdd"

    # TensorRT data type; converted with trt.nptype() when filling host buffers.
    DTYPE = trt.float32

def build_engine(model_file):
    """Parse a UFF model and build a TensorRT engine from it.

    Args:
        model_file: Path of the UFF file to parse.

    Returns:
        The engine returned by ``builder.build_cuda_engine``.

    Raises:
        RuntimeError: If the UFF parser reports that parsing failed.
    """
    # For more information on TRT basics, refer to the introductory samples.
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.UffParser() as parser:
        # The engine is built for batches of up to `batch_size` images.
        builder.max_batch_size = batch_size
        builder.max_workspace_size = common.GiB(8)
        # Parse the Uff Network. Both the input and the output node have to be
        # registered before parsing so the parser knows the network boundaries.
        parser.register_input(ModelData.INPUT_NAME, ModelData.INPUT_SHAPE)
        parser.register_output(ModelData.OUTPUT_NAME)
        # parse() signals failure through its return value; surface it here
        # instead of going on to build an engine from an incomplete network.
        if not parser.parse(model_file, network):
            raise RuntimeError("Failed to parse UFF model: %s" % model_file)
        # Build and return an engine.
        return builder.build_cuda_engine(network)

# Loads a test case into the provided pagelocked_buffer.
def load_normalized_test_case(data_path, pagelocked_buffer, case_num=randint(0, 9)):
    """Fill ``pagelocked_buffer`` with ``batch_size`` copies of ``./a.jpg``.

    The image is resized to the model's input size, transposed to CHW and
    cast to the model dtype. Despite the function name, pixel values are not
    rescaled or mean-subtracted.

    Args:
        data_path: Unused; kept so existing callers keep working.
        pagelocked_buffer: Flat host buffer that receives the batch. It must
            hold ``batch_size * c * h * w`` elements.
        case_num: Returned unchanged. NOTE: the default is evaluated once,
            when the function is defined, not on every call.

    Returns:
        ``case_num``.
    """
    dtype = trt.nptype(ModelData.DTYPE)

    def normalize_image(image):
        # Resize, antialias and transpose the image to CHW.
        c, h, w = ModelData.INPUT_SHAPE
        return np.asarray(image.resize((w, h), Image.ANTIALIAS)).transpose([2, 0, 1]).astype(dtype)

    test_case_path = "./a.jpg"
    img = normalize_image(Image.open(test_case_path))
    # Every slot of the batch holds the same image; only timing matters here.
    img_array = np.array([img] * batch_size, dtype=dtype)
    # Flatten the batch into a 1D array and copy it to pagelocked memory.
    np.copyto(pagelocked_buffer, img_array.ravel())
    return case_num

def main():
    """Build the engine, save it to disk and time inference in an endless loop.

    Each iteration prints the wall-clock duration, in seconds, of one
    ``common.do_inference`` call on a batch of ``batch_size`` images. The loop
    never terminates on its own; interrupt the process to stop it.
    """
#    data_path = common.find_sample_data(description="Runs an MNIST network using a UFF model file", subfolder="mnist")
    data_path = "/home/bjxiangboren/tools/TensorRT-"
    model_file = ModelData.MODEL_FILE

    # Alternative to rebuilding: load an engine saved by an earlier run.
#    with open("inception_batch.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
#        engine = runtime.deserialize_cuda_engine(f.read())
    with build_engine(model_file) as engine:
        # Persist the serialized engine so later runs can deserialize it
        # instead of building it again.
        with open("inception_batch.engine", "wb") as f:
            f.write(engine.serialize())
        # Allocate buffers and create a stream.
        # For more information on buffer allocation, refer to the introductory samples.
        inputs, outputs, bindings, stream = common.allocate_buffers(engine)
        with engine.create_execution_context() as context:
            case_num = load_normalized_test_case(data_path, pagelocked_buffer=inputs[0].host)
            # For more information on performing inference, refer to the introductory samples.
            # The common.do_inference function will return a list of outputs - we only have one in this case.
            # NOTE(review): the timed region is everything common.do_inference
            # does; presumably that includes host/device copies - confirm
            # before comparing against other measurements.
            while True:
                start_time = time.time()
                [output] = common.do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=batch_size)
                end_time = time.time()
                # Parenthesised form prints the same text on Python 2 and 3.
                print("time dis is %s" % (end_time - start_time))
#            output = output.reshape((30,1001))
#            print output
#            print output.shape
#            print np.argmax(output, axis=1)
#                pred = np.argmax(output)
#                print("Test Case: " + str(case_num))
#                print("Prediction: " + str(pred))

# Run the benchmark only when this file is executed as a script.
if __name__ == '__main__':
    main()

pb test code

import os
import sys
import numpy as np
import tensorflow as tf
from PIL import Image
import time

# Build a batch of 30 identical 224x224 images from the test picture.
test_case_path = "./a.jpg"
img = np.array(Image.open(test_case_path).resize((224,224)))
img_array = [img] * 30

frame_rgb = np.array(img_array, dtype=np.float32)

# Read the frozen graph; the with-block closes the file handle afterwards.
with open("data/frozen_graphs/vgg_19.pb", 'rb') as graph_file:
    graph_def = tf.GraphDef.FromString(graph_file.read())
_inception_graph = tf.Graph()
with _inception_graph.as_default():
    _ = tf.import_graph_def(graph_def, name='')
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.43)
    # The session is closed when the block exits, e.g. on Ctrl-C.
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session:
        # Time one session.run per iteration, in seconds; runs until interrupted.
        while True:
            start_time = time.time()
            predict_res = session.run('vgg_19/fc8/BiasAdd:0', feed_dict = {"input:0" : frame_rgb})
            end_time = time.time()
            # Parenthesised form prints the same text on Python 2 and 3.
            print("time dis is %s" % (end_time - start_time))
#    print predict_res
#    print np.argmax(predict_res, axis=1)

performance on different batchsize

| net | batch size | avg time on pb (s) | avg time on uff (s) |
|---|---|---|---|
| vgg_19 | 30 | 0.433251 | 0.544959 |
| vgg_19 | 10 | 0.253375 | 0.233249 |
| vgg_19 | 1 | 0.12613 | 0.0267076 |

net batchsize avg_time_on_pb avg_time_on_uff
vgg_19 30 0.433251 0.544959
vgg_19 10 0.253375 0.233249
vgg_19 1 0.12613 0.0267076

uff file:

pb file:


just to be complete. can you please share a.jpg used in the test?


The uff.py script requires an a.jpg test image. Can you share for repro completeness? Can you share the test file too?

  • Do you have access to another GPU? non-k80? do you see same result on the other GPU architectures?
  • You are using non-power-of-2 batchsizes, what if you used power of 2 batches?