(TensorRT 5.0 RC, TensorFlow 1.11, Python 3.5.2, Ubuntu 14.04, CUDA 9.2). I have successfully converted the following all-convolutional model (after training) to UFF:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.InputLayer(input_shape=[INPUT_SIZE,INPUT_SIZE, 3]))
model.add(tf.keras.layers.Conv2D(input_shape = [INPUT_SIZE, INPUT_SIZE, 3], filters=256, kernel_size=3, padding="same", activation=tf.nn.relu))
model.add(tf.keras.layers.Conv2D(filters=128, kernel_size=3, padding="same", activation=tf.nn.relu))
model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=3, padding="same", activation=tf.nn.relu))
model.add(tf.keras.layers.Conv2D(filters=16, kernel_size=3, padding="same", activation=tf.nn.relu))
model.add(tf.keras.layers.Conv2D(filters=8, kernel_size=3, padding="same", activation=tf.nn.relu))
model.add(tf.keras.layers.Conv2D(filters=1, kernel_size=3, padding="same", activation=tf.nn.relu))
and constructed a TensorRT inference engine from it. However, when I test the engine with the same input repeated twice, i.e. two (128, 128, 3) images whose values are all equal to 0.5 (so the minibatch has shape (128, 128, 3, 2)), I get an output minibatch of shape (128, 128, 1, 2) containing two different inference results. In my particular example, the two output images agree on 96.61% of the entries, while the remaining entries differ by more than numerical precision.
It is possible that I am not doing the inference on minibatches correctly. It should be noted that inference on a single image (as in the original end-to-end MNIST example) works correctly for me when adapted to the network above, and the result matches Keras model.predict.
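For reference, this is roughly how I build the constant test minibatch and compute the agreement percentage (an illustrative sketch, not the exact script; INPUT_SIZE = 128 and MINIBATCH_SIZE = 2 are defined elsewhere):

import numpy as np

# Two identical (128, 128, 3) images, all values 0.5, stacked along the last axis.
batch = np.full((INPUT_SIZE, INPUT_SIZE, 3, MINIBATCH_SIZE), 0.5, dtype=np.float32)

def agreement(output):
    # output: host buffer of shape (INPUT_SIZE, INPUT_SIZE, 1, MINIBATCH_SIZE)
    # returned by do_inference (shown further down); compares the two output images.
    return np.isclose(output[..., 0], output[..., 1]).mean()

Calling agreement() on the (128, 128, 1, 2) host output is what gives the 96.61% figure quoted above.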
Here are the relevant code parts (based on NVIDIA's end-to-end MNIST sample):
class ModelData(object):
    MODEL_FILE = os.path.join(os.path.dirname(__file__), "models/all_conv.uff")
    INPUT_NAME = "input_1"
    INPUT_SHAPE = (3, INPUT_SIZE, INPUT_SIZE)  # CHW
    OUTPUT_NAME = "conv2d_5/Relu"  # "dense_1/Softmax"
def build_engine(model_file):
    # For more information on TRT basics, refer to the introductory samples.
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.UffParser() as parser:
        builder.max_workspace_size = common.GiB(1)
        builder.max_batch_size = MINIBATCH_SIZE
        # Parse the UFF network.
        parser.register_input(ModelData.INPUT_NAME, ModelData.INPUT_SHAPE)
        parser.register_output(ModelData.OUTPUT_NAME)
        parser.parse(model_file, network)
        # Build and return an engine.
        return builder.build_cuda_engine(network)
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        # size = trt.volume(engine.get_binding_shape(binding))
        size_in = (INPUT_SIZE, INPUT_SIZE, 3, MINIBATCH_SIZE)
        size_out = (INPUT_SIZE, INPUT_SIZE, 1, MINIBATCH_SIZE)
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers.
        host_mem_in = cuda.pagelocked_empty(size_in, dtype)
        host_mem_out = cuda.pagelocked_empty(size_out, dtype)
        device_mem = cuda.mem_alloc(host_mem_in.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem_in, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem_out, device_mem))
    return inputs, outputs, bindings, stream
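Before running inference, the pagelocked host input buffer returned by allocate_buffers is filled from the test batch, roughly like this (a sketch of that step, which is not shown above):

# inputs[0] is the HostDeviceMem pair for the input binding; its host buffer
# was allocated with shape (INPUT_SIZE, INPUT_SIZE, 3, MINIBATCH_SIZE).
np.copyto(inputs[0].host, batch)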
def do_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
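The driver that ties these pieces together follows the structure of the MNIST sample; roughly (a sketch, with main() and the print purely illustrative):

def main():
    batch = np.full((INPUT_SIZE, INPUT_SIZE, 3, MINIBATCH_SIZE), 0.5, dtype=np.float32)
    with build_engine(ModelData.MODEL_FILE) as engine:
        inputs, outputs, bindings, stream = allocate_buffers(engine)
        with engine.create_execution_context() as context:
            np.copyto(inputs[0].host, batch)  # fill the input buffer as above
            [output] = do_inference(context, bindings, inputs, outputs, stream)
            # output has shape (INPUT_SIZE, INPUT_SIZE, 1, MINIBATCH_SIZE); the two
            # images should be identical, but only ~96.61% of the entries agree.
            print("{:.2%} of the entries agree".format(
                np.isclose(output[..., 0], output[..., 1]).mean()))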