If a TensorFlow graph is trained with NHWC channel order, the TensorRT result will be incorrect.
As shown in the following script,
if I set the following line
parser.register_input("input_1", (3, 50, 100), trt.UffInputOrder.NHWC)
the results do not match:
tensorflow result: [0.3743012 0.3759891 0.45476994 0.561791 0.5528796 ]
tensorRT result: [0.445955 0.2790466 0.38440198 0.5397778 0.7453178 ]
If I change it to:
parser.register_input("input_1", (3, 50, 100), trt.UffInputOrder.NCHW)
and change the input copy to
cuda.memcpy_htod_async(d_input, temp, stream)
the results still do not match.
The only trick that makes it work is to modify the original TensorFlow model and set it
to data_format='channels_first'.
This approach is not optimal, since training the original model takes a long time. I want to use the trained
model even if it is in NHWC format. There are also scenarios where we don't have access to the original training data.
test environment
- nvidia tensorflow docker
================
== TensorFlow ==
================
NVIDIA Release 18.11 (build 838556) - tensorrt is
root@ws3:/workspace# python -c "import tensorrt; print(tensorrt.__version__)"
5.0.2.6
pycuda-2018.1.1.tar.gz
======= sample script ======
# The __future__ import must be the first statement of the module; it makes
# print() a function on Python 2 so the script runs on both major versions.
from __future__ import print_function

import numpy as np
import tensorflow as tf
import tensorrt as trt
import uff
import pycuda.driver as cuda
# Imported only for its side effect (PyCUDA sets up a CUDA context on import).
import pycuda.autoinit  # noqa: F401
# ---- define network
# Build a small channels-last (NHWC) model: conv -> max-pool -> flatten ->
# sigmoid dense layer, run the variable initializer, then freeze the graph so
# that all variables become constants and it can be converted to UFF.
with tf.Graph().as_default():
    # NOTE: `input` shadows the builtin; the name is kept because later parts
    # of the script refer to this placeholder as `input`.
    input = tf.placeholder(tf.float32, [None, 50, 100, 3], name='input_1')
    x = tf.layers.Conv2D(10, 2, strides=1, padding='same', name='conv1')(input)
    x = tf.layers.MaxPooling2D(2, 2, padding='valid', name='pool1')(x)
    x = tf.layers.Flatten()(x)
    net = tf.layers.Dense(5, activation=tf.nn.sigmoid, name='out')(x)

    init = tf.global_variables_initializer()
    # The session is intentionally left open: it is reused further down to
    # compute the TensorFlow reference output.
    sess = tf.Session()
    sess.run(init)

    output_names = [net.op.name]
    graphdef = tf.get_default_graph().as_graph_def()
    frozen_graph = tf.graph_util.convert_variables_to_constants(sess, graphdef, output_names)
    frozen_graph = tf.graph_util.remove_training_nodes(frozen_graph)

    # ---- save to pb
    tf.io.write_graph(frozen_graph, './', 'toy.pb', as_text=False)
# ---- model to uff
# Convert the frozen TensorFlow graph to UFF and build a TensorRT engine from it.
uff_model = uff.from_tensorflow(frozen_graph, output_names)
G_LOGGER = trt.Logger(trt.Logger.Severity.ERROR)
with trt.Builder(G_LOGGER) as builder, builder.create_network() as network, trt.UffParser() as parser:
    builder.max_batch_size = 1
    # 1 GiB of builder workspace. The previous `1 ** 30` evaluated to 1.
    builder.max_workspace_size = 1 << 30

    # NOTE(review): the shape is given as (C, H, W) while the order flag is
    # NHWC, and the data fed later is NCHW. Confirm against the UffParser
    # documentation which layout the engine actually expects for this
    # combination -- this mismatch is the subject of the report.
    parser.register_input("input_1", (3, 50, 100), trt.UffInputOrder.NHWC)
    # parser.register_input("input_1", (100, 100, 3))
    print('register input:', input.op.name)
    for name in output_names:
        print('register output:', name)
        parser.register_output(name)

    # Fail loudly here instead of hitting a None engine further down.
    if not parser.parse_buffer(uff_model, network):
        raise RuntimeError('UFF parser failed to parse the model')
    engine = builder.build_cuda_engine(network)
    if engine is None:
        raise RuntimeError('TensorRT failed to build the CUDA engine')
# ---- random input and save to bin/npy
# Fixed seed so the TensorFlow and TensorRT runs see the same reproducible input.
np.random.seed(100)
temp = np.random.rand(1, 50, 100, 3).astype(np.float32)  # NHWC
# NCHW copy of the same data; made contiguous so it can be copied to the
# device as one flat buffer.
temp2 = np.ascontiguousarray(np.transpose(temp, (0, 3, 1, 2)))
print('input2 shape')
print(temp2.shape)
temp.tofile("in.bin")
np.save("in.npy", temp)

# ---- tensorflow inference
# Reference output, computed from the NHWC tensor the graph was defined with.
tf_results = sess.run(net, feed_dict={input: temp})
# ---- tensorRT inference
# Page-locked host buffers sized from the engine bindings (0 = input, 1 = output).
h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(trt.float32))
h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(trt.float32))
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
context = engine.create_execution_context()
bindings = [int(d_input), int(d_output)]
stream = cuda.Stream()

# Stage the input in the page-locked buffer before the async copy. Previously
# h_input was allocated but unused and the copy ran straight from a pageable
# array. np.copyto also raises if the element count does not match the
# engine's input binding.
np.copyto(h_input, temp2.ravel())
cuda.memcpy_htod_async(d_input, h_input, stream)
context.execute_async(1, bindings, stream.handle, None)
cuda.memcpy_dtoh_async(h_output, d_output, stream)
# Wait for the copies and the inference to finish before reading h_output.
stream.synchronize()
# ---- let's see
print("tensorflow result: ", tf_results[0])
print("tensorRT result: ", h_output)
# Compare with a tolerance: two different float32 implementations are not
# expected to agree bit-for-bit, so exact `==` reports False even when the
# results are numerically equivalent.
print(np.allclose(tf_results[0], h_output, rtol=1e-4, atol=1e-5))