TensorRT UFF parser register_input() cannot handle original graph in NHWC format

If the TensorFlow graph is trained with NHWC channel order, the TensorRT result is incorrect, as shown in the following script.

If I register the input with the following line:
parser.register_input("input_1", (3, 50, 100), trt.UffInputOrder.NHWC)
the results do not match:
tensorflow result: [0.3743012 0.3759891 0.45476994 0.561791 0.5528796 ]
tensorRT result: [0.445955 0.2790466 0.38440198 0.5397778 0.7453178 ]

If I change it to:
parser.register_input("input_1", (3, 50, 100), trt.UffInputOrder.NCHW)
and change the input copy to:
cuda.memcpy_htod_async(d_input, temp, stream)

the results still do not match.

The only workaround that works is to modify the original TensorFlow model and rebuild it with data_format='channels_first'.
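
For reference, a minimal sketch of that channels_first variant, assuming the same toy network as in the script below (only the placeholder shape and the data_format arguments change):

# Hypothetical channels_first rebuild of the toy network (NCHW input).
input = tf.placeholder(tf.float32, [None, 3, 50, 100], name='input_1')
x = tf.layers.Conv2D(10, 2, strides=1, padding='same',
                     data_format='channels_first', name='conv1')(input)
x = tf.layers.MaxPooling2D(2, 2, padding='valid',
                           data_format='channels_first', name='pool1')(x)
x = tf.layers.Flatten()(x)
net = tf.layers.Dense(5, activation=tf.nn.sigmoid, name='out')(x)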

That approach is not practical: retraining the original model takes a long time, and I want to reuse the already-trained model even if it is in NHWC format. In some scenarios we also have no access to the original training data.

Test environment:

  1. NVIDIA TensorFlow Docker container: NVIDIA Release 18.11 (build 838556)
  2. TensorRT:
     root@ws3:/workspace# python -c "import tensorrt; print(tensorrt.__version__)"
     5.0.2.6
  3. pycuda-2018.1.1.tar.gz

======= sample script ======

from __future__ import print_function
import numpy as np
import tensorflow as tf
import tensorrt as trt
import uff
import pycuda.driver as cuda
import pycuda.autoinit

# ---- define network
with tf.Graph().as_default():
    input = tf.placeholder(tf.float32, [None, 50, 100, 3], name='input_1')
    x = tf.layers.Conv2D(10, 2, strides=1, padding='same', name='conv1')(input)
    x = tf.layers.MaxPooling2D(2, 2, padding='valid', name='pool1')(x)
    x = tf.layers.Flatten()(x)
    net = tf.layers.Dense(5, activation=tf.nn.sigmoid, name='out')(x)

    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)

    output_names = [net.op.name]
    graphdef = tf.get_default_graph().as_graph_def()
    frozen_graph = tf.graph_util.convert_variables_to_constants(sess, graphdef, output_names)
    frozen_graph = tf.graph_util.remove_training_nodes(frozen_graph)

# ---- save to pb
tf.io.write_graph(frozen_graph, './', 'toy.pb', as_text=False)

# ---- model to uff
uff_model = uff.from_tensorflow(frozen_graph, output_names)
G_LOGGER = trt.Logger(trt.Logger.Severity.ERROR)
with trt.Builder(G_LOGGER) as builder, builder.create_network() as network, trt.UffParser() as parser:
    builder.max_batch_size = 1
    builder.max_workspace_size = 1 << 30  # 1 GiB workspace
    parser.register_input('input_1', (3, 50, 100), trt.UffInputOrder.NHWC)
    #parser.register_input('input_1', (100, 100, 3))
    print('register input:', input.op.name)
    for name in output_names:
        print('register output:', name)
        parser.register_output(name)
    parser.parse_buffer(uff_model, network)
    engine = builder.build_cuda_engine(network)

# ---- random input and save to bin/npy
np.random.seed(100)
temp = np.random.rand(1, 50, 100, 3).astype(np.float32)
temp2 = np.rollaxis(temp, -1, 1).copy()  # NHWC -> NCHW copy
print('input2 shape')
print(temp2.shape)
temp.tofile('in.bin')
np.save('in.npy', temp)

# ---- tensorflow inference
tf_results = sess.run(net, feed_dict={input: temp})

# ---- tensorRT inference
h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(trt.float32))
h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(trt.float32))
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)

context = engine.create_execution_context()
bindings = [int(d_input), int(d_output)]
stream = cuda.Stream()
cuda.memcpy_htod_async(d_input, temp2, stream)
context.execute_async(1, bindings, stream.handle, None)
cuda.memcpy_dtoh_async(h_output, d_output, stream)
stream.synchronize()

# ---- let's see
print("tensorflow result: ", tf_results[0])
print("tensorRT result: ", h_output)
# exact float equality is too strict across TF/TRT kernels; compare with a tolerance
print(np.allclose(tf_results[0], h_output, rtol=1e-5))

def save_engine(engine, engine_dest_path):
    buf = engine.serialize()
    with open(engine_dest_path, 'wb') as f:
        f.write(buf)

def load_engine(trt_runtime, engine_path):
    with open(engine_path, 'rb') as f:
        engine_data = f.read()
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine
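
For completeness, a hedged usage sketch of the two helpers above (trt.Runtime is TensorRT's deserialization entry point; the file name 'toy.engine' is arbitrary):

# Hypothetical round trip: serialize the built engine, then reload it.
save_engine(engine, 'toy.engine')
runtime = trt.Runtime(G_LOGGER)
engine2 = load_engine(runtime, 'toy.engine')
assert engine2 is not None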

That seems like it should be converting to NCHW. Can you share the model with me (DM me if you don’t want to share it publicly)?

The code to generate the model is already posted above.

Converting the original model to NCHW works. But sometimes we only have the trained parameter file, already in NHWC format, and we just want to deploy that model with TensorRT on GeForce hardware.
The TensorRT UFF parser documentation claims it can handle the NHWC format. If it cannot, we need to know about that limitation and whether there is a plan to address it.

Our engineers are looking at it. I'll let you know when I have more information. (Internal tracking ID: 2469593.)

It looks like there's an early fix being tested in engineering. It's still a long way from release, but it's moving forward.

Here’s the latest update from engineering:

There's an issue with the Flatten op in UFF, so if you use Reshape instead, it should work for you.
We're working on a fix for the Flatten layer, but hopefully this workaround (WAR) can unblock you for now.

import tensorflow as tf

def volume(it):
	vol = 1
	for elem in it:
		vol *= elem
	return vol

# ---- define network
with tf.Graph().as_default():
	input = tf.placeholder(tf.float32, [None, 50, 100, 3], name='input_1')
	x = tf.layers.Conv2D(10, 2, strides=1, padding='same', name='conv1')(input)
	x = tf.layers.MaxPooling2D(2, 2, padding='valid', name='pool1')(x)
	# Do flatten with a reshape.
	chw_vol = volume(x.shape[1:])
	x = tf.reshape(x, shape=(-1, chw_vol))
	net = tf.layers.Dense(5, activation=tf.nn.sigmoid, name='out')(x)

	init = tf.global_variables_initializer()
	sess = tf.Session()
	sess.run(init)

	output_names = [net.op.name]
	graphdef = tf.get_default_graph().as_graph_def()
	frozen_graph = tf.graph_util.convert_variables_to_constants(sess, graphdef, output_names)
	frozen_graph = tf.graph_util.remove_training_nodes(frozen_graph)

# ---- save to pb
tf.io.write_graph(frozen_graph, './', 'toy_no_flatten.pb', as_text=False)
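
If regenerating the graph is not possible (e.g. only the trained NHWC model file is available), one more idea, sketched under the assumption that the mismatch comes from Flatten emitting HWC order while TensorRT computes in CHW: permute the kernel of the first Dense layer after the Flatten offline, so it consumes CHW-flattened activations. hwc_to_chw_dense_kernel below is a hypothetical helper, not part of TensorRT or the UFF converter:

import numpy as np

def hwc_to_chw_dense_kernel(kernel, h, w, c):
    # kernel: (h*w*c, units), rows in TensorFlow's HWC-flattened order.
    units = kernel.shape[1]
    k = kernel.reshape(h, w, c, units)   # recover the (H, W, C, units) layout
    k = k.transpose(2, 0, 1, 3)          # reorder to (C, H, W, units)
    return k.reshape(h * w * c, units)   # rows now in CHW-flattened order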

It is still not working; here is the code we are using for testing.

from __future__ import print_function
import numpy as np
import tensorflow as tf
import tensorrt as trt
import uff
import pycuda.driver as cuda
import pycuda.autoinit

def volume(x):
    vol = 1
    for i in x:
        vol *= i
    return vol

# ---- define network
with tf.Graph().as_default():
    input = tf.placeholder(tf.float32, [None, 100, 100, 3], name='input_1')
    x = tf.layers.Conv2D(10, 2, strides=1, padding='same', name='conv1')(input)
    x = tf.layers.MaxPooling2D(2, 2, padding='valid', name='pool1')(x)
    #x = tf.layers.Flatten()(x)
    vol = volume(x.shape[1:])
    x = tf.reshape(x, shape=(-1, vol))
    net = tf.layers.Dense(5, activation=tf.nn.sigmoid, name='out')(x)

    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)

    output_names = [net.op.name]
    graphdef = tf.get_default_graph().as_graph_def()
    frozen_graph = tf.graph_util.convert_variables_to_constants(sess, graphdef, output_names)
    frozen_graph = tf.graph_util.remove_training_nodes(frozen_graph)

# ---- save to pb
# tf.io.write_graph(frozen_graph, './', 'toy.pb', as_text=False)

# ---- model to uff
uff_model = uff.from_tensorflow(frozen_graph, output_names)
G_LOGGER = trt.Logger(trt.Logger.Severity.ERROR)
with trt.Builder(G_LOGGER) as builder, builder.create_network() as network, trt.UffParser() as parser:
    builder.max_batch_size = 1
    builder.max_workspace_size = 1 << 30  # 1 GiB workspace
    parser.register_input('input_1', (3, 100, 100), trt.UffInputOrder.NHWC)
    print('register input:', input.op.name)
    for name in output_names:
        print('register output:', name)
        parser.register_output(name)
    parser.parse_buffer(uff_model, network)
    engine = builder.build_cuda_engine(network)

# ---- random input
np.random.seed(100)
temp = np.random.rand(1, 100, 100, 3).astype(np.float32)

# ---- tensorflow inference
tf_results = sess.run(net, feed_dict={input: temp})

# ---- tensorRT inference
h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(trt.float32))
h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(trt.float32))
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)

context = engine.create_execution_context()
bindings = [int(d_input), int(d_output)]
stream = cuda.Stream()
cuda.memcpy_htod_async(d_input, temp, stream)
context.execute_async(1, bindings, stream.handle, None)
cuda.memcpy_dtoh_async(h_output, d_output, stream)
stream.synchronize()

# ---- let's see
print("tensorflow result: ", tf_results[0])
print("tensorRT result: ", h_output)

Hello,

Engineering has made a fix ("Fixes Flatten op in UFF so that it transposes when necessary"), and it is scheduled for the next TensorRT release.
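
Once the release is out, a quick way to confirm which TensorRT version is installed before re-testing (the exact version containing the fix is not stated in this thread):

import tensorrt
print(tensorrt.__version__)  # 5.0.2.6, as tested above, does not contain the fix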