Hi, Could you please check this?
The code below contains a very simple and basic network (2 conv + 1 fc) and its conversion to UFF. However, I am getting different results between Tensorflow and TensorRT. Where is wrong?
For information, unlike the provided MNIST example, in this use case, an input has 3 channels (just like usual color images). Should I transpose the input somehow (e.g., NCHW)?
from __future__ import print_function
import numpy as np
import tensorflow as tf
import tensorrt as trt
from tensorrt.parsers import uffparser
import uff
import pycuda.driver as cuda
import pycuda.autoinit
slim = tf.contrib.slim
# ---- define network
with tf.Graph().as_default():
image_placeholder = tf.placeholder(tf.float32, [None, 100, 100, 3])
net = slim.repeat(image_placeholder, 2, slim.conv2d, 64, [3, 3], scope='conv1')
net = slim.flatten(net)
net = slim.fully_connected(net, 5, scope='pred/label_fc1')
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
output_names = [net.op.name] # which is "pred/label_fc1/Relu"
graphdef = tf.get_default_graph().as_graph_def()
frozen_graph = tf.graph_util.convert_variables_to_constants(sess, graphdef, output_names)
frozen_graph = tf.graph_util.remove_training_nodes(frozen_graph)
# ---- model to uff
uff_model = uff.from_tensorflow(frozen_graph, output_names)
G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
parser = uffparser.create_uff_parser()
parser.register_input("Placeholder", (3,100,100), 0)
for output_name in output_names:
print('register output:', output_name)
parser.register_output(output_name)
engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser, 1, 1 << 20)
parser.destroy()
# ---- tensorflow inference
temp = np.random.rand(1, 100, 100, 3).astype(np.float32) # random input
tf_results = sess.run(net, feed_dict={image_placeholder:temp})
# ---- tensorRT inference
runtime = trt.infer.create_infer_runtime(G_LOGGER)
context = engine.create_execution_context()
tr_result = np.empty(5, dtype=np.float32)
d_input = cuda.mem_alloc(1 * temp.size * temp.dtype.itemsize)
d_labels = cuda.mem_alloc(1 * tr_result.size * tr_result.dtype.itemsize)
bindings = [int(d_input), int(d_labels)]
stream = cuda.Stream()
cuda.memcpy_htod_async(d_input, temp, stream)
context.enqueue(1, bindings, stream.handle, None)
cuda.memcpy_dtoh_async(tr_result, d_labels, stream)
stream.synchronize()
# ---- let's see
print("tensorflow result: ", tf_results[0])
print("tensorRT result: ", tr_result)
Result:
tensorflow result: [0. 0. 0. 0.09752206 0. ]
tensorRT result: [0. 0. 0.00887266 0.11281744 0. ]