Different outputs from the TensorFlow model and the TensorRT model.

I’m trying to run network inference using TensorRT on a Jetson Nano.
But the outputs of the TensorFlow model and of the TensorRT model are very different.

How can I get the correct output?

TensorFlow code:

import numpy as np
import cv2
import tensorflow as tf

from tensorflow.python.platform import gfile

# Read the frozen graph from disk. GFile replaces FastGFile, which is
# deprecated in TensorFlow 1.x; the file handle is released as soon as the
# bytes have been parsed.
with gfile.GFile("landmarks_5points.pb", 'rb') as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

with tf.Graph().as_default() as graph:
    # name='' imports the nodes under their original names, so the tensor
    # lookups below need no scope prefix.
    tf.import_graph_def(graph_def, name='')

# Bind the session to the imported graph explicitly instead of relying on
# whichever graph is the default when the session is constructed.
sess = tf.Session(graph=graph)

input_x = graph.get_tensor_by_name("input_image_tensor:0")
output = graph.get_tensor_by_name("logits/BiasAdd:0")

# All nodes of the graph definition and the subset whose op is 'Const'.
# Neither list is used further in the visible part of this script.
graph_nodes = list(graph_def.node)
wts = [n for n in graph_nodes if n.op == 'Const']

# Load the test image and pack it into a (1, 128, 128, 3) float32 batch.
# No scaling or mean subtraction is applied: pixel values keep their
# original 0..255 range, only the dtype changes.
image_path = "627.jpg"
image = cv2.imread(image_path)
if image is None:
    # cv2.imread reports a missing or undecodable file by returning None;
    # fail here with a clear message instead of inside cv2.resize.
    raise IOError("Could not read image: " + image_path)
image = cv2.resize(image, (128, 128))
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image = image.astype(np.float32)

# Assigning into a pre-shaped array also verifies that the image really is
# 128x128x3 (a mismatch raises instead of silently feeding a wrong shape).
input_data = np.empty((1, 128, 128, 3), dtype=np.float32)
input_data[0] = image

# Save the input data: every value followed by a single space, in the
# flattened (row-major) order of the array. The context manager closes the
# file even if a write fails.
with open("input_pb.txt", "w") as f:
    f.write("".join(str(value) + " " for value in input_data.ravel()))

# Run the graph. The input placeholder is fetched alongside the logits, so
# np_image holds the values the session actually received.
np_image, arr_output = sess.run([input_x, output],
                                feed_dict={input_x: input_data})

# Save the output data of the first (and only) batch element, using the
# same space-separated format as the input dump.
arr_output = arr_output[0]
with open("output_pb.txt", "w") as f:
    f.write("".join(str(value) + " " for value in arr_output.ravel()))

TensorRT code:

import time
import numpy as np
import os
import cv2

import graphsurgeon as gs
import tensorrt as trt 
import pycuda.driver as cuda
import pycuda.autoinit

class HostDeviceMem(object):
    """Pair of buffers: one kept as ``host``, the other as ``device``.

    The class only stores the two objects it is given and renders them as
    text; it performs no allocation or copying itself.
    """

    def __init__(self, host_mem, device_mem):
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        # Four lines: a "Host:" label, the host buffer, a "Device:" label
        # and the device buffer.
        pieces = ("Host:", str(self.host), "Device:", str(self.device))
        return "\n".join(pieces)

    def __repr__(self):
        # The debug representation is the same text as str().
        return str(self)

class ModelTRT:
    """Run inference with a serialized TensorRT engine (a ``.plan`` file).

    On construction the engine is deserialized and one page-locked host
    buffer plus one device buffer is allocated for every binding, sized for
    ``engine.max_batch_size`` items. ``predict`` reuses these buffers on
    every call.

    Attributes:
        runtime: the ``trt.Runtime`` used for deserialization.
        engine: the deserialized CUDA engine.
        inputs / outputs: ``HostDeviceMem`` pairs for the input and output
            bindings, in engine binding order.
        bindings: device addresses of all bindings, in engine binding order.
        stream: a CUDA stream; created here but not used by ``predict``,
            which issues synchronous copies and a synchronous execute.
        context: the execution context created from the engine.
    """

    def __init__(self,  plan_filename):
        """Load the engine from ``plan_filename`` and allocate all buffers.

        Raises:
            RuntimeError: if the plan file cannot be deserialized.
        """
        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
        self.runtime = trt.Runtime(TRT_LOGGER)

        with open(plan_filename, "rb") as f:
            self.engine = self.runtime.deserialize_cuda_engine(f.read())
        if self.engine is None:
            # Deserialization failures (e.g. a plan built by another
            # TensorRT version or for another GPU) yield None; report that
            # here rather than as a TypeError in the loop below.
            raise RuntimeError(
                "Could not deserialize TensorRT engine: " + str(plan_filename))

        self.inputs = []
        self.outputs = []
        self.bindings = []
        for binding in self.engine:
            size = (trt.volume(self.engine.get_binding_shape(binding))
                    * self.engine.max_batch_size)
            # Take the element type from the engine instead of assuming
            # float32, so the host buffer is interpreted the way the engine
            # reads and writes it.
            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            self.bindings.append(int(device_mem))
            # Append to the appropriate list.
            if self.engine.binding_is_input(binding):
                self.inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                self.outputs.append(HostDeviceMem(host_mem, device_mem))

        self.stream = cuda.Stream()
        self.context = self.engine.create_execution_context()

    def predict(self,  img):
        """Run the engine on ``img`` and return the output buffers.

        Args:
            img: array whose first dimension is the batch size. Its elements
                are copied in flattened (row-major) order into the first
                input binding, so the axis order of ``img`` must already
                match the layout the engine was built for.

        Returns:
            ``self.outputs``: the list of ``HostDeviceMem`` pairs whose
            ``host`` arrays now hold the results. The same buffers are
            overwritten by the next call; copy them if they must be kept.

        Raises:
            ValueError: if ``img`` has more elements than the input buffer.
        """
        flat = img.ravel()
        host_in = self.inputs[0].host
        if flat.size > host_in.size:
            raise ValueError(
                "Input has %d elements but the engine input buffer holds "
                "only %d" % (flat.size, host_in.size))
        # The buffer is sized for max_batch_size; fill only the leading part
        # so a smaller batch is accepted as well.
        np.copyto(host_in[:flat.size], flat)

        for inp in self.inputs:
            cuda.memcpy_htod(inp.device, inp.host)
        self.context.execute(batch_size=img.shape[0],
                             bindings=self.bindings)
        for out in self.outputs:
            cuda.memcpy_dtoh(out.host, out.device)

        return self.outputs

# Load the test image, convert it to RGB, resize it to 128x128 and pack it
# into a (1, 128, 128, 3) float32 batch. Pixel values keep their 0..255
# range; the uint8 -> float32 conversion happens on assignment below.
path = "627.jpg"

image = cv2.imread(path)
if image is None:
    # cv2.imread reports a missing or undecodable file by returning None;
    # fail here with a clear message instead of inside cv2.cvtColor.
    raise IOError("Could not read image: " + path)
# The third positional parameter of cvtColor is the destination array, not
# a channel count, so the former stray `3` is dropped.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image = cv2.resize(image, (128, 128))

models = ModelTRT('landmarks_5points.plan')

input_data = np.empty((1, 128, 128, 3), dtype=np.float32)
input_data[0] = image
# Keep a handle on the untransposed batch; the layout experiments further
# down derive every variant from it.
back_input_data = input_data

# Try the engine with four different axis orders of the same batch and dump
# the input and the output of every attempt to "<name>__<j>" files.
# Attempt 0 feeds the batch as prepared above; attempts 1-3 feed a
# transposed view of back_input_data.
# NOTE(review): which order the engine expects depends on how the plan was
# built (for example (0, 3, 1, 2) if it takes channels-first input) - confirm
# against the conversion step.
axis_orders = (None, (0, 2, 1, 3), (0, 3, 1, 2), (0, 3, 2, 1))

for j, axes in enumerate(axis_orders):
    if axes is not None:
        input_data = np.transpose(back_input_data, axes)

    # One value per line, each followed by a space. "\n" is used instead of
    # os.linesep because the file is opened in text mode, which already
    # translates "\n" to the platform line ending.
    with open("input_trt.txt" + "__" + str(j), "w") as f:
        f.write("".join(str(value) + " \n" for value in input_data.ravel()))

    output = models.predict(input_data)
    # Host-side array of the first output returned by predict().
    arr_output = output[0].host

    # Space-separated values on a single line.
    with open("output_trt.txt" + "__" + str(j), "w") as f:
        f.write("".join(str(value) + " " for value in arr_output.ravel()))

TensorFlow version: 1.13.0-rc0
TensorRT version: 5.1.6.1

Tensorflow pb file: https://drive.google.com/file/d/1Xmfs4Klgbg-IItamTRMqcoZIy4pXRb-c/view?usp=sharing

Tensorrt plan file: https://drive.google.com/file/d/14xlFH3MZJeGizbI5KCG9VJk7QYAfw_l8/view?usp=sharing

Image file: https://drive.google.com/file/d/1qGx6qNAlJ38dqAUWxyZgWIz6FH2-L4Ql/view?usp=sharing

These examples https://github.com/NVIDIA-AI-IOT/tf_to_trt_image_classification work correctly.