INT8 calibration of DeepLabv3+ fails with a dimension mismatch

Ubuntu 16.04
GPU: Nvidia 1080ti

Nvidia driver version: 384.130
Cuda: 9.0
Cudnn: 7
Python: 3.5
TensorFlow version: 1.9.0
TensorRT version: 4.0.1

My DeepLabv3+ frozen graph and calibration dataset are uploaded to Google Drive: https://drive.google.com/open?id=1sl6QI3aAQquHYGm06umYsCY6_sAc8UZR

problem description:

I want to quantize the DeepLabv3+ model to INT8. I have generated the calibration graph (deeplabv3_plus_TRTINT8Calib.pb), but when I run calibration on the dataset, importing that graph fails with a dimension mismatch. What is going wrong?

My complete script is:

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# !/bin/env python -tt
r""" TF-TensorRT integration sample script """

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
from tensorflow.python.ops import data_flow_ops
import tensorflow.contrib.tensorrt as trt

import numpy as np
import time
from tensorflow.python.platform import gfile
from tensorflow.python.client import timeline
import argparse, sys, itertools, datetime
import json
import utils.preprocessing as preprocessing
from utils.segmentation_metric import Evaluator
tf.logging.set_verbosity(tf.logging.INFO)
import scipy.misc as misc
#import cv2
import os

# Restrict CUDA to GPU 0 for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # selects a specific device
# Not referenced by the code visible in this file; presumably the image channel count.
IMAGE_CHN = 3
# Scale bounds handed to preprocessing.random_rescale_image_and_label in parse_fn.
_MIN_SCALE = 0.5
_MAX_SCALE = 2.0
# Spatial size parse_fn crops/pads to (training) or resizes to (evaluation).
_HEIGHT = 513
_WIDTH = 513
# Handed to preprocessing.random_crop_or_pad_image_and_label in parse_fn;
# presumably the label value given to padded pixels -- TODO confirm.
_IGNORE_LABEL = 255




def read_tensor_from_image_file(file_name, input_height=224, input_width=224,
                                input_mean=0, input_std=255):
    """Read one image file and return it as normalized NHWC and NCHW batches.

    The file content is decoded with ``tf.image.decode_png`` into 3 channels,
    cast to float32, given a leading batch axis, resized bilinearly to
    ``[input_height, input_width]`` and normalized as
    ``(pixel - input_mean) / input_std``.

    NOTE(review): the decode op is named 'jpg_reader' although the PNG decoder
    is used -- confirm which image format callers actually pass in.

    Args:
      file_name: path of the image file to read.
      input_height: height the image is resized to.
      input_width: width the image is resized to.
      input_mean: value subtracted from every pixel.
      input_std: value every pixel is divided by after the subtraction.

    Returns:
      The two-element list produced by ``sess.run``: the normalized image
      batch, followed by the same batch transposed with perm (0, 3, 1, 2).
    """
    input_name = "file_reader"
    file_reader = tf.read_file(file_name, input_name)
    image_reader = tf.image.decode_png(file_reader, channels=3,
                                       name='jpg_reader')
    float_caster = tf.cast(image_reader, tf.float32)
    dims_expander = tf.expand_dims(float_caster, 0)
    resized = tf.image.resize_bilinear(dims_expander, [input_height, input_width])
    normalized = tf.divide(tf.subtract(resized, [input_mean]), [input_std])
    session_config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.50))
    # The context manager closes the session (and frees what it holds) even if
    # run() raises; the previous `del sess` only dropped the Python reference.
    with tf.Session(config=session_config) as sess:
        result = sess.run([normalized, tf.transpose(normalized, perm=(0, 3, 1, 2))])

    return result


def parse_example_proto(example_serialized):
  """Decode one serialized example into its feature tensors.

  Args:
  * example_serialized: serialized example data

  Returns:
  * features: result of tf.parse_single_example for the schema below
  """

  def _bytes_scalar():
    # Scalar string feature that falls back to '' when the key is absent.
    return tf.FixedLenFeature([], dtype=tf.string, default_value='')

  def _int64_vector(length):
    # Fixed-length int64 feature without a default value.
    return tf.FixedLenFeature([length], dtype=tf.int64)

  schema = {
    'image/encoded': _bytes_scalar(),
    'image/shape': _int64_vector(3),
    'image/height': _int64_vector(1),
    'image/width': _int64_vector(1),
    'label/shape': _int64_vector(3),
    'image/segmentation/label': _bytes_scalar(),
  }
  return tf.parse_single_example(example_serialized, schema)

def parse_fn(example_serialized, is_train=False):
  """Parse an image and its segmentation label from one serialized example.

  The image and the label are both stored as raw uint8 bytes and are reshaped
  using the 'image/shape' and 'label/shape' features respectively.

  Args:
  * example_serialized: serialized example data
  * is_train: whether to apply the training pipeline (random rescale, random
    crop/pad and random flip from the `preprocessing` helpers) instead of the
    plain resize used for evaluation

  Returns:
  * image: float image tensor after mean_image_subtraction
  * label: segmentation label tensor matching the image
  """

  # Decode the stored features.
  features = parse_example_proto(example_serialized)
  # NOTE(review): height and width are computed but never used below.
  height = tf.cast(features['image/height'], tf.int32)
  width = tf.cast(features['image/width'], tf.int32)
  image_shape = tf.cast(features['image/shape'], tf.int32)
  label_shape = tf.cast(features['label/shape'], tf.int32)
  # Raw bytes -> uint8 -> reshaped to image/shape -> float.
  image = tf.to_float(tf.reshape(tf.decode_raw(features['image/encoded'], tf.uint8), shape=image_shape))

  # Raw bytes -> uint8 -> reshaped to the three entries of label/shape -> int32.
  label = tf.to_int32(tf.reshape(tf.decode_raw(features['image/segmentation/label'], tf.uint8),
                                 shape=[label_shape[0], label_shape[1], label_shape[2]]))


  if is_train:

    # Random rescale with a factor bounded by _MIN_SCALE / _MAX_SCALE
    # (behaviour of the helper is defined in utils.preprocessing).
    image, label = preprocessing.random_rescale_image_and_label(
      image, label, _MIN_SCALE, _MAX_SCALE)  # original note: the returned depth is 3

    # Randomly crop or pad a [_HEIGHT, _WIDTH] section of the image and label.
    image, label = preprocessing.random_crop_or_pad_image_and_label(
      image, label, _HEIGHT, _WIDTH, _IGNORE_LABEL)  # original note: the returned depth is 3

    # Randomly flip the image and label horizontally.
    image, label = preprocessing.random_flip_left_right_image_and_label(
      image, label)

    image = preprocessing.mean_image_subtraction(image)
    # Pin the static shapes to the fixed crop size.
    image.set_shape([_HEIGHT, _WIDTH, 3])
    label.set_shape([_HEIGHT, _WIDTH, 1])
  else:

    # Evaluation: resize straight to [_HEIGHT, _WIDTH]; bilinear for the image,
    # nearest-neighbour for the label.
    image = tf.image.resize_images(image, [_HEIGHT, _WIDTH],
                                method=tf.image.ResizeMethod.BILINEAR)
    label = tf.image.resize_images(label, [_HEIGHT, _WIDTH],
                                   method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
    # Only the channel counts are pinned here.
    image.set_shape([None, None, 3])
    label.set_shape([None, None, 1])
    # Translated original note: "the shape cannot be determined, so an error is
    # raised" -- presumably the reason set_shape is called before this line.
    image = preprocessing.mean_image_subtraction(image)
    label = label  # no-op, kept from the original

  return image, label


def updateGraphDef(fileName):
    """Re-export a serialized GraphDef file in place.

    The file is parsed, imported into a fresh graph with an empty name prefix,
    and that graph's GraphDef is written back over the same file. The default
    graph is reset along the way.

    Args:
      fileName: path of the serialized GraphDef to rewrite.
    """
    with gfile.FastGFile(fileName, 'rb') as source:
        parsed = tf.GraphDef()
        parsed.ParseFromString(source.read())

    tf.reset_default_graph()
    scratch_graph = tf.Graph()
    with scratch_graph.as_default():
        tf.import_graph_def(parsed, name="")
        with gfile.FastGFile(fileName, 'wb') as target:
            target.write(scratch_graph.as_graph_def().SerializeToString())


def getResnet_v2_101():
    """Load the frozen graph stored in "resnetV1_50_frozen.pb".

    NOTE(review): the function name says ResNet v2-101 while the file name
    says v1-50 -- confirm which model is intended.

    Returns:
      The GraphDef parsed from that file.
    """
    with gfile.FastGFile("resnetV1_50_frozen.pb", 'rb') as pb_file:
        frozen = tf.GraphDef()
        payload = pb_file.read()
        frozen.ParseFromString(payload)
    return frozen

def getInceptionV4():
    """Load the frozen graph stored in "inception_v4.pb".

    Per the original author's note, the input node is "input" and the output
    node is "scores".

    Returns:
      The GraphDef parsed from that file.
    """
    with gfile.FastGFile("inception_v4.pb", 'rb') as pb_file:
        model_def = tf.GraphDef()
        raw = pb_file.read()
        model_def.ParseFromString(raw)
    return model_def

def getDeeplabv3_plus():
    """Load the frozen DeepLabv3+ graph from its fixed /workspace path.

    Per the original author's note, the input node is "inputs_placeholder1"
    and the output node is "predictions".

    Returns:
      The GraphDef parsed from that file.
    """
    model_path = "/workspace/deeplabv3_plus_frozen_model_513.pb"
    with gfile.FastGFile(model_path, 'rb') as pb_file:
        deeplab_def = tf.GraphDef()
        deeplab_def.ParseFromString(pb_file.read())
    return deeplab_def


def printStats(graphName, timings, batch_size):
    """Print throughput and latency statistics for a series of batch timings.

    Two lines are printed: a human-readable summary and a comma-separated
    "RES" record. Nothing is printed when `timings` is None.

    Args:
      graphName: label placed in the "RES" record.
      timings: sequence of seconds-per-batch measurements, or None.
      batch_size: number of images per batch, used to derive images/s.
    """
    if timings is not None:
        per_batch = np.array(timings)
        throughput = batch_size / per_batch
        mean_time = np.mean(timings)
        mean_speed = batch_size / mean_time
        time_spread = np.std(timings)
        speed_spread = np.std(throughput)
        summary = (mean_speed, speed_spread, mean_time, time_spread)
        print("images/s : %.1f +/- %.1f, s/batch: %.5f +/- %.5f" % summary)
        print("RES, %s, %s, %.2f, %.2f, %.5f, %.5f" % ((graphName, batch_size) + summary))


def getFP32(batch_size=3, workspace_size=1 << 30):
    """Build the FP32 TF-TRT graph for DeepLabv3+ and save it to disk.

    Args:
      batch_size: passed as max_batch_size to the converter.
      workspace_size: passed as max_workspace_size_bytes to the converter.

    Returns:
      The converted graph, which is also written to
      "deeplabv3_plus_TRTFP32.pb".
    """
    frozen = getDeeplabv3_plus()
    converted = trt.create_inference_graph(
        frozen,
        ["predictions"],
        max_batch_size=batch_size,
        max_workspace_size_bytes=workspace_size,
        precision_mode="FP32",
    )
    with gfile.FastGFile("deeplabv3_plus_TRTFP32.pb", 'wb') as sink:
        sink.write(converted.SerializeToString())
    return converted


def getFP16(batch_size=3, workspace_size=1 << 30):
    """Build the FP16 TF-TRT graph for DeepLabv3+ and save it to disk.

    Args:
      batch_size: passed as max_batch_size to the converter.
      workspace_size: passed as max_workspace_size_bytes to the converter.

    Returns:
      The converted graph, which is also written to
      "deeplabv3_plus_TRTFP16.pb".
    """
    frozen = getDeeplabv3_plus()
    output_nodes = ["predictions"]
    converted = trt.create_inference_graph(
        frozen,
        output_nodes,
        max_batch_size=batch_size,
        max_workspace_size_bytes=workspace_size,
        precision_mode="FP16",
    )
    with gfile.FastGFile("deeplabv3_plus_TRTFP16.pb", 'wb') as sink:
        sink.write(converted.SerializeToString())
    return converted


def getINT8CalibGraph(batch_size=3, workspace_size=1 << 30):
    """Build the INT8 calibration graph for DeepLabv3+ and save it to disk.

    The returned graph is the one that has to be run on calibration data
    before it is handed to getINT8InferenceGraph.

    Args:
      batch_size: passed as max_batch_size to the converter.
      workspace_size: passed as max_workspace_size_bytes to the converter.

    Returns:
      The calibration graph, which is also written to
      "deeplabv3_plus_TRTINT8Calib.pb".
    """
    frozen = getDeeplabv3_plus()
    calibration_graph = trt.create_inference_graph(
        frozen,
        ["predictions"],
        max_batch_size=batch_size,
        max_workspace_size_bytes=workspace_size,
        precision_mode="INT8",
    )
    with gfile.FastGFile("deeplabv3_plus_TRTINT8Calib.pb", 'wb') as sink:
        sink.write(calibration_graph.SerializeToString())
    return calibration_graph


def getINT8InferenceGraph(calibGraph):
    """Turn a calibration graph into the INT8 inference graph and save it.

    Args:
      calibGraph: calibration graph as returned by getINT8CalibGraph.

    Returns:
      The inference graph, which is also written to
      "deeplabv3_plus_TRTINT8.pb".
    """
    inference_graph = trt.calib_graph_to_infer_graph(calibGraph)
    with gfile.FastGFile("deeplabv3_plus_TRTINT8.pb", 'wb') as sink:
        sink.write(inference_graph.SerializeToString())
    return inference_graph


def timeGraph(gdef, batch_size=1, num_loops=100, dummy_input=None, timelineName=None):
    """Run `gdef` on the validation TFRecord file, then time repeated runs.

    The graph is imported into a fresh tf.Graph and fed from a tf.data
    pipeline built on parse_fn. Before timing, one of two phases runs:

    * `timelineName` set: 20 fully traced runs whose merged Chrome trace is
      written to that file.
    * otherwise: 1449 runs whose predictions and labels are accumulated in an
      Evaluator(21), after which the mean IoU is printed.

    Args:
      gdef: GraphDef to import; its "predictions" op is fetched.
      batch_size: batch size of the input pipeline.
      num_loops: number of timing measurements; each covers 50 runs.
      dummy_input: replaced by random data when None, but never fed to the
        graph in this function.
      timelineName: optional path for the Chrome trace output.

    Returns:
      (timings, None, None, None), where timings holds the mean seconds per
      run for each of the `num_loops` measurements.
    """

    data_files = '/workspace/tensorrt_tf_tmp/val_pascalAug.tfrecords'
    tf.logging.info("Starting execution")
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.50)
    tf.reset_default_graph()
    g = tf.Graph()
    # NOTE(review): dummy_input is generated here but not used anywhere below.
    if dummy_input is None:
        dummy_input = np.random.random_sample((batch_size, 224, 224, 3))
    outlist = []
    labelist = []
    imagelist = []
    with g.as_default():

        # Input pipeline: parse and batch the records, repeat the file 3 times.
        dataset = tf.data.TFRecordDataset(data_files)
        dataset = dataset.apply(tf.contrib.data.map_and_batch(map_func=parse_fn, batch_size=batch_size, num_parallel_calls=3))  # parse the tfrecord file
        dataset = dataset.repeat(3)
        dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
        iterator = dataset.make_one_shot_iterator()

        next_element, labels = iterator.get_next()
        # NOTE(review): the comment on getDeeplabv3_plus names the input node
        # "inputs_placeholder1" -- confirm that "input" is the right key here.
        out = tf.import_graph_def(
            graph_def=gdef,
            input_map={"input": next_element},
            return_elements=["predictions"]
        )                                                           # returns the operations or tensors named in return_elements
        out = out[0].outputs[0]  # first output tensor of the "predictions" op
        outlist.append(out)
        labelist.append(labels)
        imagelist.append(next_element)  # collected but not read below

    timings = []
    root = '/workspace/'  # only referenced by the commented-out image dump below
    with tf.Session(graph=g, config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        # Translated original note: by default TensorBoard does not record the
        # run time and memory usage of each node, hence FULL_TRACE.
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()  # not used; rmArr holds the per-run metadata
        # for i in range(3):
        sess.run(tf.local_variables_initializer())
        #   img_name = 'img' + '_' + str(i) + '.jpg'
        #  lb_name = 'lb' + '_' + str(i) + '.png'
        #   img, lb = sess.run([next_element, labels])
        #   misc.imsave(os.path.join(root, img_name), img[0,:,:,:])
        #  misc.imsave(os.path.join(root, lb_name), lb[0,:,:,0])
        tf.logging.info("Starting Warmup cycle")

        def mergeTraceStr(mdarr):
            """Merge the traces of all [RunMetadata, (start, end)] pairs into one JSON string."""
            tl = timeline.Timeline(mdarr[0][0].step_stats)
            ctf = tl.generate_chrome_trace_format()
            Gtf = json.loads(ctf)
            deltat = mdarr[0][1][1]  # assigned but never read
            for md in mdarr[1:]:
                tl = timeline.Timeline(md[0].step_stats)  # create the timeline object
                ctf = tl.generate_chrome_trace_format()  # render it as Chrome-trace JSON
                tmp = json.loads(ctf)
                deltat = 0
                # Append this run's events to the first run's trace.
                Gtf["traceEvents"].extend(tmp["traceEvents"])
                deltat = md[1][1]

            return json.dumps(Gtf, indent=2)

        # One [RunMetadata, timestamps] slot per traced run.
        rmArr = [[tf.RunMetadata(), 0] for x in range(20)]
        if timelineName:
            # Start from a clean trace file.
            if gfile.Exists(timelineName):
                gfile.Remove(timelineName)
            ttot = int(0)
            tend = time.time()
            for i in range(20):
                tstart = time.time()
                valt = sess.run(outlist, options=run_options, run_metadata=rmArr[i][0])
                tend = time.time()
                # Start/end wall-clock time of this run, in microseconds.
                rmArr[i][1] = (int(tstart * 1.e6), int(tend * 1.e6))
            with gfile.FastGFile(timelineName, "a") as tlf:
                tlf.write(mergeTraceStr(rmArr))
        else:

            evaluator = Evaluator(21)
            evaluator.reset()

            # NOTE(review): 1449 is hard-coded; presumably the number of images
            # in the validation set -- confirm, and note that it counts batches.
            for i in range(1449):
                valt, lb = sess.run([outlist, labelist])
                #print(type(valt), type(lb))
                #print(i)
                #valt = np.argmax(valt, axis=3)
                evaluator.add_batch(lb[0], valt[0])

            mIoU = evaluator.Mean_Intersection_over_Union()
            print(" the mIoU of val set is:{}".format(mIoU))
        tf.logging.info("Warmup done. Starting real timing")

        # Each timing entry is the mean over num_iters consecutive runs.
        # NOTE(review): these runs keep consuming the same iterator, which is
        # bounded by repeat(3); a large num_loops may exhaust it -- confirm.
        num_iters = 50
        for i in range(num_loops):
            tstart = time.time()
            for k in range(num_iters):
                val = sess.run(outlist)
            timings.append((time.time() - tstart) / float(num_iters))
            print("iter ", i, " ", timings[-1])
        # comp = sess.run(tf.reduce_all(tf.equal(val[0], valt[0])))  #
        # print("Comparison=", comp)
        sess.close()  # explicit close inside the with block, kept from the original
        tf.logging.info("Timing loop done!")
        return timings, None, None, None


def score(nat, trt, topN=5):
    """Compare the top-N predictions of the native and the TRT outputs.

    Args:
      nat: 2-D array of native scores.
      trt: 2-D array of TRT scores.
      topN: number of highest-scoring columns considered per row.

    Returns:
      A pair: whether the (ascending) top-N index arrays are exactly equal,
      and the closeness value computed by howClose.
    """
    native_top = np.argsort(nat)[:, -topN:]
    trt_top = np.argsort(trt)[:, -topN:]
    identical = np.array_equal(native_top, trt_top)
    closeness = howClose(nat, trt, topN)
    return identical, closeness


def topX(arr, X):
    """Return the X largest entries of every row of a 2-D array, largest first.

    Args:
      arr: 2-D numpy array.
      X: number of entries to keep per row. Values larger than the number of
        columns keep every column; 0 yields empty results.

    Returns:
      A pair (values, indices), both of shape (rows, min(X, columns)), where
      indices[r] are the column indices of the kept entries of row r and
      values[r] the corresponding entries.
    """
    # Reverse the ascending order first and then truncate: slicing with
    # ``[:, -X:]`` would return every column when X == 0.
    ind = np.argsort(arr)[:, ::-1][:, :X]
    return arr[np.arange(np.shape(arr)[0])[:, np.newaxis], ind], ind


def howClose(arr1, arr2, X):
    """Score how similar the top-X rankings of the first rows of two arrays are.

    For each of the X rank positions, an index found at the same position in
    both rankings adds 1; an index found in the second ranking less than two
    positions away adds 0.5. Only row 0 of each array is compared.

    Args:
      arr1: 2-D array of reference scores.
      arr2: 2-D array of scores to compare against the reference.
      X: number of top entries to compare.

    Returns:
      The accumulated score divided by X.
    """
    _, ranking1 = topX(arr1, X)
    _, ranking2 = topX(arr2, X)
    total = 0.
    for position in range(X):
        first_row1 = ranking1[0]
        first_row2 = ranking2[0]
        wanted = first_row1[position]
        if wanted == first_row2[position]:
            total += 1
            continue
        # Positions in the second ranking that hold the same index.
        hits = np.where(first_row2 == wanted)[0]
        if hits.shape[0] and np.abs(hits[0] - position) < 2:
            total += 0.5
    return total / X


def getLabels(labels, ids):
    """Look up the label for each zero-based id.

    Args:
      labels: mapping whose keys are one-based ids written as strings.
      ids: iterable of zero-based integer ids.

    Returns:
      List of the labels for `ids`, in the same order.
    """
    names = []
    for zero_based in ids:
        key = str(zero_based + 1)
        names.append(labels[key])
    return names


# Script entry point: parse the flags, then build and time every requested
# variant of the DeepLabv3+ graph (native, TRT FP32, TRT FP16, TRT INT8).
# NOTE(review): this is a substring test on __name__, not the usual equality test.
if "__main__" in __name__:
    P = argparse.ArgumentParser(prog="test")
    P.add_argument('--FP32', action='store_true')
    P.add_argument('--FP16', action='store_true')
    P.add_argument('--INT8', action='store_true')
    P.add_argument('--native', action='store_true')
    P.add_argument('--num_loops', type=int, default=20)
    P.add_argument('--topN', type=int, default=10)
    P.add_argument('--batch_size', type=int, default=1)
    P.add_argument('--dump_diff', action='store_true')
    P.add_argument('--with_timeline', action='store_true')
    P.add_argument('--workspace_size', type=int, default=1 << 10, help="workspace size in MB")
    P.add_argument('--update_graphdef', action='store_true')

    # Unknown arguments are tolerated and ignored.
    f, unparsed = P.parse_known_args()
    print(f)
    # Placeholders for per-mode outputs; timeGraph returns None in those slots.
    valnative = None
    valfp32 = None
    valfp16 = None
    valint8 = None
    res = [None, None, None, None]
    print("Starting at", datetime.datetime.now())
    if f.update_graphdef:
        # Re-export the frozen model file in place before using it.
        updateGraphDef("/workspace/deeplabv3_plus_frozen_model_513.pb")  #
    # dummy_input = np.random.random_sample((f.batch_size, 224, 224, 3))
    # with open("labellist.json", "r") as lf:
    #     labels = json.load(lf)
    # imageName = "grace_hopper.jpg"
    # t = read_tensor_from_image_file(imageName,
    #                                 input_height=224,
    #                                 input_width=224,
    #                                 input_mean=0,
    #                                 input_std=1.0)  # return a [image_tensor, transposed image_tensor]
    # tshape = list(t[0].shape)
    # tshape[0] = f.batch_size
    # tnhwcbatch = np.tile(t[0], (f.batch_size, 1, 1, 1))
    dummy_input = None
    # --workspace_size is given in MB; shift by 20 bits to get bytes.
    wsize = f.workspace_size << 20
    timelineName = None
    if f.native:
        # Unconverted frozen graph.
        if f.with_timeline: timelineName = "NativeTimeline.json"
        timings, comp, valnative, mdstats = timeGraph(getDeeplabv3_plus(), f.batch_size,
                                                      f.num_loops, dummy_input, timelineName)
        printStats("Native", timings, f.batch_size)
        # mdstats is None (see timeGraph's return), so this call prints nothing.
        printStats("NativeRS", mdstats, f.batch_size)  #
        # print()
    if f.FP32:
        if f.with_timeline: timelineName = "FP32Timeline.json"
        timings, comp, valfp32, mdstats = timeGraph(getFP32(f.batch_size, wsize), f.batch_size, f.num_loops,
                                                    dummy_input, timelineName)
        printStats("TRT-FP32", timings, f.batch_size)
        printStats("TRT-FP32RS", mdstats, f.batch_size)
    if f.FP16:
        k = 0  # not used
        if f.with_timeline: timelineName = "FP16Timeline.json"
        timings, comp, valfp16, mdstats = timeGraph(getFP16(f.batch_size, wsize), f.batch_size,
                                                    f.num_loops, dummy_input, timelineName)
        printStats("TRT-FP16", timings, f.batch_size)
        printStats("TRT-FP16RS", mdstats, f.batch_size)
    if f.INT8:
        # INT8 is a two-step flow: run the calibration graph on data first,
        # then convert it into the inference graph that is actually timed.
        calibGraph = getINT8CalibGraph(f.batch_size, wsize)
        print("Running Calibration")
        # Calibration pass: a single timing loop and no timeline.
        timings, comp, _, mdstats = timeGraph(calibGraph, f.batch_size, 1, dummy_input)
        print("Creating inference graph")
        int8Graph = getINT8InferenceGraph(calibGraph)
        del calibGraph
        if f.with_timeline: timelineName = "INT8Timeline.json"
        timings, comp, valint8, mdstats = timeGraph(int8Graph, f.batch_size,
                                                    f.num_loops, dummy_input, timelineName)  # dummy
        printStats("TRT-INT8", timings, f.batch_size)
        printStats("TRT-INT8RS", mdstats, f.batch_size)
    # Collected for the commented-out label report below; not read otherwise.
    vals = [valnative, valfp32, valfp16, valint8]
    enabled = [(f.native, "native", valnative),
               (f.FP32, "FP32", valfp32),
               (f.FP16, "FP16", valfp16),
               (f.INT8, "INT8", valint8)]
    print("Done timing", datetime.datetime.now())

    ##
    # for i in enabled:
    #     if i[0]:
    #         print(i[1], getLabels(labels, topX(i[2], f.topN)[1][0]))

    sys.exit(0)

Below are the runtime logs:

2019-05-06 02:02:55.396641: I tensorflow/core/grappler/devices.cc:51] Number of eligible GPUs (core count >= 8): 1
2019-05-06 02:02:58.289008: I tensorflow/contrib/tensorrt/convert/convert_graph.cc:438] MULTIPLE tensorrt candidate conversion: 12
2019-05-06 02:02:58.291838: E tensorflow/contrib/tensorrt/log/trt_logger.cc:38] DefaultLogger Parameter check failed at: ../builder/Network.cpp::addInput::364, condition: isValidDims(dims)
2019-05-06 02:02:58.291866: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:507] subgraph conversion error for subgraph_index:0 due to: "Invalid argument: Failed to create Input layer" SKIPPING......( 26 nodes)
2019-05-06 02:02:58.295488: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3198] Max batch size= 1 max workspace size= 6114703
2019-05-06 02:02:58.295517: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3212] finished op preparation
2019-05-06 02:02:58.295538: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3220] OK
2019-05-06 02:02:58.295548: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3221] finished op building
2019-05-06 02:02:58.297883: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3198] Max batch size= 1 max workspace size= 6114703
2019-05-06 02:02:58.297906: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3212] finished op preparation
2019-05-06 02:02:58.297925: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3220] OK
2019-05-06 02:02:58.297932: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3221] finished op building
2019-05-06 02:02:58.300170: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3198] Max batch size= 1 max workspace size= 6114703
2019-05-06 02:02:58.300193: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3212] finished op preparation
2019-05-06 02:02:58.300211: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3220] OK
2019-05-06 02:02:58.300219: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3221] finished op building
2019-05-06 02:02:58.302386: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3198] Max batch size= 1 max workspace size= 6114703
2019-05-06 02:02:58.302409: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3212] finished op preparation
2019-05-06 02:02:58.302427: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3220] OK
2019-05-06 02:02:58.302434: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3221] finished op building
2019-05-06 02:02:58.310410: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:507] subgraph conversion error for subgraph_index:5 due to: "Invalid argument: Output node 'aspp/concat-5-LayoutOptimizer' is weights not tensor" SKIPPING......( 16 nodes)
2019-05-06 02:02:58.351016: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3198] Max batch size= 1 max workspace size= 25681752
2019-05-06 02:02:58.351078: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3212] finished op preparation
2019-05-06 02:02:58.351129: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3220] OK
2019-05-06 02:02:58.351138: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3221] finished op building
2019-05-06 02:02:58.366046: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3198] Max batch size= 1 max workspace size= 17121168
2019-05-06 02:02:58.366088: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3212] finished op preparation
2019-05-06 02:02:58.366126: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3220] OK
2019-05-06 02:02:58.366135: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3221] finished op building
2019-05-06 02:02:58.386840: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3198] Max batch size= 1 max workspace size= 20789990
2019-05-06 02:02:58.386890: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3212] finished op preparation
2019-05-06 02:02:58.386921: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3220] OK
2019-05-06 02:02:58.386929: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3221] finished op building
2019-05-06 02:02:58.389213: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3198] Max batch size= 1 max workspace size= 6114703
2019-05-06 02:02:58.389243: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3212] finished op preparation
2019-05-06 02:02:58.389265: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3220] OK
2019-05-06 02:02:58.389274: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3221] finished op building
2019-05-06 02:02:58.598352: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3198] Max batch size= 1 max workspace size= 922097216
2019-05-06 02:02:58.598671: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3212] finished op preparation
2019-05-06 02:02:58.598929: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3220] OK
2019-05-06 02:02:58.598942: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3221] finished op building
2019-05-06 02:02:58.601519: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3198] Max batch size= 1 max workspace size= 6114703
2019-05-06 02:02:58.601543: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3212] finished op preparation
2019-05-06 02:02:58.601561: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3220] OK
2019-05-06 02:02:58.601568: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:3221] finished op building
Running Calibration
INFO:tensorflow:Starting execution
Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/importer.py", line 418, in import_graph_def
    graph._c_graph, serialized, options)  # pylint: disable=protected-access
tensorflow.python.framework.errors_impl.InvalidArgumentError: Dimensions must be equal, but are 1024 and 512 for 'import/resnet_v2_101/block4/unit_1/bottleneck_v2/conv3/Conv2D' (op: 'Conv2D') with input shapes: [?,1024,33,33], [1,1,512,2048].

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "optimize_compress.py", line 447, in <module>
    timings, comp, _, mdstats = timeGraph(calibGraph, f.batch_size, 1, dummy_input)
  File "optimize_compress.py", line 273, in timeGraph
    return_elements=["predictions"]
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/deprecation.py", line 432, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/importer.py", line 422, in import_graph_def
    raise ValueError(str(e))
ValueError: Dimensions must be equal, but are 1024 and 512 for 'import/resnet_v2_101/block4/unit_1/bottleneck_v2/conv3/Conv2D' (op: 'Conv2D') with input shapes: [?,1024,33,33], [1,1,512,2048].
2019-05-06 02:03:01.003784: I ./tensorflow/contrib/tensorrt/resources/trt_resources.h:48] Destroying Calibration Resource

@nick2nie Hello, I got the same error. Did you solve it? In my case the project works fine with FP32 and FP16; only INT8 fails.
I would appreciate your reply.