TensorRT 4 conversion of a tensorpack fast-rcnn model fails

I am trying to use TensorRT-4.0.1.6 to perform inference on a fast-rcnn model that I have trained with tensorpack.
I am able to freeze the TensorFlow graph, convert it with trt.create_inference_graph, and use it for inference, but I get the following error:

2019-01-31 11:57:08.651514: E tensorflow/contrib/tensorrt/log/trt_logger.cc:38] DefaultLogger Parameter check failed at: ../builder/Network.cpp::addInput::364, condition: isValidDims(dims)
2019-01-31 11:57:08.651541: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:13 due to: "Invalid argument: Failed to create Input layer" SKIPPING......( 4 nodes)

Here is the program I am trying to run:

#coding:utf-8
import cv2
import argparse
import tensorflow as tf
import time
import numpy as np
import tensorflow.contrib.tensorrt as trt
from graph_utils import force_nms_cpu as f_force_nms_cpu
from graph_utils import replace_relu6 as f_replace_relu6
from graph_utils import remove_assert as f_remove_assert

def resize_image(img, size=600, max_size=1024):
    """Resize an image so its shorter side is ``size``, capping the longer side.

    The aspect ratio is kept. The shorter side is first scaled to ``size``;
    if the longer side would then exceed ``max_size``, both sides are scaled
    down again so that the longer side equals ``max_size``. The final
    dimensions are rounded to the nearest integer.

    Args:
        img: image array whose first two ``shape`` entries are height and width.
        size: target length of the shorter side, in pixels.
        max_size: upper bound for the longer side, in pixels.

    Returns:
        The result of ``cv2.resize`` with bilinear (``INTER_LINEAR``)
        interpolation.
    """
    height, width = img.shape[:2]
    ratio = size * 1.0 / min(height, width)

    # Pin the shorter side to `size`; the other side follows the aspect ratio.
    target_h, target_w = (
        (size, ratio * width) if height < width else (ratio * height, size)
    )

    # Shrink both sides again when the longer one overshoots the cap.
    longest = max(target_h, target_w)
    if longest > max_size:
        shrink = max_size * 1.0 / longest
        target_h = target_h * shrink
        target_w = target_w * shrink

    # Adding 0.5 before truncation rounds to the nearest integer pixel count.
    out_w = int(target_w + 0.5)
    out_h = int(target_h + 0.5)

    # cv2.resize takes the destination size as (width, height).
    return cv2.resize(img, (out_w, out_h), interpolation=cv2.INTER_LINEAR)

def inference(graph_path, mode,
              input_file="./samples/201804191100300605810503418731_18_1524103754819.jpg",
              num_samples=10):
    """Run the pruned and frozen inference graph and print its average runtime.

    The frozen ``GraphDef`` at ``graph_path`` is loaded and, when ``mode`` is
    ``'FP32'`` or ``'FP16'``, converted with ``trt.create_inference_graph``
    before being imported. Any other ``mode`` value imports the frozen graph
    unchanged. One image is then fed through the graph ``num_samples`` times
    (after one untimed run) and the mean wall-clock time per run is printed.

    Args:
        graph_path: path to the serialized frozen ``GraphDef`` (binary file).
        mode: precision mode string; ``'FP32'``/``'FP16'`` enable the
            TensorRT conversion, anything else runs plain TensorFlow.
        input_file: path of the image used as network input. Defaults to the
            sample image the script was originally hard-coded to use.
        num_samples: number of timed ``sess.run`` calls to average over.

    Raises:
        ValueError: if ``num_samples`` is smaller than 1.
        IOError: if ``input_file`` cannot be read by ``cv2.imread``.
    """
    if num_samples < 1:
        raise ValueError('num_samples must be >= 1, got %r' % (num_samples,))

    # cv2.imread signals failure by returning None instead of raising; check
    # it up front so a bad path fails clearly and before the graph conversion.
    img = cv2.imread(input_file, cv2.IMREAD_COLOR)
    if img is None:
        raise IOError('Could not read input image: %s' % input_file)
    resized_img = resize_image(img)

    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    output_nodes = ['output/boxes', 'output/scores', 'output/labels', 'output/all_probs']

    frozen_graph = tf.GraphDef()
    with tf.gfile.GFile(graph_path, "rb") as f:
        frozen_graph.ParseFromString(f.read())

    graph = tf.Graph()
    if mode in ['FP32', 'FP16']:
        print ('----------tensorRT-----------')
        # Replace TensorRT-compatible subgraphs of the frozen graph with
        # TensorRT engine ops; unsupported subgraphs stay as TensorFlow ops.
        graph_def = trt.create_inference_graph(
            input_graph_def=frozen_graph,
            outputs=output_nodes,
            max_batch_size=1,
            max_workspace_size_bytes=1 << 30,
            precision_mode=mode
        )
    else:
        graph_def = frozen_graph

    # name='' keeps the original node names (no "import/" prefix), so the
    # tensors can be looked up by the names used when the graph was exported.
    with graph.as_default():
        tf.import_graph_def(graph_def, name='')

    tf_input = graph.get_tensor_by_name('image:0')
    tf_scores = graph.get_tensor_by_name('output/scores:0')
    tf_boxes = graph.get_tensor_by_name('output/boxes:0')
    tf_labels = graph.get_tensor_by_name('output/labels:0')
    tf_probs = graph.get_tensor_by_name('output/all_probs:0')
    fetches = [tf_scores, tf_boxes, tf_labels, tf_probs]

    with tf.Session(config=tf_config, graph=graph) as sess:
        feed_dict = {tf_input: resized_img}

        # Untimed first run, executed before the clock starts so that any
        # one-off setup cost is not included in the reported average.
        sess.run(fetches, feed_dict=feed_dict)

        start = time.time()
        for _ in range(num_samples):
            sess.run(fetches, feed_dict=feed_dict)
        end = time.time()
        print('Average runtime: %f seconds' % (float(end - start)/num_samples))

if __name__ == '__main__':
    # Command-line entry point: parse the model path and precision mode, then
    # run the benchmark.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--precision_mode', type=str, default='native',
        help="'FP32' or 'FP16' to convert the graph with TensorRT; "
             "any other value runs the frozen graph as is.")
    # Required: without it the script used to pass the string 'None' as the
    # graph path and fail later with a misleading file-not-found error.
    parser.add_argument(
        '--model_path', type=str, required=True,
        help='Path to the frozen TensorFlow GraphDef (.pb) file.')
    # parse_known_args tolerates extra, unrecognized command-line arguments.
    pars, unparsed = parser.parse_known_args()

    inference(pars.model_path, pars.precision_mode)

The whole output of this program is:

----------tensorRT-----------
2019-01-31 11:57:04.945507: I tensorflow/core/grappler/devices.cc:51] Number of eligible GPUs (core count >= 8): 2
2019-01-31 11:57:08.259189: I tensorflow/contrib/tensorrt/convert/convert_graph.cc:383] MULTIPLE tensorrt candidate conversion: 264
2019-01-31 11:57:08.608475: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:0 due to: "Unimplemented: Require 4 dimensional input. Got 2 fastrcnn/outputs/box/MatMul" SKIPPING......( 3 nodes)
2019-01-31 11:57:08.611419: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:1 due to: "Unimplemented: Require 4 dimensional input. Got 2 fastrcnn/outputs/class/MatMul" SKIPPING......( 3 nodes)
2019-01-31 11:57:08.614149: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:2 due to: "Unimplemented: Require 4 dimensional input. Got 5 fastrcnn/gn2/moments/SquaredDifference" SKIPPING......( 12 nodes)
2019-01-31 11:57:08.617048: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:3 due to: "Unimplemented: Require 4 dimensional input. Got 5 fastrcnn/gn0/moments/SquaredDifference" SKIPPING......( 12 nodes)
2019-01-31 11:57:08.626805: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:2660] Max batch size= 184326 max workspace size= 2951191
2019-01-31 11:57:08.626828: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:2664] Using FP16 precision mode
2019-01-31 11:57:08.626836: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:2666] starting build engine
2019-01-31 11:57:08.626858: E tensorflow/contrib/tensorrt/log/trt_logger.cc:38] DefaultLogger Tensor: multilevel_roi_align/GatherV2 at max batch size of 184326 exceeds the maximum element count of 2147483647
2019-01-31 11:57:08.626869: I tensorflow/contrib/tensorrt/convert/convert_nodes.cc:2671] Built network
2019-01-31 11:57:08.626890: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:4 due to: "Internal: Engine building failure" SKIPPING......( 6 nodes)
2019-01-31 11:57:08.629621: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:5 due to: "Unimplemented: Require 4 dimensional input. Got 2 multilevel_roi_align/roi_level5/roi_align/crop_and_resize/transform_fpcoor_for_tf/div_3" SKIPPING......( 4 nodes)
2019-01-31 11:57:08.632334: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:6 due to: "Unimplemented: Require 4 dimensional input. Got 2 multilevel_roi_align/roi_level3/roi_align/crop_and_resize/transform_fpcoor_for_tf/div_3" SKIPPING......( 4 nodes)
2019-01-31 11:57:08.635073: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:7 due to: "Unimplemented: Require 4 dimensional input. Got 2 multilevel_roi_align/roi_level2/roi_align/crop_and_resize/transform_fpcoor_for_tf/div_3" SKIPPING......( 4 nodes)
2019-01-31 11:57:08.637887: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:8 due to: "Unimplemented: Require 4 dimensional input. Got 1 multilevel_roi_align/fpn_map_rois_to_levels/Log" SKIPPING......( 4 nodes)
2019-01-31 11:57:08.640590: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:9 due to: "Unimplemented: Require 4 dimensional input. Got 1 multilevel_roi_align/fpn_map_rois_to_levels/Sqrt" SKIPPING......( 4 nodes)
2019-01-31 11:57:08.643387: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:10 due to: "Unimplemented: Require 4 dimensional input. Got 5 group2/block11/conv3/gn/moments/SquaredDifference" SKIPPING......( 12 nodes)
2019-01-31 11:57:08.646105: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:11 due to: "Unimplemented: Require 4 dimensional input. Got 5 group2/block13/conv1/gn/moments/SquaredDifference" SKIPPING......( 12 nodes)
2019-01-31 11:57:08.648803: E tensorflow/contrib/tensorrt/log/trt_logger.cc:38] DefaultLogger Parameter check failed at: ../builder/Network.cpp::addInput::364, condition: isValidDims(dims)
2019-01-31 11:57:08.648831: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:12 due to: "Invalid argument: Failed to create Input layer" SKIPPING......( 4 nodes)
2019-01-31 11:57:08.651514: E tensorflow/contrib/tensorrt/log/trt_logger.cc:38] DefaultLogger Parameter check failed at: ../builder/Network.cpp::addInput::364, condition: isValidDims(dims)
2019-01-31 11:57:08.651541: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:13 due to: "Invalid argument: Failed to create Input layer" SKIPPING......( 4 nodes)
2019-01-31 11:57:08.654259: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:14 due to: "Unimplemented: Require 4 dimensional input. Got 5 group2/block11/conv1/gn/moments/SquaredDifference" SKIPPING......( 12 nodes)
2019-01-31 11:57:08.657160: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:15 due to: "Unimplemented: Require 4 dimensional input. Got 5 group2/block10/conv1/gn/moments/SquaredDifference" SKIPPING......( 12 nodes)
2019-01-31 11:57:08.659879: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:16 due to: "Unimplemented: Require 4 dimensional input. Got 5 group3/block0/convshortcut/gn/moments/SquaredDifference" SKIPPING......( 12 nodes)
2019-01-31 11:57:08.662639: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:17 due to: "Unimplemented: Require 4 dimensional input. Got 5 group2/block9/conv3/gn/moments/SquaredDifference" SKIPPING......( 12 nodes)
2019-01-31 11:57:08.665370: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:18 due to: "Unimplemented: Require 4 dimensional input. Got 5 group2/block3/conv1/gn/moments/SquaredDifference" SKIPPING......( 12 nodes)
2019-01-31 11:57:08.668084: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for subgraph_index:19 due to: "Unimplemented: Require 4 dimensional input. Got 5 group2/block9/conv2/gn/moments/SquaredDifference" SKIPPING......( 12 nodes)
2019-01-31 11:57:08.670779: E tensorflow/contrib/tensorrt/log/trt_logger.cc:38] DefaultLogger Parameter check failed at: ../builder/Network.cpp::addInput::364, condition: isValidDims(dims)
2019-01-31 11:57:08.670807: W tensorflow/contrib/tensorrt/convert/convert_graph.cc:418] subgraph conversion error for 
...
2019-01-31 11:57:15.566272: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1435] Adding visible gpu devices: 0, 1
2019-01-31 11:57:15.947373: I tensorflow/core/common_runtime/gpu/gpu_device.cc:923] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-01-31 11:57:15.947430: I tensorflow/core/common_runtime/gpu/gpu_device.cc:929]      0 1
2019-01-31 11:57:15.947442: I tensorflow/core/common_runtime/gpu/gpu_device.cc:942] 0:   N N
2019-01-31 11:57:15.947449: I tensorflow/core/common_runtime/gpu/gpu_device.cc:942] 1:   N N
2019-01-31 11:57:15.948129: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1053] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14266 MB memory) -> physical GPU (device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:04:00.0, compute capability: 6.0)
2019-01-31 11:57:15.949269: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1053] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 14758 MB memory) -> physical GPU (device: 1, name: Tesla P100-PCIE-16GB, pci bus id: 0000:84:00.0, compute capability: 6.0)
Average runtime: 0.120971 seconds