I bought a Jetson Xavier hoping it would speed up inference, but with Mask R-CNN at a 640p input resolution I’m only getting around 2 or 3 fps. Here is my code for converting the frozen graph to a TensorRT (TF-TRT) graph.
from object_detection.protos import pipeline_pb2
from object_detection import exporter
import os
import subprocess
from google.protobuf import text_format
import tensorflow.contrib.tensorrt as trt
import tensorflow as tf
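# NOTE (added for completeness): the constants below are used by
# build_detection_graph() further down but were not defined in the snippet as
# posted. These are the standard TF Object Detection API export names, and they
# match the tensor names used in the inference script later in this post.
INPUT_NAME = 'image_tensor'
BOXES_NAME = 'detection_boxes'
CLASSES_NAME = 'detection_classes'
SCORES_NAME = 'detection_scores'
NUM_DETECTIONS_NAME = 'num_detections'
FROZEN_GRAPH_NAME = 'frozen_inference_graph.pb'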
def make_const6(const6_name='const6'):
    graph = tf.Graph()
    with graph.as_default():
        tf_6 = tf.constant(dtype=tf.float32, value=6.0, name=const6_name)
    return graph.as_graph_def()
def make_relu6(output_name, input_name, const6_name='const6'):
    graph = tf.Graph()
    with graph.as_default():
        tf_x = tf.placeholder(tf.float32, [10, 10], name=input_name)
        tf_6 = tf.constant(dtype=tf.float32, value=6.0, name=const6_name)
        with tf.name_scope(output_name):
            tf_y1 = tf.nn.relu(tf_x, name='relu1')
            tf_y2 = tf.nn.relu(tf.subtract(tf_x, tf_6, name='sub1'), name='relu2')
        #tf_y = tf.nn.relu(tf.subtract(tf_6, tf.nn.relu(tf_x, name='relu1'), name='sub'), name='relu2')
        #tf_y = tf.subtract(tf_6, tf_y, name=output_name)
        tf_y = tf.subtract(tf_y1, tf_y2, name=output_name)

    graph_def = graph.as_graph_def()
    graph_def.node[-1].name = output_name

    # remove unused nodes
    for node in graph_def.node:
        if node.name == input_name:
            graph_def.node.remove(node)
    for node in graph_def.node:
        if node.name == const6_name:
            graph_def.node.remove(node)
    for node in graph_def.node:
        if node.op == '_Neg':
            node.op = 'Neg'

    return graph_def
def convert_relu6(graph_def, const6_name='const6'):
    # add constant 6
    has_const6 = False
    for node in graph_def.node:
        if node.name == const6_name:
            has_const6 = True
    if not has_const6:
        const6_graph_def = make_const6(const6_name=const6_name)
        graph_def.node.extend(const6_graph_def.node)

    for node in graph_def.node:
        if node.op == 'Relu6':
            input_name = node.input[0]
            output_name = node.name
            relu6_graph_def = make_relu6(output_name, input_name, const6_name=const6_name)
            graph_def.node.remove(node)
            graph_def.node.extend(relu6_graph_def.node)

    return graph_def
def remove_node(graph_def, node):
    for n in graph_def.node:
        if node.name in n.input:
            n.input.remove(node.name)
        ctrl_name = '^' + node.name
        if ctrl_name in n.input:
            n.input.remove(ctrl_name)
    graph_def.node.remove(node)

def remove_op(graph_def, op_name):
    matches = [node for node in graph_def.node if node.op == op_name]
    for match in matches:
        remove_node(graph_def, match)
def f_force_nms_cpu(frozen_graph):
    for node in frozen_graph.node:
        if 'NonMaxSuppression' in node.name:
            node.device = '/device:CPU:0'
    return frozen_graph

def f_replace_relu6(frozen_graph):
    return convert_relu6(frozen_graph)

def f_remove_assert(frozen_graph):
    remove_op(frozen_graph, 'Assert')
    return frozen_graph
def build_detection_graph(config, checkpoint,
                          batch_size=1,
                          score_threshold=None,
                          iou_threshold=None,
                          force_nms_cpu=True,
                          replace_relu6=True,
                          remove_assert=True,
                          input_shape=None,
                          output_dir='.generated_model'):
    """Builds a frozen graph for a pre-trained object detection model"""
    config_path = config
    checkpoint_path = checkpoint

    # parse config from file
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, 'r') as f:
        text_format.Merge(f.read(), config, allow_unknown_extension=True)

    # override some config parameters
    if config.model.HasField('ssd'):
        config.model.ssd.feature_extractor.override_base_feature_extractor_hyperparams = True
        if score_threshold is not None:
            config.model.ssd.post_processing.batch_non_max_suppression.score_threshold = score_threshold
        if iou_threshold is not None:
            config.model.ssd.post_processing.batch_non_max_suppression.iou_threshold = iou_threshold
        if input_shape is not None:
            config.model.ssd.image_resizer.fixed_shape_resizer.height = input_shape[0]
            config.model.ssd.image_resizer.fixed_shape_resizer.width = input_shape[1]
    elif config.model.HasField('faster_rcnn'):
        if score_threshold is not None:
            config.model.faster_rcnn.second_stage_post_processing.score_threshold = score_threshold
        if input_shape is not None:
            config.model.faster_rcnn.image_resizer.fixed_shape_resizer.height = input_shape[0]
            config.model.faster_rcnn.image_resizer.fixed_shape_resizer.width = input_shape[1]

    if os.path.isdir(output_dir):
        subprocess.call(['rm', '-rf', output_dir])

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    # export inference graph to file (initial)
    with tf.Session(config=tf_config) as tf_sess:
        with tf.Graph().as_default() as tf_graph:
            exporter.export_inference_graph(
                'image_tensor',
                config,
                checkpoint_path,
                output_dir,
                input_shape=[batch_size, None, None, 3]
            )

    # read frozen graph from file
    frozen_graph = tf.GraphDef()
    with open(os.path.join(output_dir, FROZEN_GRAPH_NAME), 'rb') as f:
        frozen_graph.ParseFromString(f.read())

    # apply graph modifications
    if force_nms_cpu:
        frozen_graph = f_force_nms_cpu(frozen_graph)
    if replace_relu6:
        frozen_graph = f_replace_relu6(frozen_graph)
    if remove_assert:
        frozen_graph = f_remove_assert(frozen_graph)

    # get input names
    # TODO: handle mask_rcnn
    input_names = [INPUT_NAME]
    output_names = [BOXES_NAME, CLASSES_NAME, SCORES_NAME, NUM_DETECTIONS_NAME]

    # remove temporary directory
    subprocess.call(['rm', '-rf', output_dir])

    return frozen_graph, input_names, output_names
config_path = "./data/pipeline.config"
checkpoint_path = "./data/frozen_inference_graph.pb"

frozen_graph, input_names, output_names = build_detection_graph(
    config=config_path,
    checkpoint=checkpoint_path,
    score_threshold=0.3,
    iou_threshold=0.5,
    batch_size=1
)

# print(output_names)
trt_graph = trt.create_inference_graph(
    input_graph_def=frozen_graph,
    outputs=output_names,
    max_batch_size=1,
    max_workspace_size_bytes=1 << 25,
    precision_mode='FP16',
    minimum_segment_size=50
)

with open('./data/trt_graph.pb', 'wb') as f:
    f.write(trt_graph.SerializeToString())
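A quick sanity check worth running after the conversion is to count how many subgraphs TF-TRT actually turned into TensorRT engines; if this prints 0, the whole graph is still running as plain TensorFlow:

# 'TRTEngineOp' is the op type tf.contrib.tensorrt uses for converted segments.
trt_engine_nodes = [n for n in trt_graph.node if n.op == 'TRTEngineOp']
print('Number of TRTEngineOp nodes: {}'.format(len(trt_engine_nodes)))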
For inference, my code is:
import tensorflow as tf
import numpy as np
def get_frozen_graph(graph_file):
    """Read Frozen Graph file from disk."""
    with tf.gfile.FastGFile(graph_file, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    return graph_def
# The TensorRT inference graph file downloaded from Colab or your local machine.
pb_fname = "./data/trt_graph.pb"
trt_graph = get_frozen_graph(pb_fname)
input_names = ['image_tensor']
# Create session and load graph
tf_config = tf.ConfigProto()
tf_config.gpu_options.allow_growth = True
tf_sess = tf.Session(config=tf_config)
tf.import_graph_def(trt_graph, name='')
tf_input = tf_sess.graph.get_tensor_by_name(input_names[0] + ':0')
tf_scores = tf_sess.graph.get_tensor_by_name('detection_scores:0')
tf_boxes = tf_sess.graph.get_tensor_by_name('detection_boxes:0')
tf_classes = tf_sess.graph.get_tensor_by_name('detection_classes:0')
tf_num_detections = tf_sess.graph.get_tensor_by_name('num_detections:0')
import cv2
IMAGE_PATH = "./test.jpg"
image = cv2.imread(IMAGE_PATH)
# image = cv2.resize(image, (300, 300))
scores, boxes, classes, num_detections = tf_sess.run(
    [tf_scores, tf_boxes, tf_classes, tf_num_detections],
    feed_dict={tf_input: image[None, ...]})

boxes = boxes[0]  # index by 0 to remove batch dimension
scores = scores[0]
classes = classes[0]
num_detections = int(num_detections[0])

# Boxes unit in pixels (image coordinates).
boxes_pixels = []
for i in range(num_detections):
    # scale box to image coordinates
    box = boxes[i] * np.array([image.shape[0],
                               image.shape[1], image.shape[0], image.shape[1]])
    box = np.round(box).astype(int)
    boxes_pixels.append(box)
boxes_pixels = np.array(boxes_pixels)
def non_max_suppression(boxes, probs=None, nms_threshold=0.3):
    """Non-max suppression

    Arguments:
        boxes {np.array} -- a Numpy list of boxes, each one is [x1, y1, x2, y2]

    Keyword arguments:
        probs {np.array} -- Probabilities associated with each box. (default: {None})
        nms_threshold {float} -- Overlapping threshold 0~1. (default: {0.3})

    Returns:
        list -- A list of selected box indexes.
    """
    # if there are no boxes, return an empty list
    if len(boxes) == 0:
        return []

    # if the bounding boxes are integers, convert them to floats -- this
    # is important since we'll be doing a bunch of divisions
    if boxes.dtype.kind == "i":
        boxes = boxes.astype("float")

    # initialize the list of picked indexes
    pick = []

    # grab the coordinates of the bounding boxes
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    # compute the area of the bounding boxes and grab the indexes to sort
    # (in the case that no probabilities are provided, simply sort on the
    # bottom-left y-coordinate)
    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    idxs = y2

    # if probabilities are provided, sort on them instead
    if probs is not None:
        idxs = probs

    # sort the indexes
    idxs = np.argsort(idxs)

    # keep looping while some indexes still remain in the indexes list
    while len(idxs) > 0:
        # grab the last index in the indexes list and add the index value
        # to the list of picked indexes
        last = len(idxs) - 1
        i = idxs[last]
        pick.append(i)

        # find the largest (x, y) coordinates for the start of the bounding
        # box and the smallest (x, y) coordinates for the end of the bounding
        # box
        xx1 = np.maximum(x1[i], x1[idxs[:last]])
        yy1 = np.maximum(y1[i], y1[idxs[:last]])
        xx2 = np.minimum(x2[i], x2[idxs[:last]])
        yy2 = np.minimum(y2[i], y2[idxs[:last]])

        # compute the width and height of the bounding box
        w = np.maximum(0, xx2 - xx1 + 1)
        h = np.maximum(0, yy2 - yy1 + 1)

        # compute the ratio of overlap
        overlap = (w * h) / area[idxs[:last]]

        # delete all indexes from the index list that have overlap greater
        # than the provided overlap threshold
        idxs = np.delete(idxs, np.concatenate(([last],
                                               np.where(overlap > nms_threshold)[0])))

    # return only the bounding boxes indexes
    return pick
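For reference, the commented-out "for i in pick:" loop below would go with a call roughly like this (the IoU math above is symmetric in the two axes, so the [ymin, xmin, ymax, xmax] order of boxes_pixels works the same as [x1, y1, x2, y2]):

# Hypothetical call; pick would then replace range(num_detections) in the drawing loop below.
pick = non_max_suppression(boxes_pixels, probs=scores[:num_detections], nms_threshold=0.3)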
def draw_label(image, point, label, font=cv2.FONT_HERSHEY_SIMPLEX,
               font_scale=0.5, thickness=2):
    size = cv2.getTextSize(label, font, font_scale, thickness)[0]
    x, y = point
    cv2.rectangle(image, (x, y - size[1]),
                  (x + size[0], y), (255, 0, 0), cv2.FILLED)
    cv2.putText(image, label, point, font, font_scale,
                (255, 255, 255), thickness)
# for i in pick:
for i in range(num_detections):
    # if scores[i] > 0.05:
    box = boxes_pixels[i]
    box = np.round(box).astype(int)
    # Draw bounding box.
    image = cv2.rectangle(
        image, (box[1], box[0]), (box[3], box[2]), (0, 255, 0), 2)
    label = "{}:{:.2f}".format(int(classes[i]), scores[i])
    # Draw label (class index and probability).
    draw_label(image, (box[1], box[0]), label)

# Save and display the labeled image.
cv2.imwrite("./out.jpg", image)
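To put a number on the frame rate, a rough way to measure it is to time repeated sess.run calls on the same image, skipping the first run, which includes one-off graph and TensorRT engine setup:

import time

# warm-up run (one-time graph/engine initialisation)
tf_sess.run([tf_scores, tf_boxes, tf_classes, tf_num_detections],
            feed_dict={tf_input: image[None, ...]})

n_runs = 20
start = time.time()
for _ in range(n_runs):
    tf_sess.run([tf_scores, tf_boxes, tf_classes, tf_num_detections],
                feed_dict={tf_input: image[None, ...]})
print("Approx. {:.2f} fps over {} runs".format(n_runs / (time.time() - start), n_runs))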
I don’t know what mistake I’m making. I don’t think Mask R-CNN with a 640p input should run at only 2-3 fps on a Xavier. Please help me out.