Description
Running a TF-TRT converted model with TensorFlow 1.15.2 (NVIDIA Release 20.02-tf1) uses far more CPU RAM than expected (~7 GB), compared to running the same model without TRT conversion (~2.5 GB).
The issue occurs regardless of whether the model is converted dynamically (in-process, just before inference) or statically (converted and saved offline, then loaded).
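For reference, this is roughly how the host-memory numbers above can be checked; the psutil dependency and the helper name are my additions, not part of the original report:

import os
import psutil

def log_host_rss(tag):
    # Resident set size (host RAM) of the current process, in GB
    rss_gb = psutil.Process(os.getpid()).memory_info().rss / (1024 ** 3)
    print("[%s] host RSS: %.2f GB" % (tag, rss_gb))

Calling log_host_rss("after warmup") right after the warmup sess.run in the script below should show roughly the ~7 GB (TF-TRT) vs ~2.5 GB (plain TensorFlow) difference described above.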
Environment
TensorRT Version:
GPU Type: GeForce RTX 2080
Nvidia Driver Version: 440.33.01
CUDA Version: 10.2
CUDNN Version: 7.6.5
Operating System + Version: Ubuntu 18.04.3 LTS
Python Version (if applicable): 3.6.9
TensorFlow Version (if applicable): 1.15.2
PyTorch Version (if applicable):
Baremetal or Container (if container which image + tag): Nvidia Release 20.02-tf1
Steps To Reproduce
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tensorflow as tf
from tensorflow.python.compiler.tensorrt import trt_convert as trt
import numpy as np
import sys
from timeit import default_timer as timer
from tensorflow.python.platform import gfile
from tensorflow.core.protobuf import saved_model_pb2
from tensorflow.python.util import compat
import time
oModelDir = "./saved_model/"
#oModelDir = "./saved_model/trt_model"
iBatchSize = int(sys.argv[1])
iNumTimesToRun = int(sys.argv[2])
# 1 - convert to TF-TRT, anything else - run the plain TensorFlow graph
iTrtConvert = int(sys.argv[3])
# Data for inference - random uint8 data; input size hardcoded to 416x416x3
data = np.random.rand(416, 416, 3)
data = (data*255).astype(np.uint8)
data = np.expand_dims(data, 0)
data = np.repeat(data, iBatchSize, axis=0)
# Load the SavedModel graph def and take the first / last non-NoOp ops as input / output tensors
with gfile.FastGFile(oModelDir + "/saved_model.pb", 'rb') as f:
    file_read = compat.as_bytes(f.read())
sm = saved_model_pb2.SavedModel()
sm.ParseFromString(file_read)
with tf.Graph().as_default() as dumyGraph:
    tf.import_graph_def(sm.meta_graphs[0].graph_def)
operations = dumyGraph.get_operations()
operations = [op for op in operations if op.type != "NoOp"]
input_node = dumyGraph.get_tensor_by_name(operations[0].name + ":0")
output_node = dumyGraph.get_tensor_by_name(operations[-1].name + ":0")
outputName = output_node.name.split("/")[-1]
#Convert to trt engine
if iTrtConvert == 1:
    converter = trt.TrtGraphConverter(input_saved_model_dir=oModelDir,
                                      nodes_blacklist=[outputName],
                                      max_batch_size=iBatchSize,
                                      max_workspace_size_bytes=2000,
                                      precision_mode="FP16")
    trt_graph = converter.convert()
    with tf.Graph().as_default() as dumyGraph:
        tf.import_graph_def(trt_graph)
    input_node = dumyGraph.get_tensor_by_name(operations[0].name + ":0")
    output_node = dumyGraph.get_tensor_by_name(operations[-1].name + ":0")
    #Save model if needed
    #converter.save("./saved_model/trt_model/")
#Start a session
sess = None
with tf.device("/device:GPU:0"):
    cfg = dict({'allow_soft_placement': True, 'log_device_placement': False})
    cfg['gpu_options'] = tf.GPUOptions(per_process_gpu_memory_fraction=0.3, allow_growth=True)
    cfg['allow_soft_placement'] = False
    cfg['device_count'] = {'GPU': 1}
    sess = tf.compat.v1.Session(graph=dumyGraph, config=tf.compat.v1.ConfigProto(**cfg))
#Warmup run
output = sess.run([output_node], feed_dict={input_node: np.array(data)})

start_timer = timer()
for i in range(iNumTimesToRun):
    output = sess.run([output_node], feed_dict={input_node: np.array(data)})
end_timer = timer()

total_time = end_timer - start_timer
average_time = total_time / float(iNumTimesToRun)
print(len(output[0]))
print("Total time : ", total_time)
print("Average time(ms)/image : ", (average_time / iBatchSize) * 1000)
print("FPS : ", 1 / (average_time / iBatchSize))