Platform: TX2
JetPack3.3
cuDnn version: 7.1
tensorflow version: 1.11
This is my test code:
from __future__ import print_function
import tensorflow as tf
import numpy as np
import argparse
import os
FLAGS = None
def get_dtype():
    """Return the TensorFlow dtype selected by the --use_fp16 flag.

    Reads the module-level FLAGS namespace populated by parse().
    """
    if FLAGS.use_fp16:
        return tf.float16
    return tf.float32
def benchmark():
    """Build a one-op convolution graph and execute it FLAGS.times times.

    Side effects: allocates GPU memory (growth-limited), prints progress
    markers. Intended to be profiled externally (e.g. via nvprof), so no
    timing is done here.
    """
    graph = tf.Graph()
    with graph.as_default():
        # NHWC input [batch, size, size, 1] convolved with a single
        # 50x50 one-channel filter; dtype follows the --use_fp16 flag.
        tf_input1 = tf.Variable(
            tf.truncated_normal([FLAGS.batch, FLAGS.size, FLAGS.size, 1],
                                dtype=get_dtype()))
        tf_input2 = tf.Variable(
            tf.truncated_normal([50, 50, 1, 1], dtype=get_dtype()))
        tf_output = tf.nn.convolution(tf_input1, tf_input2, padding='SAME')

    config = tf.ConfigProto()
    # Grow GPU memory on demand instead of grabbing it all up front.
    config.gpu_options.allow_growth = True
    with tf.Session(graph=graph, config=config) as session:
        tf.global_variables_initializer().run()
        print("Initialized")
        for _ in range(FLAGS.times):
            session.run([tf_output])
        print("Done")
def parse():
    """Parse command-line flags into the module-level FLAGS namespace.

    Unknown arguments are ignored (parse_known_args), so the script
    tolerates extra profiler/launcher flags.
    """
    global FLAGS

    # NOTE(review): set before any session runs, but after tensorflow is
    # imported — presumably meant to steer cuDNN's fp16 conv math mode;
    # confirm that this TF build actually reads TF_FP16_CONV_MODE.
    os.environ['TF_FP16_CONV_MODE'] = 'FAST'
    print(os.environ['TF_FP16_CONV_MODE'])

    parser = argparse.ArgumentParser()
    parser.add_argument('--use_fp16', action='store_true', default=False,
                        help='use FP16 for benchmarking')
    parser.add_argument('--size', type=int, default=4096,
                        help='size of matrices to multiply')
    parser.add_argument('--times', type=int, default=10,
                        help='amount of multiplications')
    parser.add_argument('--batch', type=int, default=1,
                        help='batch size')
    FLAGS, _ = parser.parse_known_args()
if __name__ == "__main__":
    # Parse flags first so benchmark() sees a populated FLAGS.
    parse()
    benchmark()
Use cmd:
nvprof python3.5 fp16.py --use_fp16 --size 1000
And I get the cuDNN log:
I! CuDNN (v7105) function cudnnSetConvolutionMathType() called:
i! mathType: type=cudnnMathType_t; val=CUDNN_DEFAULT_MATH (0);
i! Time: 2019-03-14T15:43:38.355962 (0d+0h+0m+47s since start)
i! Process=7554; Thread=7580; GPU=NULL; Handle=NULL; StreamId=NULL.
I! CuDNN (v7105) function cudnnGetConvolutionForwardWorkspaceSize() called:
i! handle: type=cudnnHandle_t; streamId=0x77fddb0;
i! xDesc: type=cudnnTensorDescriptor_t:
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! nbDims: type=int; val=4;
i! dimA: type=int; val=[4,1,1001,1001];
i! strideA: type=int; val=[1002001,1002001,1001,1];
i! wDesc: type=cudnnFilterDescriptor_t:
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! vect: type=int; val=0;
i! nbDims: type=int; val=4;
i! dimA: type=int; val=[1,1,50,50];
i! format: type=cudnnTensorFormat_t; val=CUDNN_TENSOR_NCHW (0);
i! convDesc: type=cudnnConvolutionDescriptor_t:
i! mode: type=cudnnConvolutionMode_t; val=CUDNN_CROSS_CORRELATION (1);
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! mathType: type=cudnnMathType_t; val=CUDNN_DEFAULT_MATH (0);
i! arrayLength: type=int; val=2;
i! padA: type=int; val=[24,24];
i! strideA: type=int; val=[1,1];
i! dilationA: type=int; val=[1,1];
i! groupCount: type=int; val=1;
i! yDesc: type=cudnnTensorDescriptor_t:
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! nbDims: type=int; val=4;
i! dimA: type=int; val=[4,1,1000,1000];
i! strideA: type=int; val=[1000000,1000000,1000,1];
i! algo: type=cudnnConvolutionFwdAlgo_t; val=CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM (0);
i! Time: 2019-03-14T15:43:38.356034 (0d+0h+0m+47s since start)
i! Process=7554; Thread=7580; GPU=0; Handle=0x7f181043a0; StreamId=0x77fddb0.
I! CuDNN (v7105) function cudnnConvolutionForward() called:
i! handle: type=cudnnHandle_t; streamId=0x77fddb0;
i! alpha: type=CUDNN_DATA_FLOAT; val=1.000000;
i! xDesc: type=cudnnTensorDescriptor_t:
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! nbDims: type=int; val=4;
i! dimA: type=int; val=[4,1,1001,1001];
i! strideA: type=int; val=[1002001,1002001,1001,1];
i! xData: location=dev; addr=0xfc2a70000;
i! wDesc: type=cudnnFilterDescriptor_t:
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! vect: type=int; val=0;
i! nbDims: type=int; val=4;
i! dimA: type=int; val=[1,1,50,50];
i! format: type=cudnnTensorFormat_t; val=CUDNN_TENSOR_NCHW (0);
i! wData: location=dev; addr=0xfc1871900;
i! convDesc: type=cudnnConvolutionDescriptor_t:
i! mode: type=cudnnConvolutionMode_t; val=CUDNN_CROSS_CORRELATION (1);
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! mathType: type=cudnnMathType_t; val=CUDNN_DEFAULT_MATH (0);
i! arrayLength: type=int; val=2;
i! padA: type=int; val=[24,24];
i! strideA: type=int; val=[1,1];
i! dilationA: type=int; val=[1,1];
i! groupCount: type=int; val=1;
i! algo: type=cudnnConvolutionFwdAlgo_t; val=CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM (0);
i! workSpace: location=dev; addr=NULL_PTR;
i! workSpaceSizeInBytes: type=size_t; val=0;
i! beta: type=CUDNN_DATA_FLOAT; val=0.000000;
i! yDesc: type=cudnnTensorDescriptor_t:
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! nbDims: type=int; val=4;
i! dimA: type=int; val=[4,1,1000,1000];
i! strideA: type=int; val=[1000000,1000000,1000,1];
i! yData: location=dev; addr=0xfc3215100;
i! Time: 2019-03-14T15:43:38.356186 (0d+0h+0m+47s since start)
i! Process=7554; Thread=7580; GPU=0; Handle=0x7f181043a0; StreamId=0x77fddb0.
...
But the speed is the same as float32.
It seems something is wrong, because the TX2 has fp16 optimization.
Thanks.