TX2 cuDNN TRUE_HALF_CONFIG can't be faster than float32

Platform: TX2
JetPack3.3

cuDnn version: 7.1
tensorflow version: 1.11


This is my test code:

from __future__ import print_function

import tensorflow as tf
import numpy as np
import argparse
import os

FLAGS = None

def get_dtype():
  """Return the TensorFlow dtype selected by the --use_fp16 flag."""
  if FLAGS.use_fp16:
    return tf.float16
  return tf.float32

def benchmark():
  """Build a one-op convolution graph and run it FLAGS.times times.

  The input is a [batch, size, size, 1] random image and the filter is a
  fixed 50x50 single-channel kernel; both use the dtype from get_dtype().
  """
  graph = tf.Graph()
  with graph.as_default():
    # Random image and filter variables in the benchmark dtype.
    tf_input1 = tf.Variable(
        tf.truncated_normal([FLAGS.batch, FLAGS.size, FLAGS.size, 1],
                            dtype=get_dtype()))
    tf_input2 = tf.Variable(tf.truncated_normal([50, 50, 1, 1],
                                                dtype=get_dtype()))
    tf_output = tf.nn.convolution(tf_input1, tf_input2, padding='SAME')

  # Grow GPU memory on demand instead of grabbing it all up front.
  config = tf.ConfigProto()
  config.gpu_options.allow_growth = True
  with tf.Session(graph=graph, config=config) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    # Repeat the forward pass; results are discarded, only timing matters.
    for _ in range(FLAGS.times):
      session.run([tf_output])
    print("Done")

def parse():
  global FLAGS
  os.environ['TF_FP16_CONV_MODE'] = 'FAST'
  print(os.environ['TF_FP16_CONV_MODE'])
  parser = argparse.ArgumentParser()
  parser.add_argument('--use_fp16', default=False, help='use FP16 for benchmarking', action='store_true')
  parser.add_argument('--size', default=4096, type=int, help='size of matrices to multiply')
  parser.add_argument('--times', default=10, type=int, help='amount of multiplications')
  parser.add_argument('--batch', default=1, type=int, help='batch size')
  FLAGS, unparsed = parser.parse_known_args()

# Entry point: parse CLI flags, then run the convolution benchmark.
if __name__ == "__main__":
  parse()
  benchmark()

Use cmd:
nvprof python3.5 fp16.py --use_fp16 --size 1000

And I get the cuDNN log:

I! CuDNN (v7105) function cudnnSetConvolutionMathType() called:
i!     mathType: type=cudnnMathType_t; val=CUDNN_DEFAULT_MATH (0);
i! Time: 2019-03-14T15:43:38.355962 (0d+0h+0m+47s since start)
i! Process=7554; Thread=7580; GPU=NULL; Handle=NULL; StreamId=NULL.


I! CuDNN (v7105) function cudnnGetConvolutionForwardWorkspaceSize() called:
i!     handle: type=cudnnHandle_t; streamId=0x77fddb0;
i!     xDesc: type=cudnnTensorDescriptor_t:
i!         dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!         nbDims: type=int; val=4;
i!         dimA: type=int; val=[4,1,1001,1001];
i!         strideA: type=int; val=[1002001,1002001,1001,1];
i!     wDesc: type=cudnnFilterDescriptor_t:
i!         dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!         vect: type=int; val=0;
i!         nbDims: type=int; val=4;
i!         dimA: type=int; val=[1,1,50,50];
i!         format: type=cudnnTensorFormat_t; val=CUDNN_TENSOR_NCHW (0);
i!     convDesc: type=cudnnConvolutionDescriptor_t:
i!         mode: type=cudnnConvolutionMode_t; val=CUDNN_CROSS_CORRELATION (1);
i!         dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!         mathType: type=cudnnMathType_t; val=CUDNN_DEFAULT_MATH (0);
i!         arrayLength: type=int; val=2;
i!         padA: type=int; val=[24,24];
i!         strideA: type=int; val=[1,1];
i!         dilationA: type=int; val=[1,1];
i!         groupCount: type=int; val=1;
i!     yDesc: type=cudnnTensorDescriptor_t:
i!         dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!         nbDims: type=int; val=4;
i!         dimA: type=int; val=[4,1,1000,1000];
i!         strideA: type=int; val=[1000000,1000000,1000,1];
i!     algo: type=cudnnConvolutionFwdAlgo_t; val=CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM (0);
i! Time: 2019-03-14T15:43:38.356034 (0d+0h+0m+47s since start)
i! Process=7554; Thread=7580; GPU=0; Handle=0x7f181043a0; StreamId=0x77fddb0.


I! CuDNN (v7105) function cudnnConvolutionForward() called:
i!     handle: type=cudnnHandle_t; streamId=0x77fddb0;
i!     alpha: type=CUDNN_DATA_FLOAT; val=1.000000;
i!     xDesc: type=cudnnTensorDescriptor_t:
i!         dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!         nbDims: type=int; val=4;
i!         dimA: type=int; val=[4,1,1001,1001];
i!         strideA: type=int; val=[1002001,1002001,1001,1];
i!     xData: location=dev; addr=0xfc2a70000;
i!     wDesc: type=cudnnFilterDescriptor_t:
i!         dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!         vect: type=int; val=0;
i!         nbDims: type=int; val=4;
i!         dimA: type=int; val=[1,1,50,50];
i!         format: type=cudnnTensorFormat_t; val=CUDNN_TENSOR_NCHW (0);
i!     wData: location=dev; addr=0xfc1871900;
i!     convDesc: type=cudnnConvolutionDescriptor_t:
i!         mode: type=cudnnConvolutionMode_t; val=CUDNN_CROSS_CORRELATION (1);
i!         dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!         mathType: type=cudnnMathType_t; val=CUDNN_DEFAULT_MATH (0);
i!         arrayLength: type=int; val=2;
i!         padA: type=int; val=[24,24];
i!         strideA: type=int; val=[1,1];
i!         dilationA: type=int; val=[1,1];
i!         groupCount: type=int; val=1;
i!     algo: type=cudnnConvolutionFwdAlgo_t; val=CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM (0);
i!     workSpace: location=dev; addr=NULL_PTR;
i!     workSpaceSizeInBytes: type=size_t; val=0;
i!     beta: type=CUDNN_DATA_FLOAT; val=0.000000;
i!     yDesc: type=cudnnTensorDescriptor_t:
i!         dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!         nbDims: type=int; val=4;
i!         dimA: type=int; val=[4,1,1000,1000];
i!         strideA: type=int; val=[1000000,1000000,1000,1];
i!     yData: location=dev; addr=0xfc3215100;
i! Time: 2019-03-14T15:43:38.356186 (0d+0h+0m+47s since start)
i! Process=7554; Thread=7580; GPU=0; Handle=0x7f181043a0; StreamId=0x77fddb0.
...

But the speed is the same as float32.
It seems something is wrong, because the TX2 has fp16 optimization.

Thanks.