Hi, I think I’m seeing a regression in cuDNN that’s caused by convolution algorithm selection heuristics.
Background:
- I received a report of a perf regression (on V100) when comparing cuda11+cudnn8 against cuda10+cudnn7.
- I can reproduce the regression on a GTX1080Ti comparing cuda10.2+cudnn8.0.2 vs cuda10.2+cudnn7.6.5. cudnn8.0.2 is 5%~10% slower.
- Our TensorFlow job requires variable-sized input, so it uses TF_CUDNN_USE_AUTOTUNE=0 (because autotune would be slow). If we change the job to use fixed-size input and enable autotune, the regression disappears.
Therefore I think the change in algorithm selection heuristics causes the perf regression.
I managed to take a few layers from the model that suffers the most from the regression, and make a repro script below.
The script is tested on a GTX1080Ti with TensorFlow 2.3 (built with v1 compat mode), tensorpack (installed via `pip install tensorpack`), cuda10.2, and cudnn7.6.5 vs cudnn8.0.2.
The results are as follows:
cudnn7.6.5, TF_CUDNN_USE_AUTOTUNE=0: 30.2it/s
cudnn7.6.5, TF_CUDNN_USE_AUTOTUNE=1: 41.7it/s
cudnn8.0.2, TF_CUDNN_USE_AUTOTUNE=0: 26.9it/s
cudnn8.0.2, TF_CUDNN_USE_AUTOTUNE=1: 39.4it/s
The results show that: (1) the algorithm selection heuristics are much worse than optimal, and (2) the algorithm selection heuristics have a roughly 10% regression in cudnn8.
Note that TensorFlow switched from cudnnGetConvolutionForwardAlgorithm to cudnnGetConvolutionForwardAlgorithm_v7 in cudnn8 because the old API was removed. So the regression may actually be between these two APIs rather than between v7 and v8, but I have not verified this.
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()
import numpy as np
import time
from tensorpack.models import Conv2D, MaxPooling
from tensorpack import argscope
def fpn_model(features):
    """Build a feature-pyramid head on top of four input feature maps.

    Each input goes through a 1x1 "lateral" convolution; the laterals are
    then merged from the last input back to the first by adding a 2x
    nearest-neighbour upsampled copy of the previous merged map, and every
    merged map is passed through a 3x3 "posthoc" convolution.

    Args:
        features: list of exactly four tensors. They are handled as
            channels-first (see the ``data_format`` arguments below).

    Returns:
        A list of five tensors: the four 3x3 convolution outputs in the same
        order as ``features``, followed by a stride-2 max-pooling of the
        last of them.
    """
    assert len(features) == 4, features
    num_channel = 256

    def upsample2x(name, x):
        # The resize op is applied on a channels-last view, so transpose in
        # and back out around it.
        with tf.name_scope(name):
            spatial = tf.shape(x)[2:]
            channels_last = tf.transpose(x, [0, 2, 3, 1])
            channels_last = tf.image.resize_images(
                channels_last, spatial * 2, 'nearest')
            return tf.transpose(channels_last, [0, 3, 1, 2])

    with argscope(Conv2D, data_format='channels_first',
                  activation=tf.identity, use_bias=True):
        laterals = [
            Conv2D('lateral_1x1_c{}'.format(level), feat, num_channel, 1)
            for level, feat in enumerate(features, start=2)
        ]

        # Merge starting from the last input; every later entry adds the
        # upsampled result of the entry merged just before it.
        merged = []
        for step, lateral in enumerate(reversed(laterals)):
            if step > 0:
                lateral = lateral + upsample2x(
                    'upsample_lat{}'.format(6 - step), merged[-1])
            merged.append(lateral)

        pyramid = [
            Conv2D('posthoc_3x3_p{}'.format(level), feat, num_channel, 3)
            for level, feat in enumerate(reversed(merged), start=2)
        ]
        extra_level = MaxPooling(
            'maxpool_p6', pyramid[-1], pool_size=1, strides=2,
            data_format='channels_first', padding='VALID')
        return pyramid + [extra_level]
def get_data(name, shape):
    """Create a variable and expose it through a partially-shaped placeholder.

    Args:
        name: name passed to ``tf.get_variable``.
        shape: sequence of at least four dimensions used to create the
            variable. It is not modified.

    Returns:
        The tensor from ``tf.placeholder_with_default`` whose default value
        is the variable and whose static shape has entries 2 and 3 set to
        ``None``.
    """
    var = tf.get_variable(name, shape=shape)
    # Work on a copy so the caller's list is not mutated as a side effect;
    # this also lets callers pass a tuple.
    dynamic_shape = list(shape)
    dynamic_shape[2] = None
    dynamic_shape[3] = None
    # use this trick to clear static shapes and prevent some optimization
    return tf.placeholder_with_default(var, shape=dynamic_shape)
def model():
    """Assemble the benchmark graph and return its training op.

    Four inputs are created with ``get_data``, fed through ``fpn_model``,
    and the means of all outputs are summed into a scalar loss that is
    minimized with plain gradient descent (learning rate 0.01).

    Returns:
        The op produced by ``GradientDescentOptimizer.minimize``.
    """
    input_specs = [
        ('x1', [1, 256, 200, 304]),
        ('x2', [1, 512, 100, 152]),
        ('x3', [1, 1024, 50, 76]),
        ('x4', [1, 2048, 25, 38]),
    ]
    c2345 = [get_data(name, shape) for name, shape in input_specs]
    outputs = fpn_model(c2345)
    loss = tf.add_n([tf.reduce_mean(t) for t in outputs])
    optimizer = tf.train.GradientDescentOptimizer(0.01)
    return optimizer.minimize(loss, var_list=tf.trainable_variables())
def benchmark(op, nr_iter=300, nr_warmup=10):
    """Measure how many times per second ``op.run()`` can be called.

    Args:
        op: object exposing a no-argument ``run()`` method.
        nr_iter: number of timed calls.
        nr_warmup: number of untimed calls made before timing starts.

    Returns:
        Timed iterations per second, as a float.
    """
    def _repeat(times):
        for _ in range(times):
            op.run()

    # Warm-up calls are excluded from the measurement.
    _repeat(nr_warmup)

    t0 = time.perf_counter()
    _repeat(nr_iter)
    elapsed = time.perf_counter() - t0
    return nr_iter * 1.0 / elapsed
# Build the training graph, requesting placement on the first GPU.
with tf.device('/gpu:0'):
    train_op = model()

# Soft placement lets TensorFlow fall back to another device for ops that
# cannot be placed on the requested one.
config = tf.ConfigProto()
config.allow_soft_placement = True

# Using the session itself as the context manager installs it as the default
# session (benchmark() calls `op.run()` without passing one) and also closes
# it on exit, releasing its resources; `sess.as_default()` alone does not.
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    speed = benchmark(train_op)
print(speed)
I also parsed the debug output of the cuDNN API calls, and these are the convolution operators where v7 and v8 disagree on the algorithm:
v7 cudnnConvolutionForward
OrderedDict([('xDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('xDesc.dimA', '[1,256,25,38]'),
('xDesc.strideA', '[243200,950,38,1]'),
('wDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('wDesc.dimA', '[256,256,3,3]'),
('wDesc.format', 'CUDNN_TENSOR_NCHW (0)'),
('convDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('convDesc.padA', '[1,1]'),
('convDesc.strideA', '[1,1]'),
('convDesc.dilationA', '[1,1]'),
('convDesc.groupCount', '1'),
('algo', 'CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD (6)'),
('yDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('yDesc.dimA', '[1,256,25,38]'),
('yDesc.strideA', '[243200,950,38,1]')])
v8 cudnnConvolutionForward
OrderedDict([('xDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('xDesc.dimA', '[1,256,25,38]'),
('xDesc.strideA', '[243200,950,38,1]'),
('wDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('wDesc.dimA', '[256,256,3,3]'),
('wDesc.format', 'CUDNN_TENSOR_NCHW (0)'),
('convDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('convDesc.padA', '[1,1]'),
('convDesc.strideA', '[1,1]'),
('convDesc.dilationA', '[1,1]'),
('convDesc.groupCount', '1'),
('algo', 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM (2)'),
('yDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('yDesc.dimA', '[1,256,25,38]'),
('yDesc.strideA', '[243200,950,38,1]')])
--------------------
v7 cudnnConvolutionBackwardFilter
OrderedDict([('xDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('xDesc.dimA', '[1,256,50,76]'),
('xDesc.strideA', '[972800,3800,76,1]'),
('dyDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dyDesc.dimA', '[1,256,50,76]'),
('dyDesc.strideA', '[972800,3800,76,1]'),
('convDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('convDesc.padA', '[1,1]'),
('convDesc.strideA', '[1,1]'),
('convDesc.dilationA', '[1,1]'),
('convDesc.groupCount', '1'),
('algo', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 (0)'),
('dwDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dwDesc.dimA', '[256,256,3,3]'),
('dwDesc.format', 'CUDNN_TENSOR_NCHW (0)')])
v8 cudnnConvolutionBackwardFilter
OrderedDict([('xDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('xDesc.dimA', '[1,256,50,76]'),
('xDesc.strideA', '[972800,3800,76,1]'),
('dyDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dyDesc.dimA', '[1,256,50,76]'),
('dyDesc.strideA', '[972800,3800,76,1]'),
('convDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('convDesc.padA', '[1,1]'),
('convDesc.strideA', '[1,1]'),
('convDesc.dilationA', '[1,1]'),
('convDesc.groupCount', '1'),
('algo',
'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED (5)'),
('dwDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dwDesc.dimA', '[256,256,3,3]'),
('dwDesc.format', 'CUDNN_TENSOR_NCHW (0)')])
--------------------
v7 cudnnConvolutionBackwardFilter
OrderedDict([('xDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('xDesc.dimA', '[1,256,25,38]'),
('xDesc.strideA', '[243200,950,38,1]'),
('dyDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dyDesc.dimA', '[1,256,25,38]'),
('dyDesc.strideA', '[243200,950,38,1]'),
('convDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('convDesc.padA', '[1,1]'),
('convDesc.strideA', '[1,1]'),
('convDesc.dilationA', '[1,1]'),
('convDesc.groupCount', '1'),
('algo', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 (0)'),
('dwDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dwDesc.dimA', '[256,256,3,3]'),
('dwDesc.format', 'CUDNN_TENSOR_NCHW (0)')])
v8 cudnnConvolutionBackwardFilter
OrderedDict([('xDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('xDesc.dimA', '[1,256,25,38]'),
('xDesc.strideA', '[243200,950,38,1]'),
('dyDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dyDesc.dimA', '[1,256,25,38]'),
('dyDesc.strideA', '[243200,950,38,1]'),
('convDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('convDesc.padA', '[1,1]'),
('convDesc.strideA', '[1,1]'),
('convDesc.dilationA', '[1,1]'),
('convDesc.groupCount', '1'),
('algo',
'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED (5)'),
('dwDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dwDesc.dimA', '[256,256,3,3]'),
('dwDesc.format', 'CUDNN_TENSOR_NCHW (0)')])
--------------------
v7 cudnnConvolutionBackwardData
OrderedDict([('wDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('wDesc.dimA', '[256,256,1,1]'),
('wDesc.format', 'CUDNN_TENSOR_NCHW (0)'),
('dyDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dyDesc.dimA', '[1,256,200,304]'),
('dyDesc.strideA', '[15564800,60800,304,1]'),
('convDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('convDesc.padA', '[0,0]'),
('convDesc.strideA', '[1,1]'),
('convDesc.dilationA', '[1,1]'),
('convDesc.groupCount', '1'),
('algo', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 (1)'),
('dxDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dxDesc.dimA', '[1,256,200,304]'),
('dxDesc.strideA', '[15564800,60800,304,1]')])
v8 cudnnConvolutionBackwardData
OrderedDict([('wDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('wDesc.dimA', '[256,256,1,1]'),
('wDesc.format', 'CUDNN_TENSOR_NCHW (0)'),
('dyDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dyDesc.dimA', '[1,256,200,304]'),
('dyDesc.strideA', '[15564800,60800,304,1]'),
('convDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('convDesc.padA', '[0,0]'),
('convDesc.strideA', '[1,1]'),
('convDesc.dilationA', '[1,1]'),
('convDesc.groupCount', '1'),
('algo', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 (0)'),
('dxDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dxDesc.dimA', '[1,256,200,304]'),
('dxDesc.strideA', '[15564800,60800,304,1]')])
--------------------
v7 cudnnConvolutionBackwardFilter
OrderedDict([('xDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('xDesc.dimA', '[1,256,200,304]'),
('xDesc.strideA', '[15564800,60800,304,1]'),
('dyDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dyDesc.dimA', '[1,256,200,304]'),
('dyDesc.strideA', '[15564800,60800,304,1]'),
('convDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('convDesc.padA', '[0,0]'),
('convDesc.strideA', '[1,1]'),
('convDesc.dilationA', '[1,1]'),
('convDesc.groupCount', '1'),
('algo', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 (3)'),
('dwDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dwDesc.dimA', '[256,256,1,1]'),
('dwDesc.format', 'CUDNN_TENSOR_NCHW (0)')])
v8 cudnnConvolutionBackwardFilter
OrderedDict([('xDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('xDesc.dimA', '[1,256,200,304]'),
('xDesc.strideA', '[15564800,60800,304,1]'),
('dyDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dyDesc.dimA', '[1,256,200,304]'),
('dyDesc.strideA', '[15564800,60800,304,1]'),
('convDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('convDesc.padA', '[0,0]'),
('convDesc.strideA', '[1,1]'),
('convDesc.dilationA', '[1,1]'),
('convDesc.groupCount', '1'),
('algo', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 (0)'),
('dwDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dwDesc.dimA', '[256,256,1,1]'),
('dwDesc.format', 'CUDNN_TENSOR_NCHW (0)')])
--------------------
v7 cudnnConvolutionBackwardFilter
OrderedDict([('xDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('xDesc.dimA', '[1,512,100,152]'),
('xDesc.strideA', '[7782400,15200,152,1]'),
('dyDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dyDesc.dimA', '[1,256,100,152]'),
('dyDesc.strideA', '[3891200,15200,152,1]'),
('convDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('convDesc.padA', '[0,0]'),
('convDesc.strideA', '[1,1]'),
('convDesc.dilationA', '[1,1]'),
('convDesc.groupCount', '1'),
('algo', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 (3)'),
('dwDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dwDesc.dimA', '[256,512,1,1]'),
('dwDesc.format', 'CUDNN_TENSOR_NCHW (0)')])
v8 cudnnConvolutionBackwardFilter
OrderedDict([('xDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('xDesc.dimA', '[1,512,100,152]'),
('xDesc.strideA', '[7782400,15200,152,1]'),
('dyDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dyDesc.dimA', '[1,256,100,152]'),
('dyDesc.strideA', '[3891200,15200,152,1]'),
('convDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('convDesc.padA', '[0,0]'),
('convDesc.strideA', '[1,1]'),
('convDesc.dilationA', '[1,1]'),
('convDesc.groupCount', '1'),
('algo', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 (0)'),
('dwDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dwDesc.dimA', '[256,512,1,1]'),
('dwDesc.format', 'CUDNN_TENSOR_NCHW (0)')])
--------------------
v7 cudnnConvolutionBackwardFilter
OrderedDict([('xDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('xDesc.dimA', '[1,1024,50,76]'),
('xDesc.strideA', '[3891200,3800,76,1]'),
('dyDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dyDesc.dimA', '[1,256,50,76]'),
('dyDesc.strideA', '[972800,3800,76,1]'),
('convDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('convDesc.padA', '[0,0]'),
('convDesc.strideA', '[1,1]'),
('convDesc.dilationA', '[1,1]'),
('convDesc.groupCount', '1'),
('algo', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 (1)'),
('dwDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dwDesc.dimA', '[256,1024,1,1]'),
('dwDesc.format', 'CUDNN_TENSOR_NCHW (0)')])
v8 cudnnConvolutionBackwardFilter
OrderedDict([('xDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('xDesc.dimA', '[1,1024,50,76]'),
('xDesc.strideA', '[3891200,3800,76,1]'),
('dyDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dyDesc.dimA', '[1,256,50,76]'),
('dyDesc.strideA', '[972800,3800,76,1]'),
('convDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('convDesc.padA', '[0,0]'),
('convDesc.strideA', '[1,1]'),
('convDesc.dilationA', '[1,1]'),
('convDesc.groupCount', '1'),
('algo', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 (0)'),
('dwDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dwDesc.dimA', '[256,1024,1,1]'),
('dwDesc.format', 'CUDNN_TENSOR_NCHW (0)')])
--------------------
v7 cudnnConvolutionBackwardFilter
OrderedDict([('xDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('xDesc.dimA', '[1,2048,25,38]'),
('xDesc.strideA', '[1945600,950,38,1]'),
('dyDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dyDesc.dimA', '[1,256,25,38]'),
('dyDesc.strideA', '[243200,950,38,1]'),
('convDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('convDesc.padA', '[0,0]'),
('convDesc.strideA', '[1,1]'),
('convDesc.dilationA', '[1,1]'),
('convDesc.groupCount', '1'),
('algo', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 (1)'),
('dwDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dwDesc.dimA', '[256,2048,1,1]'),
('dwDesc.format', 'CUDNN_TENSOR_NCHW (0)')])
v8 cudnnConvolutionBackwardFilter
OrderedDict([('xDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('xDesc.dimA', '[1,2048,25,38]'),
('xDesc.strideA', '[1945600,950,38,1]'),
('dyDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dyDesc.dimA', '[1,256,25,38]'),
('dyDesc.strideA', '[243200,950,38,1]'),
('convDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('convDesc.padA', '[0,0]'),
('convDesc.strideA', '[1,1]'),
('convDesc.dilationA', '[1,1]'),
('convDesc.groupCount', '1'),
('algo', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 (0)'),
('dwDesc.dataType', 'CUDNN_DATA_FLOAT (0)'),
('dwDesc.dimA', '[256,2048,1,1]'),
('dwDesc.format', 'CUDNN_TENSOR_NCHW (0)')])
--------------------