cuDNN 8 has a ~50x performance regression in cudnnConvolutionBiasActivationForward on the Jetson TX2.
Results from the minimal example below:
CuDNNv7 non-fused: 1.490 s = 518.8 GF
CuDNNv8 non-fused: 1.478 s = 523.1 GF
CuDNNv7 fused : 1.015 s = 761.5 GF
CuDNNv8 fused : 48.168 s = 16.0 GF
For comparison, results on a GTX 1070 with cuDNN 8.1 and CUDA 11.2:
CuDNNv8 non-fused: 0.146 s = 5290.1 GF
CuDNNv8 fused : 0.094 s = 8185.8 GF
Switching to the non-fused path as a workaround is less dramatic, but it still leaves a substantial performance regression: 523.1 GF with cuDNN 8 non-fused versus 761.5 GF with the cuDNN 7 fused path.
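For code that has to run against both library versions, the fused call can also be gated at runtime instead of with the compile-time #ifdef used below. A minimal sketch (assuming the usual cuDNN version encoding major*1000 + minor*100 + patch, so e.g. 7.6.5 reports 7605 and 8.1.0 reports 8100):

//runtime gate using cudnnGetVersion() from cudnn.h: true where the fused
//conv+bias+activation path was fast in the measurements above (cuDNN 7);
//on cuDNN 8 fall back to the three-call sequence instead
static bool fused_path_ok(void)
{
    return cudnnGetVersion() < 8000;
}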
Minimal example:
//nvcc -arch=sm_62 -o cudnn_test_nonfused_7 cudnn_test.cpp -l:libcudnn.so.7
//nvcc -arch=sm_62 -o cudnn_test_nonfused_8 cudnn_test.cpp -l:libcudnn.so.8
//nvcc -arch=sm_62 -DCONV_FUSE -o cudnn_test_fused_7 cudnn_test.cpp -l:libcudnn.so.7
//nvcc -arch=sm_62 -DCONV_FUSE -o cudnn_test_fused_8 cudnn_test.cpp -l:libcudnn.so.8
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include <cudnn.h>
#define CHK(f) do { int rc= f; if (rc){ printf("%s:%d: err %d\n",__FILE__,__LINE__,rc); return rc; }} while(0)
int main(int argc, char *argv[])
{
const int batsz=256, height= 64, width= 64, nfilt=64, nchan=64, fsz=3, pad=1, stride=1;
const float _one = 1, _zero = 0;
const cudnnDataType_t _dtype = CUDNN_DATA_FLOAT;
const cudnnDataType_t _ctype = CUDNN_DATA_FLOAT;
const cudnnTensorFormat_t _format = CUDNN_TENSOR_NCHW;
const cudnnConvolutionMode_t _mode = CUDNN_CROSS_CORRELATION;
const cudnnNanPropagation_t _nan_prop = CUDNN_NOT_PROPAGATE_NAN;
unsigned char *X, *Y, *W, *T, *b; //input, output, filter weights, workspace, bias
cudnnHandle_t _cudnn_h = nullptr;
cudnnTensorDescriptor_t Xt, Yt, bt;
cudnnFilterDescriptor_t Wt;
cudnnConvolutionDescriptor_t op;
cudnnConvolutionFwdAlgo_t algo;
cudnnActivationDescriptor_t actf;
cudnnConvolutionFwdAlgoPerf_t perfResults[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
size_t wssz;
int ydim[4];
int nperfResult=0;
timeval dt[2];
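//allocate device buffers; the W allocation is padded by nfilt*4 bytes so the bias b can share its tail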
CHK(cudaMalloc(&X, batsz * nchan * height*stride * width*stride * 4));
CHK(cudaMalloc(&Y, batsz * nfilt * height * width * 4));
CHK(cudaMalloc(&W, nfilt * nchan * fsz * fsz *4 + nfilt *4));
b = W + nfilt*nchan*fsz*fsz*4;
CHK(cudaMemset(X, 0x3c, batsz * nchan * height*stride * width*stride *4)); //byte pattern 0x3c: ~0.0115 as float, 1.059 as half
CHK(cudaMemset(Y, 0x00, batsz * nfilt * height * width *4));
CHK(cudaMemset(W, 0x3c, nfilt * nchan * fsz * fsz *4 + nfilt *4));
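//set up descriptors for a fsz x fsz, pad=1, stride=1 cross-correlation in NCHW fp32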
CHK( cudnnCreate(&_cudnn_h) );
CHK( cudnnCreateTensorDescriptor(&Xt) );
CHK( cudnnCreateTensorDescriptor(&Yt) );
CHK( cudnnCreateTensorDescriptor(&bt) );
CHK( cudnnCreateFilterDescriptor(&Wt) );
CHK( cudnnCreateConvolutionDescriptor(&op) );
CHK( cudnnCreateActivationDescriptor(&actf) );
CHK( cudnnSetTensor4dDescriptor(Xt, _format, _dtype, batsz, nchan, height*stride, width*stride) );
CHK( cudnnSetFilter4dDescriptor(Wt, _dtype, _format, nfilt,nchan,fsz,fsz) );
CHK( cudnnSetTensor4dDescriptor(bt, _format, _dtype, 1,nfilt,1,1) );
CHK( cudnnSetActivationDescriptor(actf, CUDNN_ACTIVATION_RELU, _nan_prop, 0.) );
CHK( cudnnSetConvolution2dDescriptor(op, pad,pad , stride,stride , 1,1 , _mode, _ctype) );
CHK( cudnnGetConvolution2dForwardOutputDim(op, Xt, Wt, ydim+0,ydim+1,ydim+2,ydim+3) );
CHK( cudnnSetTensor4dDescriptor(Yt, _format, _dtype, ydim[0],ydim[1],ydim[2],ydim[3]) );
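//pick the forward algorithm: a fixed one from argv[1], or the fastest found by cudnnFind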
if (argc > 1) {
algo = (cudnnConvolutionFwdAlgo_t)atoi(argv[1]);
CHK( cudnnGetConvolutionForwardWorkspaceSize(_cudnn_h, Xt, Wt, op, Yt, algo, &wssz) );
} else {
CHK( cudnnFindConvolutionForwardAlgorithm(_cudnn_h, Xt, Wt, op, Yt,
CUDNN_CONVOLUTION_FWD_ALGO_COUNT, &nperfResult, perfResults) );
algo = perfResults->algo;
wssz = perfResults->memory;
}
CHK( cudaMalloc(&T, wssz) );
CHK( cudaDeviceSynchronize() );
gettimeofday(dt,nullptr);
const int nrep = 10;
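//time nrep iterations of conv + bias + ReLU, fused or as three separate calls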
for (int rep=0; rep < nrep; rep++) {
#ifdef CONV_FUSE
CHK( cudnnConvolutionBiasActivationForward(_cudnn_h, &_one, Xt,X, Wt,W, op,algo,
T,wssz, &_zero, Yt,Y, bt,b, actf, Yt,Y) );
#else
CHK( cudnnConvolutionForward(_cudnn_h, &_one, Xt,X, Wt,W, op, algo, T,wssz, &_zero, Yt,Y) );
CHK( cudnnAddTensor(_cudnn_h, &_one,bt,b, &_one,Yt,Y) );
CHK( cudnnActivationForward(_cudnn_h, actf, &_one, Yt,Y, &_zero, Yt,Y) );
#endif
}
CHK( cudaDeviceSynchronize() );
gettimeofday(dt+1,nullptr);
timersub(dt+1,dt,dt);
double dtime = dt->tv_sec + dt->tv_usec * 1E-6;
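//FLOPs for the convolution only: 2 * N * K * H * W * C * R * S (bias add and ReLU ignored)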
double gflop = 1E-9 * nrep * batsz * nfilt * height/stride * width/stride * nchan * fsz * fsz * 2;
printf("CuDNNv%d "
#ifdef CONV_FUSE
"fused "
#else
"non-fused"
#endif
": %6.3f s = %5.1f GF\n", CUDNN_MAJOR, dtime, gflop/dtime);
return 0;
}
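Passing an integer as argv[1] pins the forward algorithm instead of letting cudnnFindConvolutionForwardAlgorithm pick the fastest one, which makes it easy to check whether a specific algorithm sidesteps the regression. For example (values follow the cudnnConvolutionFwdAlgo_t enum in cudnn.h, e.g. 1 = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM):
//./cudnn_test_fused_8      algorithm chosen by cudnnFind
//./cudnn_test_fused_8 1    force IMPLICIT_PRECOMP_GEMM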