Cudnn8 regression on TX2

cudnn8 has a ~50x performance regression in cudnnConvolutionBiasActivationForward on the Jetson TX2.
Results on the following minimal example:

CuDNNv7 non-fused:  1.490 s = 518.8 GF
CuDNNv8 non-fused:  1.478 s = 523.1 GF
CuDNNv7 fused    :  1.015 s = 761.5 GF
CuDNNv8 fused    : 48.168 s =  16.0 GF

Results on GTX 1070 with cudnn 8.1, cuda 11.2

CuDNNv8 non-fused: 0.146 s = 5290.1 GF
CuDNNv8 fused    : 0.094 s = 8185.8 GF

Switching to the non-fuesed path is less dramatic but still substantial performance regression.

minimal example:

//nvcc -arch=sm_62 -o cudnn_test_nonfused_7 cudnn_test.cpp -l:libcudnn.so.7
//nvcc -arch=sm_62 -o cudnn_test_nonfused_8 cudnn_test.cpp -l:libcudnn.so.8
//nvcc -arch=sm_62 -DCONV_FUSE -o cudnn_test_fused_7 cudnn_test.cpp -l:libcudnn.so.7
//nvcc -arch=sm_62 -DCONV_FUSE -o cudnn_test_fused_8 cudnn_test.cpp -l:libcudnn.so.8

#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include <cudnn.h>

#define CHK(f) do { int rc= f; if (rc){ printf("%s:%d: err %d\n",__FILE__,__LINE__,rc); return rc; }} while(0)

int main(int argc, char *argv[])
{
	const int batsz=256, height=  64, width=  64, nfilt=64, nchan=64, fsz=3, pad=1, stride=1;
	
	const float _one = 1, _zero = 0;
	
	const cudnnDataType_t        _dtype	 = CUDNN_DATA_FLOAT;
	const cudnnDataType_t        _ctype	 = CUDNN_DATA_FLOAT;
	const cudnnTensorFormat_t    _format = CUDNN_TENSOR_NCHW;
	const cudnnConvolutionMode_t _mode	 = CUDNN_CROSS_CORRELATION; 
	const cudnnNanPropagation_t  _nan_prop	= CUDNN_NOT_PROPAGATE_NAN;
	
	unsigned char *X, *Y, *W, *T, *b;
	cudnnHandle_t _cudnn_h = nullptr;
	cudnnTensorDescriptor_t Xt, Yt, bt;
	cudnnFilterDescriptor_t Wt;
	cudnnConvolutionDescriptor_t op;
	cudnnConvolutionFwdAlgo_t algo;
	cudnnActivationDescriptor_t actf;
	cudnnConvolutionFwdAlgoPerf_t perfResults[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];

	size_t wssz;
	int ydim[4];
	int nperfResult=0;
	timeval dt[2];
	
	CHK(cudaMalloc(&X, batsz * nchan * height*stride * width*stride * 4));
	CHK(cudaMalloc(&Y, batsz * nfilt * height        * width        * 4));
	CHK(cudaMalloc(&W, nfilt * nchan * fsz * fsz *4 + nfilt *4));
	b = W + nfilt*nchan*fsz*fsz*4;
	CHK(cudaMemset(X, 0x3c, batsz * nchan * height*stride * width*stride *4)); //1.059 half, 0.0115 float
	CHK(cudaMemset(Y, 0x00, batsz * nfilt * height        * width        *4));
	CHK(cudaMemset(W, 0x3c, nfilt * nchan * fsz * fsz *4 + nfilt *4));
	
	CHK( cudnnCreate(&_cudnn_h) );
	CHK( cudnnCreateTensorDescriptor(&Xt) );
	CHK( cudnnCreateTensorDescriptor(&Yt) );
	CHK( cudnnCreateTensorDescriptor(&bt) );
	CHK( cudnnCreateFilterDescriptor(&Wt) );
	CHK( cudnnCreateConvolutionDescriptor(&op) );
	CHK( cudnnCreateActivationDescriptor(&actf) );
	
	CHK( cudnnSetTensor4dDescriptor(Xt, _format, _dtype, batsz, nchan, height*stride, width*stride) );
	CHK( cudnnSetFilter4dDescriptor(Wt, _dtype, _format, nfilt,nchan,fsz,fsz) );
	CHK( cudnnSetTensor4dDescriptor(bt, _format, _dtype, 1,nfilt,1,1) );
	CHK( cudnnSetActivationDescriptor(actf, CUDNN_ACTIVATION_RELU, _nan_prop, 0.) );
	CHK( cudnnSetConvolution2dDescriptor(op, pad,pad , stride,stride , 1,1 , _mode, _ctype) );
	CHK( cudnnGetConvolution2dForwardOutputDim(op, Xt, Wt, ydim+0,ydim+1,ydim+2,ydim+3) );
	CHK( cudnnSetTensor4dDescriptor(Yt, _format, _dtype, ydim[0],ydim[1],ydim[2],ydim[3]) );

	if (argc > 1) {
		algo = (cudnnConvolutionFwdAlgo_t)atoi(argv[1]);
	    CHK( cudnnGetConvolutionForwardWorkspaceSize(_cudnn_h, Xt, Wt, op, Yt, algo, &wssz) );
	} else {
		CHK( cudnnFindConvolutionForwardAlgorithm(_cudnn_h, Xt, Wt, op, Yt,
				CUDNN_CONVOLUTION_FWD_ALGO_COUNT, &nperfResult, perfResults) );
		algo = perfResults->algo;
		wssz = perfResults->memory;
	}
	CHK( cudaMalloc(&T, wssz) );
	
	CHK( cudaDeviceSynchronize() );
	gettimeofday(dt,nullptr);
	const int nrep = 10;
	for (int rep=0; rep < nrep; rep++) {
#ifdef CONV_FUSE
		CHK( cudnnConvolutionBiasActivationForward(_cudnn_h, &_one, Xt,X, Wt,W, op,algo,
					T,wssz, &_zero, Yt,Y, bt,b, actf, Yt,Y) );
#else
		CHK( cudnnConvolutionForward(_cudnn_h, &_one, Xt,X, Wt,W, op, algo, T,wssz, &_zero, Yt,Y) );
		CHK( cudnnAddTensor(_cudnn_h, &_one,bt,b, &_one,Yt,Y) );
		CHK( cudnnActivationForward(_cudnn_h, actf, &_one, Yt,Y, &_zero, Yt,Y) );
#endif
	}
	CHK( cudaDeviceSynchronize() );
	gettimeofday(dt+1,nullptr);
	timersub(dt+1,dt,dt);
	double dtime = dt->tv_sec + dt->tv_usec * 1E-6;
	double gflop = 1E-9 * nrep * batsz * nfilt * height/stride * width/stride * nchan * fsz * fsz * 2;
	printf("CuDNNv%d "
#ifdef CONV_FUSE
		   "fused    "
#else
		   "non-fused"     
#endif
		   ": %6.3f s = %5.1f GF\n", CUDNN_MAJOR, dtime, gflop/dtime);
	
	return 0;
}

Hi,

Which cuDNN version do you use.
If you are not using the 8.0.5 8.0.0 version(JetPack4.5), would you mind to give it a try?

Thanks.

Thanks, that sounds promising.
I am using using
libcudnn8_8.0.0.180-1+cuda10.2_arm64.deb
only because I thought it was the latest.
I just updated sdkmanager from 1.4.0 to 1.4.1 and it still is not showing 8.0.5.
Any idea how I might download cudnn 8.0.5 ?

Hi,

Sorry that my previous information is not correct.
cuDNN v8.0.0 is the latest version for Jetson currently.

We can reproduce this issue in our environment and is working on it.
Will share more information with you later.

Thanks.

Hi,

Thanks for your patience.

We confirmed that this issue will be fixed in our next major release.
Please wait for our announcement for the update.

Fantastic. Thank you.