The convolution was slow in cuDNN 8.4.0 and 8.4.1

I ran this program on 8 GPUs. When a large amount of memory is allocated in the program, I noticed that the more iterations the while loop runs, the slower it gets; but when I uncomment the code snippet, the execution time in the while loop stays the same. The execution time also stays the same when I switch to cuDNN v8.2.2.
My environment:
GPU: RTX3090 x8
RAM: 512GB

The execution time with no new memory in the while loop:

8100ms
7900ms
8122ms
8200ms
8233ms
8099ms

The execution time with new large memory in the while loop:

8100ms
7900ms
8122ms
16000ms
24122ms
33000ms

Test file:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_fp16.h>
#include <cudnn.h>
#include <stdio.h>
#include <time.h>
typedef half mtype;
#define datatype CUDNN_DATA_HALF
#define MAX_Buffsz 1000000000


/*
 * Report a non-success cuDNN status code with its call site, then pause.
 *
 * t:    cuDNN status value (0 == CUDNN_STATUS_SUCCESS means nothing to report).
 * file: source file of the failing call (__FILE__ from the checkCUDNN macro).
 * line: source line of the failing call (__LINE__ from the checkCUDNN macro).
 *
 * Fix: the original message had no trailing newline and stdout was never
 * flushed, so on a line-buffered stream the error could stay invisible while
 * getchar() blocked waiting for input.
 */
void handleErr(int t, const char* file, int line)
{
	if (t == 0) { return; }	// CUDNN_STATUS_SUCCESS: nothing to do
	printf("err in cudnn %d %s %d\n", t, file, line);
	fflush(stdout);	// make sure the message is visible before we wait
	getchar();	// pause so the error isn't lost when the console window closes
}
// Check a cuDNN return status: on anything other than CUDNN_STATUS_SUCCESS,
// forward the status plus the call site (__FILE__/__LINE__) to handleErr.
// NOTE(review): the expansion is a plain brace block (not do{...}while(0)), so
// `checkCUDNN(x);` directly before an `else` would not compile — harmless in
// this test program, where it is only used as a standalone statement.
#define checkCUDNN(status) {										\
	if (status != CUDNN_STATUS_SUCCESS) {							\
	 handleErr(status,__FILE__,__LINE__);}							\
}
// Standalone repro for the reported cuDNN 8.4.x forward-convolution slowdown:
// each instance owns one cuDNN handle, one set of descriptors and device
// buffers, and repeatedly reconfigures + runs a randomly shaped half-precision
// 2D convolution, printing the wall-clock time of every 1000 iterations.
class myTest {
public:
	cudnnHandle_t _cudnnHandle;
	cudnnDataType_t _dataType;
	// Convolution geometry: batch, in-channels, depth/height/width, filter
	// window, padding, stride, out-channels/dims, dilation, groups, pad mode.
	// The *D (depth) members exist for 3D but are forced to 1 below.
	int _n, _InC, _d, _h, _w, _winSzD, _winSzH, _winSzW, _padD, _padH, _padW, _strideD, _strideH, _strideW, _OutC, _OutD, _OutH, _OutW, _dilationD, _dilationH, _dilationW, _group, _padMode;
	cudnnTensorDescriptor_t _srcTensorDesc, _dstTensorDesc;
	cudnnFilterDescriptor_t _filterDesc;
	cudnnConvolutionDescriptor_t _convDesc;
	cudnnConvolutionFwdAlgoPerf_t _algoFwd;	// algorithm chosen by the last setConvolutionTensor() call
	mtype* srcData, * filterData, * dstData, * buff;	// device pointers; buff = workspace of MAX_Buffsz bytes
	size_t szSrc, szfilter, szDst;	// buffer sizes in elements (not bytes)

	// Select the best entry from the perf list returned by cudnnFind*AlgorithmEx.
	// Three passes, each weaker than the last:
	//   1) success + (tensor-op math when mType is half/bfloat16) + deterministic
	//      + workspace below limitMem;
	//   2) same but allowing non-deterministic algorithms (prints a warning);
	//   3) any successful algorithm regardless of memory (prints a warning).
	// Returns an index into perfs; 0 when nothing matched at all (error printed).
	template <typename algoPerf_t>
	int getBestAlgoIndex(algoPerf_t* perfs, int retAlgoCount, size_t limitMem, cudnnDataType_t mType) {
		int algoFlag = 0;
		int bestPerfIndex = 0;
		// Tensor-op math is only demanded for the 16-bit float types.
		int flag = (mType == CUDNN_DATA_HALF) || (mType == CUDNN_DATA_BFLOAT16);
		for (int i = 0; i < retAlgoCount; i++) {
			if (perfs[i].status == CUDNN_STATUS_SUCCESS &&
				(flag ? perfs[i].mathType == CUDNN_TENSOR_OP_MATH : 1) &&
				perfs[i].determinism == CUDNN_DETERMINISTIC && (!limitMem || perfs[i].memory < limitMem)) {
				algoFlag = 1; bestPerfIndex = i; break;
			}
		}
		if (algoFlag == 0) {
			// Pass 2: drop the determinism requirement.
			for (int i = 0; i < retAlgoCount; i++) {
				if (perfs[i].status == CUDNN_STATUS_SUCCESS &&
					(flag ? perfs[i].mathType == CUDNN_TENSOR_OP_MATH : 1) &&
					(!limitMem || perfs[i].memory < limitMem)) {
					algoFlag = 1; bestPerfIndex = i; break;
				}
			}
			if (algoFlag == 1) { printf(" algo found but NOT DETERMINISTIC "); }
		}
		if (algoFlag == 0) {
			// Pass 3: take anything that succeeded, even over the memory limit.
			for (int i = 0; i < retAlgoCount; i++) {
				if (perfs[i].status == CUDNN_STATUS_SUCCESS) {
					algoFlag = 1; bestPerfIndex = i; break;
				}
			}
			if (algoFlag == 1) { printf(" algo found but not enough memory"); }
		}
		if (algoFlag == 0) {
			printf("ERR: algo not found");
			//system("pause"); 
		}
		return bestPerfIndex;
	}
	// (Re)build all descriptors for a 2D convolution, (re)allocate the
	// src/dst/filter device buffers to match, then benchmark all forward
	// algorithms with cudnnFindConvolutionForwardAlgorithmEx and cache the best
	// in _algoFwd. Depth parameters are accepted but forced to 1 (2D only).
	// OutD/OutH/OutW may be passed as 0; non-zero values are checked against the
	// dims cuDNN computes. NOTE(review): cudaMalloc/cudaFree return codes are
	// not checked anywhere in this method.
	void setConvolutionTensor(int n, int InC, int d, int h, int w, int winSzD, int winSzH, int winSzW, int padD, int padH, int padW, int strideD,
		int strideH, int strideW, int OutC, int OutD, int OutH, int OutW, int dilationD, int dilationH, int dilationW, int group, int padMode, cudnnDataType_t comtype)
	{

		_dataType = comtype;
		_n = n; _InC = InC; _h = h; _w = w; _d = d, _winSzW = winSzW; _winSzH = winSzH; _winSzD = winSzD, _padW = padW, _padH = padH, _padD = padD;
		_strideD = strideD; _strideW = strideW; _strideH = strideH; _OutC = OutC; _dilationW = dilationW; _dilationH = dilationH; _dilationD = dilationD, _group = group;

		// Force 2D: all depth-related dims collapse to 1.
		_OutD = 1; _d = 1; _winSzD = 1;
		int dimSrc[4] = { n,InC,h,w };
		int strideSrc[4] = { h * w * InC, h * w, w,1 };	// densely packed NCHW strides
		checkCUDNN(cudnnSetTensorNdDescriptor(_srcTensorDesc, _dataType, 4, dimSrc, strideSrc));
		int filterA[4] = { _OutC, _InC / _group,_winSzH,_winSzW };
		checkCUDNN(cudnnSetFilterNdDescriptor(_filterDesc, _dataType, CUDNN_TENSOR_NCHW, 4, filterA));

		int padA[2] = { _padH,_padW };
		int strideA[2] = { _strideH,_strideW };
		int dilationA[2] = { _dilationH,_dilationW };
		// bfloat16 convolutions are described with float compute type.
		cudnnDataType_t convType = _dataType;
		if (convType == CUDNN_DATA_BFLOAT16) {
			convType = CUDNN_DATA_FLOAT;
		}
		checkCUDNN(cudnnSetConvolutionNdDescriptor(_convDesc, 2, padA, strideA, dilationA, CUDNN_CROSS_CORRELATION, convType));
		cudaDeviceSynchronize();

#if CUDNN_VERSION > 7000
		// Request tensor-core math where the data type allows it.
		if (_dataType == CUDNN_DATA_HALF || _dataType == CUDNN_DATA_BFLOAT16) {
			checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_TENSOR_OP_MATH));
		}
		else if (_dataType == CUDNN_DATA_FLOAT) {
			checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
		}
		else {
			checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_DEFAULT_MATH));
		}
#endif
		cudnnSetConvolutionGroupCount(_convDesc, group);

		cudaDeviceSynchronize();
		// Let cuDNN compute the output dims, then describe the dst tensor.
		int outDim[4] = { 0 };
		checkCUDNN(cudnnGetConvolutionNdForwardOutputDim(_convDesc, _srcTensorDesc, _filterDesc, 4, outDim));
		n = outDim[0];
		_OutC = outDim[1];
		_OutH = outDim[2];
		_OutW = outDim[3];

		// NOTE(review): dims use the OutC parameter while strides use the
		// computed _OutC — presumably always equal here; TODO confirm.
		int dimDst[4] = { n,OutC,_OutH,_OutW };
		int strideDst[4] = { _OutH * _OutW * _OutC, _OutH * _OutW,_OutW,1 };
		checkCUDNN(cudnnSetTensorNdDescriptor(_dstTensorDesc, _dataType, 4, dimDst, strideDst));
		if ((OutH != _OutH && OutH != 0) || (OutW != _OutW && OutW != 0)) { printf("err in comvDim"); }
		// Release the previous buffers (if any) and size new ones for this shape.
		if (srcData) { cudaFree(srcData); }
		if (dstData) { cudaFree(dstData); }
		if (filterData) { cudaFree(filterData); }
		szSrc = n * InC * h * w;
		szDst = n * OutC * _OutH * _OutW;
		szfilter = InC * OutC / group * winSzH * winSzW;
		cudaMalloc(&srcData, szSrc * sizeof(mtype));
		cudaMalloc(&dstData, szDst * sizeof(mtype));
		cudaMalloc(&filterData, szfilter * sizeof(mtype));
		// Benchmark every forward algorithm using the preallocated workspace
		// and remember the best candidate.
		cudnnConvolutionFwdAlgoPerf_t perfs[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
		int retAlgoCount = -1;
		checkCUDNN(cudnnFindConvolutionForwardAlgorithmEx(_cudnnHandle,
			_srcTensorDesc, srcData, _filterDesc, filterData, _convDesc, _dstTensorDesc,
			dstData, CUDNN_CONVOLUTION_FWD_ALGO_COUNT, &retAlgoCount, perfs, buff, MAX_Buffsz));
		_algoFwd = perfs[getBestAlgoIndex(perfs, retAlgoCount, MAX_Buffsz, _dataType)];
	}

	// Driver: bind this instance to CUDA device i, create the handle and
	// descriptors, then loop forever running randomly shaped convolutions and
	// printing the elapsed time of every 1000 iterations (this is the number
	// that grows on the affected cuDNN versions).
	int mymain(int i) {
		cudaSetDevice(i);
		srcData = 0; dstData = 0; filterData = 0;	// so setConvolutionTensor's free-guards are safe on the first call
		cudnnCreate(&_cudnnHandle);
		checkCUDNN(cudnnCreateTensorDescriptor(&_srcTensorDesc));
		checkCUDNN(cudnnCreateTensorDescriptor(&_dstTensorDesc));
		checkCUDNN(cudnnCreateFilterDescriptor(&_filterDesc));
		checkCUDNN(cudnnCreateConvolutionDescriptor(&_convDesc));
		cudaMalloc(&buff, MAX_Buffsz);	// ~1 GB workspace, reused for every convolution
		int n = 1, c = 256, h = 256, w = 256, oc = 256, winSz = 5, stride = 1;
		int group = 1;// oc / 32;
		setConvolutionTensor(n, c, 1, h, w, 1, winSz, winSz, 0, (winSz - 1) / 2, (winSz - 1) / 2, 1,
			stride, stride, oc, 1, 0, 0, 1, 1, 1, group, 0, datatype);
		// Host staging buffers (never deleted — tolerable in this repro).
		mtype* x = new mtype[szSrc];
		mtype* y = new mtype[szfilter];
		mtype* z = new mtype[szDst];
		for (int i = 0; i < szSrc; i++) {
			x[i] = 1.0;
		}
		for (int i = 0; i < szfilter; i++) {
			y[i] = 1.0;
		}

		cudaMemcpy(srcData, x, szSrc * sizeof(mtype), cudaMemcpyHostToDevice);
		cudaMemcpy(filterData, y, szfilter * sizeof(mtype), cudaMemcpyHostToDevice);
		// NOTE(review): these memsets zero the data that was just uploaded;
		// only timing is measured here, so values are irrelevant — confirm intent.
		cudaMemset(srcData, 0, szSrc * sizeof(mtype));
		cudaMemset(dstData, 0, szDst * sizeof(mtype));
		cudaMemset(filterData, 0, szfilter * sizeof(mtype));
		float alpha = 1, beta = 0;
		cudaDeviceSynchronize();
		size_t wsSize;
		size_t sss = 0;	// iteration counter
		int s1 = clock();
		// Uncommenting this block allocates ~1 GB of host memory (50000 rows of
		// 5000 ints); per the report above, that is what triggers the growing
		// iteration times on cuDNN 8.4.0/8.4.1.
		//start comment
		//int** tbf = new int* [50000];
		//for (int i = 0; i < 50000; i++) {
		//	tbf[i] = new int[5000];
		//	memset(tbf[i], 1, 20000);
		//}
		//endcomment
		while (1) {
			// Random shape: channel counts in multiples of 32 capped at 256,
			// window 1/3/5, groups a power of two up to 32, stride 1-2.
			int cc = (rand() % 8 + 1) * 32;
			int fact = (rand() % 5 + 1);
			int fp = rand() % 2;
			int inc = cc * (fp ? fact : 1);
			int outc = cc * (fp ? 1 : fact);
			if (inc > 256) { inc = 256; }
			if (outc > 256) { outc = 256; }
			winSz = (rand() % 3) * 2 + 1;
			group = 1 << (rand() % 6);
			stride = 1 + (rand() % 2);
			if (winSz == 1) { stride = 1; }
			setConvolutionTensor(n, inc, 1, (rand() % 15 + 1) * 32, (rand() % 15 + 1) * 32, 1, winSz, winSz, 0, (winSz - 1) / 2, (winSz - 1) / 2, 1,
				stride, stride, outc, 1, 0, 0, 1, 1, 1, group, 0, datatype);
			checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(_cudnnHandle, _srcTensorDesc, _filterDesc, _convDesc,
				_dstTensorDesc, _algoFwd.algo, &wsSize));
			checkCUDNN(cudnnConvolutionForward(_cudnnHandle, &alpha, _srcTensorDesc, srcData, _filterDesc, filterData, _convDesc, _algoFwd.algo,
				buff, MAX_Buffsz, &beta, _dstTensorDesc, dstData));
			cudaDeviceSynchronize();
			sss++;
			// Report wall-clock time for each batch of 1000 iterations.
			if (sss % 1000 == 0) {
				int s2 = clock();
				printf("%d\n", s2 - s1);
				s1 = s2;
			}
		}
		// NOTE(review): everything below is unreachable — the while(1) above
		// never breaks. Kept byte-for-byte as posted.
		int sumtime = 0;
		for (int i = 0; i < 100; i++) {
			int s1 = clock();
			checkCUDNN(cudnnConvolutionForward(_cudnnHandle, &alpha, _srcTensorDesc, srcData, _filterDesc, filterData, _convDesc, _algoFwd.algo,
				buff, MAX_Buffsz, &beta, _dstTensorDesc, dstData));
			cudaDeviceSynchronize();
			int s2 = clock();
			if (i != 0) {	// skip the first (warm-up) run in the running average
				sumtime += s2 - s1;
				printf("t=%d %d\n", s2 - s1, (sumtime / i));
			}
		}
		cudaMemcpy(z, dstData, szDst * sizeof(mtype), cudaMemcpyDeviceToHost);
		//for (int i = 0; i < szDst; i++) {
		//	float ff = z[i];
		//	printf("%.2f", ff);
		//}
		return 0;
	}

};

// Entry point: run one myTest instance per GPU (8 devices), each on its own
// OpenMP thread. Fix: `void main()` is non-standard C++ (an MSVC extension);
// the standard requires main to return int.
int main() {
	//8 GPUS
#pragma omp parallel for
	for (int i = 0; i < 8; i++) {
		myTest A;	// thread-local instance; mymain() binds it to device i
		A.mymain(i);
	}
	return 0;
}
type or paste code here

Hi,

Could you please share with us the compilation steps you're following to run the above program?

Thank you.

Visual Studio 2019 was used to compile this code with compute_86,sm_86.
The program runs on Windows 10, CUDA 11.4, CPU Intel Xeon E5-2697A v4, RAM 512GB, GPU: RTX3090 x8.

Hi,

This looks like a known issue,
Please refer known issues section in the release notes.
https://docs.nvidia.com/deeplearning/cudnn/release-notes/rel_8.html#rel_8

Thank you.

Which item? I failed to find it. Thank you.

v8.5.0 is still slow, but v8.2.2 is normal.
The GPU RTX 2070 is slow, too.
Tested with CUDA 11.4 and CUDA 11.6, on Windows 10.