Cudnn convolution slow since 8.4.0

There are two code snippets, run in 8 GPUS, windows 10 ,compiled by visual studio 2019.
just becasue large memory has been created in different location, the second code snippet is special slow.
first snippet:


#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_fp16.h>
#include <cudnn.h>
#include <stdio.h>
#include <time.h>
typedef half mtype;
#define datatype CUDNN_DATA_HALF
#define MAX_Buffsz 1000000000


#define  _CRTDBG_MAP_ALLOC
#include <stdlib.h>
#include <crtdbg.h>

void handleErr(int t, const char* file, int line)
{
	if (t == 0) { return; }
	printf("err in cudnn %d %s %d", t, file, line);
	getchar();
}
#define checkCUDNN(status) {										\
	if (status != CUDNN_STATUS_SUCCESS) {							\
	 handleErr(status,__FILE__,__LINE__);}							\
}
class myTest {
public:
	cudnnHandle_t _cudnnHandle;
	cudaStream_t _cudaStream;
	cudnnDataType_t _dataType;
	int _n, _InC, _d, _h, _w, _winSzD, _winSzH, _winSzW, _padD, _padH, _padW, _strideD, _strideH, _strideW, _OutC, _OutD, _OutH, _OutW, _dilationD, _dilationH, _dilationW, _group, _padMode;
	cudnnTensorDescriptor_t _srcTensorDesc, _dstTensorDesc;
	cudnnFilterDescriptor_t _filterDesc;
	cudnnConvolutionDescriptor_t _convDesc;
	cudnnConvolutionFwdAlgoPerf_t _algoFwd;
	mtype* srcData, * filterData, * dstData, * buff;
	size_t szSrc, szfilter, szDst;


	template <typename algoPerf_t>
	int getBestAlgoIndex(algoPerf_t* perfs, int retAlgoCount, size_t limitMem, cudnnDataType_t mType) {
		int algoFlag = 0;
		int bestPerfIndex = 0;
		int flag = (mType == CUDNN_DATA_HALF) || (mType == CUDNN_DATA_BFLOAT16);
		for (int i = 0; i < retAlgoCount; i++) {
			if (perfs[i].status == CUDNN_STATUS_SUCCESS &&
				(flag ? perfs[i].mathType == CUDNN_TENSOR_OP_MATH : 1) &&
				perfs[i].determinism == CUDNN_DETERMINISTIC && (!limitMem || perfs[i].memory < limitMem)) {
				algoFlag = 1; bestPerfIndex = i; break;
			}
		}
		if (algoFlag == 0) {
			for (int i = 0; i < retAlgoCount; i++) {
				if (perfs[i].status == CUDNN_STATUS_SUCCESS &&
					(flag ? perfs[i].mathType == CUDNN_TENSOR_OP_MATH : 1) &&
					(!limitMem || perfs[i].memory < limitMem)) {
					algoFlag = 1; bestPerfIndex = i; break;
				}
			}
			if (algoFlag == 1) { printf(" algo found but NOT DETERMINISTIC "); }
		}
		if (algoFlag == 0) {
			for (int i = 0; i < retAlgoCount; i++) {
				if (perfs[i].status == CUDNN_STATUS_SUCCESS) {
					algoFlag = 1; bestPerfIndex = i; break;
				}
			}
			if (algoFlag == 1) { printf(" algo found but not enough memory"); }
		}
		if (algoFlag == 0) {
			printf("ERR: algo not found");
			//system("pause"); 
		}
		return bestPerfIndex;
	}
	void setConvolutionTensor(int n, int InC, int d, int h, int w, int winSzD, int winSzH, int winSzW, int padD, int padH, int padW, int strideD,
		int strideH, int strideW, int OutC, int OutD, int OutH, int OutW, int dilationD, int dilationH, int dilationW, int group, int padMode, cudnnDataType_t comtype)
	{

		_dataType = comtype;
		_n = n; _InC = InC; _h = h; _w = w; _d = d, _winSzW = winSzW; _winSzH = winSzH; _winSzD = winSzD, _padW = padW, _padH = padH, _padD = padD;
		_strideD = strideD; _strideW = strideW; _strideH = strideH; _OutC = OutC; _dilationW = dilationW; _dilationH = dilationH; _dilationD = dilationD, _group = group;

		_OutD = 1; _d = 1; _winSzD = 1;
		int dimSrc[4] = { n,InC,h,w };
		int strideSrc[4] = { h * w * InC, h * w, w,1 };
		checkCUDNN(cudnnSetTensorNdDescriptor(_srcTensorDesc, _dataType, 4, dimSrc, strideSrc));
		int filterA[4] = { _OutC, _InC / _group,_winSzH,_winSzW };
		checkCUDNN(cudnnSetFilterNdDescriptor(_filterDesc, _dataType, CUDNN_TENSOR_NCHW, 4, filterA));

		int padA[2] = { _padH,_padW };
		int strideA[2] = { _strideH,_strideW };
		int dilationA[2] = { _dilationH,_dilationW };
		cudnnDataType_t convType = _dataType;
		if (convType == CUDNN_DATA_BFLOAT16) {
			convType = CUDNN_DATA_FLOAT;
		}
		checkCUDNN(cudnnSetConvolutionNdDescriptor(_convDesc, 2, padA, strideA, dilationA, CUDNN_CROSS_CORRELATION, convType));

#if CUDNN_VERSION > 7000
		if (_dataType == CUDNN_DATA_HALF || _dataType == CUDNN_DATA_BFLOAT16) {
			checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_TENSOR_OP_MATH));
		}
		else if (_dataType == CUDNN_DATA_FLOAT) {
			checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
		}
		else {
			checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_DEFAULT_MATH));
		}
#endif
		cudnnSetConvolutionGroupCount(_convDesc, group);
		//if (group > 1) { cudnnSetConvolutionGroupCount(_convDesc, group); };
		//cudaDeviceSynchronize();
		int outDim[4] = { 0 };
		checkCUDNN(cudnnGetConvolutionNdForwardOutputDim(_convDesc, _srcTensorDesc, _filterDesc, 4, outDim));
		n = outDim[0];
		_OutC = outDim[1];
		_OutH = outDim[2];
		_OutW = outDim[3];

		int dimDst[4] = { n,OutC,_OutH,_OutW };
		int strideDst[4] = { _OutH * _OutW * _OutC, _OutH * _OutW,_OutW,1 };
		checkCUDNN(cudnnSetTensorNdDescriptor(_dstTensorDesc, _dataType, 4, dimDst, strideDst));
		if ((OutH != _OutH && OutH != 0) || (OutW != _OutW && OutW != 0)) { printf("err in comvDim"); }
		cudaDeviceSynchronize();

		if (srcData) { cudaFreeAsync(srcData,_cudaStream); }
		if (dstData) { cudaFreeAsync(dstData,_cudaStream); }
		if (filterData) { cudaFreeAsync(filterData,_cudaStream); }
		szSrc = n * InC * h * w;
		szDst = n * OutC * _OutH * _OutW;
		szfilter = InC * OutC / group * winSzH * winSzW;
		cudaMallocAsync(&srcData, szSrc * sizeof(mtype),_cudaStream);
		cudaMallocAsync(&dstData, szDst * sizeof(mtype) , _cudaStream);
		cudaMallocAsync(&filterData, szfilter * sizeof(mtype), _cudaStream);
		cudnnConvolutionFwdAlgoPerf_t perfs[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
		int retAlgoCount = -1;
		checkCUDNN(cudnnFindConvolutionForwardAlgorithmEx(_cudnnHandle,
			_srcTensorDesc, srcData, _filterDesc, filterData, _convDesc, _dstTensorDesc,
			dstData, CUDNN_CONVOLUTION_FWD_ALGO_COUNT, &retAlgoCount, perfs, buff, MAX_Buffsz));
		_algoFwd = perfs[getBestAlgoIndex(perfs, retAlgoCount, MAX_Buffsz, _dataType)];
	}
	void createHandle(int i) {
		cudaSetDevice(i);
		cudnnCreate(&_cudnnHandle);
		cudaStreamCreate(&_cudaStream);
		cudnnSetStream(_cudnnHandle, _cudaStream);
	}
	void createTensor(int i )
	{
		checkCUDNN(cudnnCreateTensorDescriptor(&_srcTensorDesc));
		checkCUDNN(cudnnCreateTensorDescriptor(&_dstTensorDesc));
		checkCUDNN(cudnnCreateFilterDescriptor(&_filterDesc));
		checkCUDNN(cudnnCreateConvolutionDescriptor(&_convDesc));
	}
	void setTensor() {
		int n = 1, c = 256, h = 256, w = 256, oc = 256, winSz = 5, stride = 1;
		int group = 1;// oc / 32;
		setConvolutionTensor(n, c, 1, h, w, 1, winSz, winSz, 0, (winSz - 1) / 2, (winSz - 1) / 2, 1,
			stride, stride, oc, 1, 0, 0, 1, 1, 1, group, 0, datatype);
	}
	void destoryTensor()
	{
		checkCUDNN(cudnnDestroyConvolutionDescriptor(_convDesc));
		checkCUDNN(cudnnDestroyFilterDescriptor(_filterDesc));
		checkCUDNN(cudnnDestroyTensorDescriptor(_dstTensorDesc));
		checkCUDNN(cudnnDestroyTensorDescriptor(_srcTensorDesc));
		cudaStreamDestroy(_cudaStream);
		cudnnDestroy(_cudnnHandle);
	}
	int mymain(int i) {
#define NEWLEN 10000
#define Front_New
#ifdef Front_New
		int** tbf = new int* [NEWLEN];
		for (int i = 0; i < NEWLEN; i++) {
			tbf[i] = new int[5000];
			memset(tbf[i], 1, 20000);
		}
#endif
		size_t wsSize;
		size_t sss = 0;
		cudaSetDevice(i);
		createTensor(i);
		cudaMallocAsync(&buff, MAX_Buffsz,_cudaStream);
		srcData = 0; dstData = 0; filterData = 0;
		setTensor();
		mtype* x = new mtype[szSrc];
		mtype* y = new mtype[szfilter];
		mtype* z = new mtype[szDst];
		for (int i = 0; i < szSrc; i++) {
			x[i] = 1.0;
		}
		for (int i = 0; i < szfilter; i++) {
			y[i] = 1.0;
		}
		cudaMemcpyAsync(srcData, x, szSrc * sizeof(mtype), cudaMemcpyHostToDevice, _cudaStream);
		cudaMemcpyAsync(filterData, y, szfilter * sizeof(mtype), cudaMemcpyHostToDevice, _cudaStream);
		cudaMemsetAsync(srcData, 0, szSrc * sizeof(mtype), _cudaStream);
		cudaMemsetAsync(dstData, 0, szDst * sizeof(mtype), _cudaStream);
		cudaMemsetAsync(filterData, 0, szfilter * sizeof(mtype), _cudaStream);
		float alpha = 1, beta = 0;
		cudaStreamSynchronize(_cudaStream);
#ifndef Front_New
		int** tbf = new int* [NEWLEN];
		for (int i = 0; i < NEWLEN; i++) {
			tbf[i] = (int*)malloc(sizeof(int) * 5000);
			memset(tbf[i], 1, sizeof(int) * 5000);
			//HeapAlloc()
		}
#endif

		int s1 = clock();
		while (1) {
			checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(_cudnnHandle, _srcTensorDesc, _filterDesc, _convDesc,
				_dstTensorDesc, _algoFwd.algo, &wsSize));
			checkCUDNN(cudnnConvolutionForward(_cudnnHandle, &alpha, _srcTensorDesc, srcData, _filterDesc, filterData, _convDesc, _algoFwd.algo,
				buff, MAX_Buffsz, &beta, _dstTensorDesc, dstData));
			cudaStreamSynchronize(_cudaStream);
			sss++;
			if (sss % 1000 == 0) {
				int s2 = clock();
				printf("GPU ID:%d time:%d\n",i, s2 - s1);
				s1 = s2;
				
			}
		}
		return 0;
	}

};

int main() {

	//8 GPUS
	myTest A[8];
#pragma omp parallel for 
 	for (int i = 0; i < 8; i++) {
 		A[i].createHandle(i);
 	}

#pragma omp parallel for 
	for (int i = 0; i < 8; i++) {
		A[i].mymain(i);
	}
	return 0;
}

Running screenshot


this running time is normal.

Second snippet:


#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_fp16.h>
#include <cudnn.h>
#include <stdio.h>
#include <time.h>
typedef half mtype;
#define datatype CUDNN_DATA_HALF
#define MAX_Buffsz 1000000000


#define  _CRTDBG_MAP_ALLOC
#include <stdlib.h>
#include <crtdbg.h>

void handleErr(int t, const char* file, int line)
{
	if (t == 0) { return; }
	printf("err in cudnn %d %s %d", t, file, line);
	getchar();
}
#define checkCUDNN(status) {										\
	if (status != CUDNN_STATUS_SUCCESS) {							\
	 handleErr(status,__FILE__,__LINE__);}							\
}
class myTest {
public:
	cudnnHandle_t _cudnnHandle;
	cudaStream_t _cudaStream;
	cudnnDataType_t _dataType;
	int _n, _InC, _d, _h, _w, _winSzD, _winSzH, _winSzW, _padD, _padH, _padW, _strideD, _strideH, _strideW, _OutC, _OutD, _OutH, _OutW, _dilationD, _dilationH, _dilationW, _group, _padMode;
	cudnnTensorDescriptor_t _srcTensorDesc, _dstTensorDesc;
	cudnnFilterDescriptor_t _filterDesc;
	cudnnConvolutionDescriptor_t _convDesc;
	cudnnConvolutionFwdAlgoPerf_t _algoFwd;
	mtype* srcData, * filterData, * dstData, * buff;
	size_t szSrc, szfilter, szDst;


	template <typename algoPerf_t>
	int getBestAlgoIndex(algoPerf_t* perfs, int retAlgoCount, size_t limitMem, cudnnDataType_t mType) {
		int algoFlag = 0;
		int bestPerfIndex = 0;
		int flag = (mType == CUDNN_DATA_HALF) || (mType == CUDNN_DATA_BFLOAT16);
		for (int i = 0; i < retAlgoCount; i++) {
			if (perfs[i].status == CUDNN_STATUS_SUCCESS &&
				(flag ? perfs[i].mathType == CUDNN_TENSOR_OP_MATH : 1) &&
				perfs[i].determinism == CUDNN_DETERMINISTIC && (!limitMem || perfs[i].memory < limitMem)) {
				algoFlag = 1; bestPerfIndex = i; break;
			}
		}
		if (algoFlag == 0) {
			for (int i = 0; i < retAlgoCount; i++) {
				if (perfs[i].status == CUDNN_STATUS_SUCCESS &&
					(flag ? perfs[i].mathType == CUDNN_TENSOR_OP_MATH : 1) &&
					(!limitMem || perfs[i].memory < limitMem)) {
					algoFlag = 1; bestPerfIndex = i; break;
				}
			}
			if (algoFlag == 1) { printf(" algo found but NOT DETERMINISTIC "); }
		}
		if (algoFlag == 0) {
			for (int i = 0; i < retAlgoCount; i++) {
				if (perfs[i].status == CUDNN_STATUS_SUCCESS) {
					algoFlag = 1; bestPerfIndex = i; break;
				}
			}
			if (algoFlag == 1) { printf(" algo found but not enough memory"); }
		}
		if (algoFlag == 0) {
			printf("ERR: algo not found");
			//system("pause"); 
		}
		return bestPerfIndex;
	}
	void setConvolutionTensor(int n, int InC, int d, int h, int w, int winSzD, int winSzH, int winSzW, int padD, int padH, int padW, int strideD,
		int strideH, int strideW, int OutC, int OutD, int OutH, int OutW, int dilationD, int dilationH, int dilationW, int group, int padMode, cudnnDataType_t comtype)
	{

		_dataType = comtype;
		_n = n; _InC = InC; _h = h; _w = w; _d = d, _winSzW = winSzW; _winSzH = winSzH; _winSzD = winSzD, _padW = padW, _padH = padH, _padD = padD;
		_strideD = strideD; _strideW = strideW; _strideH = strideH; _OutC = OutC; _dilationW = dilationW; _dilationH = dilationH; _dilationD = dilationD, _group = group;

		_OutD = 1; _d = 1; _winSzD = 1;
		int dimSrc[4] = { n,InC,h,w };
		int strideSrc[4] = { h * w * InC, h * w, w,1 };
		checkCUDNN(cudnnSetTensorNdDescriptor(_srcTensorDesc, _dataType, 4, dimSrc, strideSrc));
		int filterA[4] = { _OutC, _InC / _group,_winSzH,_winSzW };
		checkCUDNN(cudnnSetFilterNdDescriptor(_filterDesc, _dataType, CUDNN_TENSOR_NCHW, 4, filterA));

		int padA[2] = { _padH,_padW };
		int strideA[2] = { _strideH,_strideW };
		int dilationA[2] = { _dilationH,_dilationW };
		cudnnDataType_t convType = _dataType;
		if (convType == CUDNN_DATA_BFLOAT16) {
			convType = CUDNN_DATA_FLOAT;
		}
		checkCUDNN(cudnnSetConvolutionNdDescriptor(_convDesc, 2, padA, strideA, dilationA, CUDNN_CROSS_CORRELATION, convType));

#if CUDNN_VERSION > 7000
		if (_dataType == CUDNN_DATA_HALF || _dataType == CUDNN_DATA_BFLOAT16) {
			checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_TENSOR_OP_MATH));
		}
		else if (_dataType == CUDNN_DATA_FLOAT) {
			checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
		}
		else {
			checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_DEFAULT_MATH));
		}
#endif
		cudnnSetConvolutionGroupCount(_convDesc, group);
		//if (group > 1) { cudnnSetConvolutionGroupCount(_convDesc, group); };
		//cudaDeviceSynchronize();
		int outDim[4] = { 0 };
		checkCUDNN(cudnnGetConvolutionNdForwardOutputDim(_convDesc, _srcTensorDesc, _filterDesc, 4, outDim));
		n = outDim[0];
		_OutC = outDim[1];
		_OutH = outDim[2];
		_OutW = outDim[3];

		int dimDst[4] = { n,OutC,_OutH,_OutW };
		int strideDst[4] = { _OutH * _OutW * _OutC, _OutH * _OutW,_OutW,1 };
		checkCUDNN(cudnnSetTensorNdDescriptor(_dstTensorDesc, _dataType, 4, dimDst, strideDst));
		if ((OutH != _OutH && OutH != 0) || (OutW != _OutW && OutW != 0)) { printf("err in comvDim"); }
		cudaDeviceSynchronize();

		if (srcData) { cudaFreeAsync(srcData,_cudaStream); }
		if (dstData) { cudaFreeAsync(dstData,_cudaStream); }
		if (filterData) { cudaFreeAsync(filterData,_cudaStream); }
		szSrc = n * InC * h * w;
		szDst = n * OutC * _OutH * _OutW;
		szfilter = InC * OutC / group * winSzH * winSzW;
		cudaMallocAsync(&srcData, szSrc * sizeof(mtype),_cudaStream);
		cudaMallocAsync(&dstData, szDst * sizeof(mtype) , _cudaStream);
		cudaMallocAsync(&filterData, szfilter * sizeof(mtype), _cudaStream);
		cudnnConvolutionFwdAlgoPerf_t perfs[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
		int retAlgoCount = -1;
		checkCUDNN(cudnnFindConvolutionForwardAlgorithmEx(_cudnnHandle,
			_srcTensorDesc, srcData, _filterDesc, filterData, _convDesc, _dstTensorDesc,
			dstData, CUDNN_CONVOLUTION_FWD_ALGO_COUNT, &retAlgoCount, perfs, buff, MAX_Buffsz));
		_algoFwd = perfs[getBestAlgoIndex(perfs, retAlgoCount, MAX_Buffsz, _dataType)];
	}
	void createHandle(int i) {
		cudaSetDevice(i);
		cudnnCreate(&_cudnnHandle);
		cudaStreamCreate(&_cudaStream);
		cudnnSetStream(_cudnnHandle, _cudaStream);
	}
	void createTensor(int i )
	{
		checkCUDNN(cudnnCreateTensorDescriptor(&_srcTensorDesc));
		checkCUDNN(cudnnCreateTensorDescriptor(&_dstTensorDesc));
		checkCUDNN(cudnnCreateFilterDescriptor(&_filterDesc));
		checkCUDNN(cudnnCreateConvolutionDescriptor(&_convDesc));
	}
	void setTensor() {
		int n = 1, c = 256, h = 256, w = 256, oc = 256, winSz = 5, stride = 1;
		int group = 1;// oc / 32;
		setConvolutionTensor(n, c, 1, h, w, 1, winSz, winSz, 0, (winSz - 1) / 2, (winSz - 1) / 2, 1,
			stride, stride, oc, 1, 0, 0, 1, 1, 1, group, 0, datatype);
	}
	void destoryTensor()
	{
		checkCUDNN(cudnnDestroyConvolutionDescriptor(_convDesc));
		checkCUDNN(cudnnDestroyFilterDescriptor(_filterDesc));
		checkCUDNN(cudnnDestroyTensorDescriptor(_dstTensorDesc));
		checkCUDNN(cudnnDestroyTensorDescriptor(_srcTensorDesc));
		cudaStreamDestroy(_cudaStream);
		cudnnDestroy(_cudnnHandle);
	}
	int mymain(int i) {
#define NEWLEN 10000
//#define Front_New
#ifdef Front_New
		int** tbf = new int* [NEWLEN];
		for (int i = 0; i < NEWLEN; i++) {
			tbf[i] = new int[5000];
			memset(tbf[i], 1, 20000);
		}
#endif
		size_t wsSize;
		size_t sss = 0;
		cudaSetDevice(i);
		createTensor(i);
		cudaMallocAsync(&buff, MAX_Buffsz,_cudaStream);
		srcData = 0; dstData = 0; filterData = 0;
		setTensor();
		mtype* x = new mtype[szSrc];
		mtype* y = new mtype[szfilter];
		mtype* z = new mtype[szDst];
		for (int i = 0; i < szSrc; i++) {
			x[i] = 1.0;
		}
		for (int i = 0; i < szfilter; i++) {
			y[i] = 1.0;
		}
		cudaMemcpyAsync(srcData, x, szSrc * sizeof(mtype), cudaMemcpyHostToDevice, _cudaStream);
		cudaMemcpyAsync(filterData, y, szfilter * sizeof(mtype), cudaMemcpyHostToDevice, _cudaStream);
		cudaMemsetAsync(srcData, 0, szSrc * sizeof(mtype), _cudaStream);
		cudaMemsetAsync(dstData, 0, szDst * sizeof(mtype), _cudaStream);
		cudaMemsetAsync(filterData, 0, szfilter * sizeof(mtype), _cudaStream);
		float alpha = 1, beta = 0;
		cudaStreamSynchronize(_cudaStream);
#ifndef Front_New
		int** tbf = new int* [NEWLEN];
		for (int i = 0; i < NEWLEN; i++) {
			tbf[i] = (int*)malloc(sizeof(int) * 5000);
			memset(tbf[i], 1, sizeof(int) * 5000);
			//HeapAlloc()
		}
#endif

		int s1 = clock();
		while (1) {
			checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(_cudnnHandle, _srcTensorDesc, _filterDesc, _convDesc,
				_dstTensorDesc, _algoFwd.algo, &wsSize));
			checkCUDNN(cudnnConvolutionForward(_cudnnHandle, &alpha, _srcTensorDesc, srcData, _filterDesc, filterData, _convDesc, _algoFwd.algo,
				buff, MAX_Buffsz, &beta, _dstTensorDesc, dstData));
			cudaStreamSynchronize(_cudaStream);
			sss++;
			if (sss % 1000 == 0) {
				int s2 = clock();
				printf("GPU ID:%d time:%d\n",i, s2 - s1);
				s1 = s2;
				
			}
		}
		return 0;
	}

};

int main() {

	//8 GPUS
	myTest A[8];
#pragma omp parallel for 
 	for (int i = 0; i < 8; i++) {
 		A[i].createHandle(i);
 	}

#pragma omp parallel for 
	for (int i = 0; i < 8; i++) {
		A[i].mymain(i);
	}
	return 0;
}

Running screenshot


this running time is more and more slow. it may be related to zlib.
It quickly occurred to me that the program be running in 8gpus. this bug was disappeard in cudnn 8.2.2.

my environment
cuda 11.4
cudnn 8.4.0
GPU: 8X RTX 3090
GPU driver:517.40-desktop-win10-win11-64bit-international-nsd-dch-whql
ram 768GB
visual studio 2019
windows 10

if my code snippets has bug , please point out .
thanks

Later, i use cudaMallocHost or windows api VirtualAlloc,VirtualLock, it is normal.
so it is related to page locked memory ?

It is diffcult that i replaced all new with cudaMallocHost, spcieal, large numbers of object and stl vector need to be created in heap.
this has already perplexed me for three months,please help me, thanks.

Hi @ttt ,
We are checking on it, meanwhile can you please try out the following and let us know the results?

  1. Make your heap allocation code match exactly (i.e. use either new or malloc) to make sure we’re doing right comparison
  2. Do a device sync before the while(1) loop to see if that has any effect.

Thanks

The problem remains

Hi @ttt , We are debugging this issue, and will get back to you as soon as possible.
Thank you for your patience.

Hi @ttt
Apologies for delay.
We were not able to repro the issue.
However with the latest releases the problem should not persist and we recommend you to try the same.
Thanks