I run this program on 8 GPUs. When a large amount of memory is created in the program, I have noticed that the while loop gets slower the more iterations it runs; but when I uncomment the code snippet, the execution time in the while loop stays the same. The execution time also stays the same when I switch to cuDNN v8.2.2.
My environment:
GPU: RTX3090 x8
RAM: 512GB
Execution times with no new memory allocated in the while loop:
8100ms
7900ms
8122ms
8200ms
8233ms
8099ms
Execution times with a large new memory allocation in the while loop:
8100ms
7900ms
8122ms
16000ms
24122ms
33000ms
Test file:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_fp16.h>
#include <cudnn.h>
#include <stdio.h>
#include <time.h>
// Element type used for all tensors in this test (FP16 on the device).
typedef half mtype;
// cuDNN data-type enum matching mtype above; change both together.
#define datatype CUDNN_DATA_HALF
// Fixed workspace size in bytes (~1 GB), allocated once per GPU and reused
// as the scratch buffer for every convolution in the benchmark loop.
#define MAX_Buffsz 1000000000
// Report a non-zero cuDNN status together with the source location, then
// block on getchar() so the message stays visible in an interactive console.
// t: cuDNN status code (0 == CUDNN_STATUS_SUCCESS -> no-op).
// file/line: call site, supplied by the checkCUDNN macro.
void handleErr(int t, const char* file, int line)
{
    if (t == 0) { return; }
    // Trailing '\n' ensures line-buffered stdout flushes the message before
    // getchar() blocks waiting for input (the original message could stay
    // invisible in the buffer while the program appeared hung).
    printf("err in cudnn %d %s %d\n", t, file, line);
    getchar();
}
// Wraps a cuDNN call: on any status other than CUDNN_STATUS_SUCCESS,
// reports the status with file/line via handleErr (which then blocks on
// getchar()). Execution continues after the report.
#define checkCUDNN(status) { \
if (status != CUDNN_STATUS_SUCCESS) { \
handleErr(status,__FILE__,__LINE__);} \
}
// Repro harness: one instance per GPU. mymain() repeatedly reconfigures a
// cuDNN 2D forward convolution with random shapes inside an endless loop and
// prints the elapsed time of every 1000 iterations, to demonstrate the
// progressive slowdown described in the post above.
class myTest {
public:
cudnnHandle_t _cudnnHandle;
cudnnDataType_t _dataType;
// Convolution geometry cached from the last setConvolutionTensor() call:
// batch, channels, depth/height/width, window sizes, padding, strides,
// output dims, dilation, group count, pad mode.
int _n, _InC, _d, _h, _w, _winSzD, _winSzH, _winSzW, _padD, _padH, _padW, _strideD, _strideH, _strideW, _OutC, _OutD, _OutH, _OutW, _dilationD, _dilationH, _dilationW, _group, _padMode;
cudnnTensorDescriptor_t _srcTensorDesc, _dstTensorDesc;
cudnnFilterDescriptor_t _filterDesc;
cudnnConvolutionDescriptor_t _convDesc;
// Best forward algorithm found for the current descriptors.
cudnnConvolutionFwdAlgoPerf_t _algoFwd;
// Device buffers: input, filter, output, and the fixed 1 GB workspace.
mtype* srcData, * filterData, * dstData, * buff;
// Element counts (not byte sizes) of the src/filter/dst device buffers.
size_t szSrc, szfilter, szDst;
// Picks the index of the "best" entry in the perf list returned by
// cudnnFindConvolutionForwardAlgorithmEx, in three preference tiers:
//  1) success + (tensor-op math when mType is half/bfloat16) + deterministic
//     + workspace below limitMem,
//  2) same but without the determinism requirement (prints a warning),
//  3) any successful algorithm regardless of memory (prints a warning).
// Returns 0 (the first entry) if no tier matched at all.
template <typename algoPerf_t>
int getBestAlgoIndex(algoPerf_t* perfs, int retAlgoCount, size_t limitMem, cudnnDataType_t mType) {
int algoFlag = 0;
int bestPerfIndex = 0;
// For half/bfloat16 compute, tiers 1-2 additionally require tensor-op math.
int flag = (mType == CUDNN_DATA_HALF) || (mType == CUDNN_DATA_BFLOAT16);
for (int i = 0; i < retAlgoCount; i++) {
if (perfs[i].status == CUDNN_STATUS_SUCCESS &&
(flag ? perfs[i].mathType == CUDNN_TENSOR_OP_MATH : 1) &&
perfs[i].determinism == CUDNN_DETERMINISTIC && (!limitMem || perfs[i].memory < limitMem)) {
algoFlag = 1; bestPerfIndex = i; break;
}
}
if (algoFlag == 0) {
// Tier 2: drop the determinism requirement.
for (int i = 0; i < retAlgoCount; i++) {
if (perfs[i].status == CUDNN_STATUS_SUCCESS &&
(flag ? perfs[i].mathType == CUDNN_TENSOR_OP_MATH : 1) &&
(!limitMem || perfs[i].memory < limitMem)) {
algoFlag = 1; bestPerfIndex = i; break;
}
}
if (algoFlag == 1) { printf(" algo found but NOT DETERMINISTIC "); }
}
if (algoFlag == 0) {
// Tier 3: accept any successful algorithm, even over the memory limit.
for (int i = 0; i < retAlgoCount; i++) {
if (perfs[i].status == CUDNN_STATUS_SUCCESS) {
algoFlag = 1; bestPerfIndex = i; break;
}
}
if (algoFlag == 1) { printf(" algo found but not enough memory"); }
}
if (algoFlag == 0) {
printf("ERR: algo not found");
//system("pause");
}
return bestPerfIndex;
}
// (Re)configures all cuDNN descriptors for the given convolution geometry,
// reallocates the src/dst/filter device buffers to match, and re-runs
// cudnnFindConvolutionForwardAlgorithmEx, caching the winner in _algoFwd.
// Depth-related arguments are accepted but forced to 1 (2D-only harness).
// Called once from mymain() setup and then once per while-loop iteration.
void setConvolutionTensor(int n, int InC, int d, int h, int w, int winSzD, int winSzH, int winSzW, int padD, int padH, int padW, int strideD,
int strideH, int strideW, int OutC, int OutD, int OutH, int OutW, int dilationD, int dilationH, int dilationW, int group, int padMode, cudnnDataType_t comtype)
{
_dataType = comtype;
_n = n; _InC = InC; _h = h; _w = w; _d = d, _winSzW = winSzW; _winSzH = winSzH; _winSzD = winSzD, _padW = padW, _padH = padH, _padD = padD;
_strideD = strideD; _strideW = strideW; _strideH = strideH; _OutC = OutC; _dilationW = dilationW; _dilationH = dilationH; _dilationD = dilationD, _group = group;
// Collapse the depth dimension: this harness only exercises 2D convolutions.
_OutD = 1; _d = 1; _winSzD = 1;
// Fully-packed NCHW source tensor.
int dimSrc[4] = { n,InC,h,w };
int strideSrc[4] = { h * w * InC, h * w, w,1 };
checkCUDNN(cudnnSetTensorNdDescriptor(_srcTensorDesc, _dataType, 4, dimSrc, strideSrc));
// Filter layout: [outC, inC/group, kH, kW].
int filterA[4] = { _OutC, _InC / _group,_winSzH,_winSzW };
checkCUDNN(cudnnSetFilterNdDescriptor(_filterDesc, _dataType, CUDNN_TENSOR_NCHW, 4, filterA));
int padA[2] = { _padH,_padW };
int strideA[2] = { _strideH,_strideW };
int dilationA[2] = { _dilationH,_dilationW };
cudnnDataType_t convType = _dataType;
if (convType == CUDNN_DATA_BFLOAT16) {
// Substitute FLOAT as the convolution compute type for BFLOAT16 tensors.
// NOTE(review): presumably required by cuDNN for bf16 convs — confirm
// against the cuDNN version being targeted.
convType = CUDNN_DATA_FLOAT;
}
checkCUDNN(cudnnSetConvolutionNdDescriptor(_convDesc, 2, padA, strideA, dilationA, CUDNN_CROSS_CORRELATION, convType));
cudaDeviceSynchronize();
#if CUDNN_VERSION > 7000
// Math-type policy: tensor ops for half/bf16, allow-conversion for float,
// default math otherwise.
if (_dataType == CUDNN_DATA_HALF || _dataType == CUDNN_DATA_BFLOAT16) {
checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_TENSOR_OP_MATH));
}
else if (_dataType == CUDNN_DATA_FLOAT) {
checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
}
else {
checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_DEFAULT_MATH));
}
#endif
cudnnSetConvolutionGroupCount(_convDesc, group);
cudaDeviceSynchronize();
// Ask cuDNN for the actual output dims implied by the descriptors.
int outDim[4] = { 0 };
checkCUDNN(cudnnGetConvolutionNdForwardOutputDim(_convDesc, _srcTensorDesc, _filterDesc, 4, outDim));
n = outDim[0];
_OutC = outDim[1];
_OutH = outDim[2];
_OutW = outDim[3];
// Fully-packed NCHW destination tensor (dims use the OutC parameter,
// strides use the queried _OutC; the two agree when cuDNN returns
// outDim[1] == OutC).
int dimDst[4] = { n,OutC,_OutH,_OutW };
int strideDst[4] = { _OutH * _OutW * _OutC, _OutH * _OutW,_OutW,1 };
checkCUDNN(cudnnSetTensorNdDescriptor(_dstTensorDesc, _dataType, 4, dimDst, strideDst));
// Sanity check: caller-supplied expected output dims (0 = don't care).
if ((OutH != _OutH && OutH != 0) || (OutW != _OutW && OutW != 0)) { printf("err in comvDim"); }
// Release buffers from the previous configuration before reallocating.
// NOTE(review): cudaMalloc/cudaFree return codes are not checked here.
if (srcData) { cudaFree(srcData); }
if (dstData) { cudaFree(dstData); }
if (filterData) { cudaFree(filterData); }
szSrc = n * InC * h * w;
szDst = n * OutC * _OutH * _OutW;
szfilter = InC * OutC / group * winSzH * winSzW;
cudaMalloc(&srcData, szSrc * sizeof(mtype));
cudaMalloc(&dstData, szDst * sizeof(mtype));
cudaMalloc(&filterData, szfilter * sizeof(mtype));
// Benchmark all forward algorithms with the real buffers and the shared
// 1 GB workspace, then keep the best result per getBestAlgoIndex().
cudnnConvolutionFwdAlgoPerf_t perfs[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
int retAlgoCount = -1;
checkCUDNN(cudnnFindConvolutionForwardAlgorithmEx(_cudnnHandle,
_srcTensorDesc, srcData, _filterDesc, filterData, _convDesc, _dstTensorDesc,
dstData, CUDNN_CONVOLUTION_FWD_ALGO_COUNT, &retAlgoCount, perfs, buff, MAX_Buffsz));
_algoFwd = perfs[getBestAlgoIndex(perfs, retAlgoCount, MAX_Buffsz, _dataType)];
}
// Per-GPU driver: binds to device i, creates the cuDNN handle, descriptors
// and the 1 GB workspace, then loops forever running randomly-shaped
// convolutions, printing the clock() delta of every 1000 iterations.
// NOTE(review): the while(1) never exits, so everything after it (the
// 100-iteration timing block and the device-to-host copy) is unreachable;
// host buffers x/y/z and all cuDNN/CUDA resources are never released.
int mymain(int i) {
cudaSetDevice(i);
srcData = 0; dstData = 0; filterData = 0;
cudnnCreate(&_cudnnHandle);
checkCUDNN(cudnnCreateTensorDescriptor(&_srcTensorDesc));
checkCUDNN(cudnnCreateTensorDescriptor(&_dstTensorDesc));
checkCUDNN(cudnnCreateFilterDescriptor(&_filterDesc));
checkCUDNN(cudnnCreateConvolutionDescriptor(&_convDesc));
// Workspace allocated once and reused for every convolution below.
cudaMalloc(&buff, MAX_Buffsz);
int n = 1, c = 256, h = 256, w = 256, oc = 256, winSz = 5, stride = 1;
int group = 1;// oc / 32;
// Initial configuration; also sizes szSrc/szfilter/szDst for the host buffers.
setConvolutionTensor(n, c, 1, h, w, 1, winSz, winSz, 0, (winSz - 1) / 2, (winSz - 1) / 2, 1,
stride, stride, oc, 1, 0, 0, 1, 1, 1, group, 0, datatype);
mtype* x = new mtype[szSrc];
mtype* y = new mtype[szfilter];
mtype* z = new mtype[szDst];
for (int i = 0; i < szSrc; i++) {
x[i] = 1.0;
}
for (int i = 0; i < szfilter; i++) {
y[i] = 1.0;
}
cudaMemcpy(srcData, x, szSrc * sizeof(mtype), cudaMemcpyHostToDevice);
cudaMemcpy(filterData, y, szfilter * sizeof(mtype), cudaMemcpyHostToDevice);
// NOTE(review): these memsets zero the data just uploaded above; the
// actual values are irrelevant for this timing repro, so this appears to
// be leftover code — confirm before reusing elsewhere.
cudaMemset(srcData, 0, szSrc * sizeof(mtype));
cudaMemset(dstData, 0, szDst * sizeof(mtype));
cudaMemset(filterData, 0, szfilter * sizeof(mtype));
float alpha = 1, beta = 0;
cudaDeviceSynchronize();
size_t wsSize;
size_t sss = 0;
int s1 = clock();
// The commented block below allocates ~1 GB of host memory in small chunks;
// per the post, uncommenting it changes whether the while loop slows down.
//start comment
//int** tbf = new int* [50000];
//for (int i = 0; i < 50000; i++) {
// tbf[i] = new int[5000];
// memset(tbf[i], 1, 20000);
//}
//endcomment
while (1) {
// Random shape: channels are multiples of 32 (capped at 256), kernel in
// {1,3,5}, power-of-two group count, stride 1-2 (forced to 1 for 1x1).
int cc = (rand() % 8 + 1) * 32;
int fact = (rand() % 5 + 1);
int fp = rand() % 2;
int inc = cc * (fp ? fact : 1);
int outc = cc * (fp ? 1 : fact);
if (inc > 256) { inc = 256; }
if (outc > 256) { outc = 256; }
winSz = (rand() % 3) * 2 + 1;
group = 1 << (rand() % 6);
stride = 1 + (rand() % 2);
if (winSz == 1) { stride = 1; }
// Rebuild descriptors, buffers and _algoFwd for the new shape.
setConvolutionTensor(n, inc, 1, (rand() % 15 + 1) * 32, (rand() % 15 + 1) * 32, 1, winSz, winSz, 0, (winSz - 1) / 2, (winSz - 1) / 2, 1,
stride, stride, outc, 1, 0, 0, 1, 1, 1, group, 0, datatype);
// NOTE(review): wsSize is queried but never used; the forward call below
// passes the full fixed-size workspace (buff, MAX_Buffsz) instead.
checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(_cudnnHandle, _srcTensorDesc, _filterDesc, _convDesc,
_dstTensorDesc, _algoFwd.algo, &wsSize));
checkCUDNN(cudnnConvolutionForward(_cudnnHandle, &alpha, _srcTensorDesc, srcData, _filterDesc, filterData, _convDesc, _algoFwd.algo,
buff, MAX_Buffsz, &beta, _dstTensorDesc, dstData));
cudaDeviceSynchronize();
sss++;
// Print elapsed clock() ticks per 1000 iterations (the numbers quoted in
// the post above).
if (sss % 1000 == 0) {
int s2 = clock();
printf("%d\n", s2 - s1);
s1 = s2;
}
}
// --- Unreachable: the loop above never breaks. ---
// Times 100 forward calls on the last configuration, skipping the first
// (warm-up) iteration, printing per-call and running-average tick counts.
int sumtime = 0;
for (int i = 0; i < 100; i++) {
int s1 = clock();
checkCUDNN(cudnnConvolutionForward(_cudnnHandle, &alpha, _srcTensorDesc, srcData, _filterDesc, filterData, _convDesc, _algoFwd.algo,
buff, MAX_Buffsz, &beta, _dstTensorDesc, dstData));
cudaDeviceSynchronize();
int s2 = clock();
if (i != 0) {
sumtime += s2 - s1;
printf("t=%d %d\n", s2 - s1, (sumtime / i));
}
}
cudaMemcpy(z, dstData, szDst * sizeof(mtype), cudaMemcpyDeviceToHost);
//for (int i = 0; i < szDst; i++) {
// float ff = z[i];
// printf("%.2f", ff);
//}
return 0;
}
};
// Entry point: launches the repro on all 8 GPUs, one host thread per device.
// Standard C++ requires main to return int ('void main' is ill-formed);
// returning 0 signals success to the OS.
int main() {
    //8 GPUS
    // NOTE(review): the omp pragma is a no-op unless OpenMP is enabled at
    // compile time (/openmp or -fopenmp); each thread gets its own myTest
    // instance and therefore its own cuDNN handle.
    #pragma omp parallel for
    for (int i = 0; i < 8; i++) {
        myTest A;
        A.mymain(i);
    }
    return 0;
}
(end of pasted test file)