There are two code snippets, both run on 8 GPUs under Windows 10 and compiled with Visual Studio 2019.
The only difference between them is where a large block of host memory is allocated, yet the second snippet is dramatically slower.
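For clarity, the single toggle that distinguishes the two programs is shown below (the host allocation is 10000 x 5000 x sizeof(int), roughly 200 MB):

// First snippet (normal speed): host memory is allocated BEFORE any CUDA/cuDNN calls
#define Front_New
// Second snippet (gets slower and slower): the define is commented out, so the
// same allocation happens AFTER cuDNN setup instead
//#define Front_New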
first snippet:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_fp16.h>
#include <cudnn.h>
#include <stdio.h>
#include <time.h>
typedef half mtype;
#define datatype CUDNN_DATA_HALF
#define MAX_Buffsz 1000000000 // ~1 GB cuDNN workspace per GPU
#define _CRTDBG_MAP_ALLOC
#include <stdlib.h>
#include <crtdbg.h>
#include <string.h> // for memset
void handleErr(int t, const char* file, int line)
{
if (t == 0) { return; }
printf("err in cudnn %d %s %d", t, file, line);
getchar();
}
#define checkCUDNN(expr) { \
cudnnStatus_t _status = (expr); \
if (_status != CUDNN_STATUS_SUCCESS) { \
handleErr(_status,__FILE__,__LINE__);} \
}
class myTest {
public:
cudnnHandle_t _cudnnHandle;
cudaStream_t _cudaStream;
cudnnDataType_t _dataType;
int _n, _InC, _d, _h, _w, _winSzD, _winSzH, _winSzW, _padD, _padH, _padW, _strideD, _strideH, _strideW, _OutC, _OutD, _OutH, _OutW, _dilationD, _dilationH, _dilationW, _group, _padMode;
cudnnTensorDescriptor_t _srcTensorDesc, _dstTensorDesc;
cudnnFilterDescriptor_t _filterDesc;
cudnnConvolutionDescriptor_t _convDesc;
cudnnConvolutionFwdAlgoPerf_t _algoFwd;
mtype* srcData, * filterData, * dstData, * buff;
size_t szSrc, szfilter, szDst;
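// Pick the best forward algo: first prefer a deterministic Tensor-Core algo that
// fits in the workspace, then progressively relax determinism and memory limits.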
template <typename algoPerf_t>
int getBestAlgoIndex(algoPerf_t* perfs, int retAlgoCount, size_t limitMem, cudnnDataType_t mType) {
int algoFlag = 0;
int bestPerfIndex = 0;
int flag = (mType == CUDNN_DATA_HALF) || (mType == CUDNN_DATA_BFLOAT16);
for (int i = 0; i < retAlgoCount; i++) {
if (perfs[i].status == CUDNN_STATUS_SUCCESS &&
(flag ? perfs[i].mathType == CUDNN_TENSOR_OP_MATH : 1) &&
perfs[i].determinism == CUDNN_DETERMINISTIC && (!limitMem || perfs[i].memory < limitMem)) {
algoFlag = 1; bestPerfIndex = i; break;
}
}
if (algoFlag == 0) {
for (int i = 0; i < retAlgoCount; i++) {
if (perfs[i].status == CUDNN_STATUS_SUCCESS &&
(flag ? perfs[i].mathType == CUDNN_TENSOR_OP_MATH : 1) &&
(!limitMem || perfs[i].memory < limitMem)) {
algoFlag = 1; bestPerfIndex = i; break;
}
}
if (algoFlag == 1) { printf(" algo found but NOT DETERMINISTIC "); }
}
if (algoFlag == 0) {
for (int i = 0; i < retAlgoCount; i++) {
if (perfs[i].status == CUDNN_STATUS_SUCCESS) {
algoFlag = 1; bestPerfIndex = i; break;
}
}
if (algoFlag == 1) { printf(" algo found but not enough memory"); }
}
if (algoFlag == 0) {
printf("ERR: algo not found");
//system("pause");
}
return bestPerfIndex;
}
void setConvolutionTensor(int n, int InC, int d, int h, int w, int winSzD, int winSzH, int winSzW, int padD, int padH, int padW, int strideD,
int strideH, int strideW, int OutC, int OutD, int OutH, int OutW, int dilationD, int dilationH, int dilationW, int group, int padMode, cudnnDataType_t comtype)
{
_dataType = comtype;
_n = n; _InC = InC; _h = h; _w = w; _d = d, _winSzW = winSzW; _winSzH = winSzH; _winSzD = winSzD, _padW = padW, _padH = padH, _padD = padD;
_strideD = strideD; _strideW = strideW; _strideH = strideH; _OutC = OutC; _dilationW = dilationW; _dilationH = dilationH; _dilationD = dilationD, _group = group;
_OutD = 1; _d = 1; _winSzD = 1;
int dimSrc[4] = { n,InC,h,w };
int strideSrc[4] = { h * w * InC, h * w, w,1 };
checkCUDNN(cudnnSetTensorNdDescriptor(_srcTensorDesc, _dataType, 4, dimSrc, strideSrc));
int filterA[4] = { _OutC, _InC / _group,_winSzH,_winSzW };
checkCUDNN(cudnnSetFilterNdDescriptor(_filterDesc, _dataType, CUDNN_TENSOR_NCHW, 4, filterA));
int padA[2] = { _padH,_padW };
int strideA[2] = { _strideH,_strideW };
int dilationA[2] = { _dilationH,_dilationW };
cudnnDataType_t convType = _dataType;
if (convType == CUDNN_DATA_BFLOAT16) {
convType = CUDNN_DATA_FLOAT;
}
checkCUDNN(cudnnSetConvolutionNdDescriptor(_convDesc, 2, padA, strideA, dilationA, CUDNN_CROSS_CORRELATION, convType));
#if CUDNN_VERSION > 7000
if (_dataType == CUDNN_DATA_HALF || _dataType == CUDNN_DATA_BFLOAT16) {
checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_TENSOR_OP_MATH));
}
else if (_dataType == CUDNN_DATA_FLOAT) {
checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
}
else {
checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_DEFAULT_MATH));
}
#endif
cudnnSetConvolutionGroupCount(_convDesc, group);
//if (group > 1) { cudnnSetConvolutionGroupCount(_convDesc, group); };
//cudaDeviceSynchronize();
int outDim[4] = { 0 };
checkCUDNN(cudnnGetConvolutionNdForwardOutputDim(_convDesc, _srcTensorDesc, _filterDesc, 4, outDim));
n = outDim[0];
_OutC = outDim[1];
_OutH = outDim[2];
_OutW = outDim[3];
int dimDst[4] = { n,OutC,_OutH,_OutW };
int strideDst[4] = { _OutH * _OutW * _OutC, _OutH * _OutW,_OutW,1 };
checkCUDNN(cudnnSetTensorNdDescriptor(_dstTensorDesc, _dataType, 4, dimDst, strideDst));
if ((OutH != _OutH && OutH != 0) || (OutW != _OutW && OutW != 0)) { printf("err in convDim"); }
cudaDeviceSynchronize();
if (srcData) { cudaFreeAsync(srcData,_cudaStream); }
if (dstData) { cudaFreeAsync(dstData,_cudaStream); }
if (filterData) { cudaFreeAsync(filterData,_cudaStream); }
szSrc = n * InC * h * w;
szDst = n * OutC * _OutH * _OutW;
szfilter = InC * OutC / group * winSzH * winSzW;
cudaMallocAsync(&srcData, szSrc * sizeof(mtype),_cudaStream);
cudaMallocAsync(&dstData, szDst * sizeof(mtype) , _cudaStream);
cudaMallocAsync(&filterData, szfilter * sizeof(mtype), _cudaStream);
cudnnConvolutionFwdAlgoPerf_t perfs[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
int retAlgoCount = -1;
checkCUDNN(cudnnFindConvolutionForwardAlgorithmEx(_cudnnHandle,
_srcTensorDesc, srcData, _filterDesc, filterData, _convDesc, _dstTensorDesc,
dstData, CUDNN_CONVOLUTION_FWD_ALGO_COUNT, &retAlgoCount, perfs, buff, MAX_Buffsz));
_algoFwd = perfs[getBestAlgoIndex(perfs, retAlgoCount, MAX_Buffsz, _dataType)];
}
void createHandle(int i) {
cudaSetDevice(i);
cudnnCreate(&_cudnnHandle);
cudaStreamCreate(&_cudaStream);
cudnnSetStream(_cudnnHandle, _cudaStream);
}
void createTensor(int i )
{
checkCUDNN(cudnnCreateTensorDescriptor(&_srcTensorDesc));
checkCUDNN(cudnnCreateTensorDescriptor(&_dstTensorDesc));
checkCUDNN(cudnnCreateFilterDescriptor(&_filterDesc));
checkCUDNN(cudnnCreateConvolutionDescriptor(&_convDesc));
}
void setTensor() {
int n = 1, c = 256, h = 256, w = 256, oc = 256, winSz = 5, stride = 1;
int group = 1;// oc / 32;
setConvolutionTensor(n, c, 1, h, w, 1, winSz, winSz, 0, (winSz - 1) / 2, (winSz - 1) / 2, 1,
stride, stride, oc, 1, 0, 0, 1, 1, 1, group, 0, datatype);
}
void destroyTensor()
{
checkCUDNN(cudnnDestroyConvolutionDescriptor(_convDesc));
checkCUDNN(cudnnDestroyFilterDescriptor(_filterDesc));
checkCUDNN(cudnnDestroyTensorDescriptor(_dstTensorDesc));
checkCUDNN(cudnnDestroyTensorDescriptor(_srcTensorDesc));
cudaStreamDestroy(_cudaStream);
cudnnDestroy(_cudnnHandle);
}
int mymain(int i) {
#define NEWLEN 10000
#define Front_New
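// Front_New is defined: the large pageable host allocation below happens BEFORE any CUDA/cuDNN work.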
#ifdef Front_New
int** tbf = new int* [NEWLEN];
for (int i = 0; i < NEWLEN; i++) {
tbf[i] = new int[5000];
memset(tbf[i], 1, sizeof(int) * 5000);
}
#endif
size_t wsSize;
size_t sss = 0;
cudaSetDevice(i);
createTensor(i);
cudaMallocAsync(&buff, MAX_Buffsz,_cudaStream);
srcData = 0; dstData = 0; filterData = 0;
setTensor();
mtype* x = new mtype[szSrc];
mtype* y = new mtype[szfilter];
mtype* z = new mtype[szDst];
for (int i = 0; i < szSrc; i++) {
x[i] = 1.0;
}
for (int i = 0; i < szfilter; i++) {
y[i] = 1.0;
}
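// Upload the test data, then zero the device buffers; the numerical result is irrelevant, only the timing matters here.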
cudaMemcpyAsync(srcData, x, szSrc * sizeof(mtype), cudaMemcpyHostToDevice, _cudaStream);
cudaMemcpyAsync(filterData, y, szfilter * sizeof(mtype), cudaMemcpyHostToDevice, _cudaStream);
cudaMemsetAsync(srcData, 0, szSrc * sizeof(mtype), _cudaStream);
cudaMemsetAsync(dstData, 0, szDst * sizeof(mtype), _cudaStream);
cudaMemsetAsync(filterData, 0, szfilter * sizeof(mtype), _cudaStream);
float alpha = 1, beta = 0;
cudaStreamSynchronize(_cudaStream);
#ifndef Front_New
int** tbf = new int* [NEWLEN];
for (int i = 0; i < NEWLEN; i++) {
tbf[i] = (int*)malloc(sizeof(int) * 5000);
memset(tbf[i], 1, sizeof(int) * 5000);
//HeapAlloc()
}
#endif
int s1 = clock();
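// Benchmark: run the same convolution forever and print the elapsed clock() time per 1000 iterations.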
while (1) {
checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(_cudnnHandle, _srcTensorDesc, _filterDesc, _convDesc,
_dstTensorDesc, _algoFwd.algo, &wsSize));
checkCUDNN(cudnnConvolutionForward(_cudnnHandle, &alpha, _srcTensorDesc, srcData, _filterDesc, filterData, _convDesc, _algoFwd.algo,
buff, MAX_Buffsz, &beta, _dstTensorDesc, dstData));
cudaStreamSynchronize(_cudaStream);
sss++;
if (sss % 1000 == 0) {
int s2 = clock();
printf("GPU ID:%d time:%d\n",i, s2 - s1);
s1 = s2;
}
}
return 0;
}
};
int main() {
//8 GPUS
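// Requires OpenMP (/openmp in Visual Studio) so all 8 GPUs run concurrently;
// without it, only GPU 0 would ever run since mymain() never returns.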
myTest A[8];
#pragma omp parallel for
for (int i = 0; i < 8; i++) {
A[i].createHandle(i);
}
#pragma omp parallel for
for (int i = 0; i < 8; i++) {
A[i].mymain(i);
}
return 0;
}
Running screenshot (image not included): the running time is normal and stays steady.
Second snippet:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_fp16.h>
#include <cudnn.h>
#include <stdio.h>
#include <time.h>
typedef half mtype;
#define datatype CUDNN_DATA_HALF
#define MAX_Buffsz 1000000000 // ~1 GB cuDNN workspace per GPU
#define _CRTDBG_MAP_ALLOC
#include <stdlib.h>
#include <crtdbg.h>
#include <string.h> // for memset
void handleErr(int t, const char* file, int line)
{
if (t == 0) { return; }
printf("err in cudnn %d %s %d", t, file, line);
getchar();
}
#define checkCUDNN(expr) { \
cudnnStatus_t _status = (expr); \
if (_status != CUDNN_STATUS_SUCCESS) { \
handleErr(_status,__FILE__,__LINE__);} \
}
class myTest {
public:
cudnnHandle_t _cudnnHandle;
cudaStream_t _cudaStream;
cudnnDataType_t _dataType;
int _n, _InC, _d, _h, _w, _winSzD, _winSzH, _winSzW, _padD, _padH, _padW, _strideD, _strideH, _strideW, _OutC, _OutD, _OutH, _OutW, _dilationD, _dilationH, _dilationW, _group, _padMode;
cudnnTensorDescriptor_t _srcTensorDesc, _dstTensorDesc;
cudnnFilterDescriptor_t _filterDesc;
cudnnConvolutionDescriptor_t _convDesc;
cudnnConvolutionFwdAlgoPerf_t _algoFwd;
mtype* srcData, * filterData, * dstData, * buff;
size_t szSrc, szfilter, szDst;
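// Pick the best forward algo: first prefer a deterministic Tensor-Core algo that
// fits in the workspace, then progressively relax determinism and memory limits.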
template <typename algoPerf_t>
int getBestAlgoIndex(algoPerf_t* perfs, int retAlgoCount, size_t limitMem, cudnnDataType_t mType) {
int algoFlag = 0;
int bestPerfIndex = 0;
int flag = (mType == CUDNN_DATA_HALF) || (mType == CUDNN_DATA_BFLOAT16);
for (int i = 0; i < retAlgoCount; i++) {
if (perfs[i].status == CUDNN_STATUS_SUCCESS &&
(flag ? perfs[i].mathType == CUDNN_TENSOR_OP_MATH : 1) &&
perfs[i].determinism == CUDNN_DETERMINISTIC && (!limitMem || perfs[i].memory < limitMem)) {
algoFlag = 1; bestPerfIndex = i; break;
}
}
if (algoFlag == 0) {
for (int i = 0; i < retAlgoCount; i++) {
if (perfs[i].status == CUDNN_STATUS_SUCCESS &&
(flag ? perfs[i].mathType == CUDNN_TENSOR_OP_MATH : 1) &&
(!limitMem || perfs[i].memory < limitMem)) {
algoFlag = 1; bestPerfIndex = i; break;
}
}
if (algoFlag == 1) { printf(" algo found but NOT DETERMINISTIC "); }
}
if (algoFlag == 0) {
for (int i = 0; i < retAlgoCount; i++) {
if (perfs[i].status == CUDNN_STATUS_SUCCESS) {
algoFlag = 1; bestPerfIndex = i; break;
}
}
if (algoFlag == 1) { printf(" algo found but not enough memory"); }
}
if (algoFlag == 0) {
printf("ERR: algo not found");
//system("pause");
}
return bestPerfIndex;
}
void setConvolutionTensor(int n, int InC, int d, int h, int w, int winSzD, int winSzH, int winSzW, int padD, int padH, int padW, int strideD,
int strideH, int strideW, int OutC, int OutD, int OutH, int OutW, int dilationD, int dilationH, int dilationW, int group, int padMode, cudnnDataType_t comtype)
{
_dataType = comtype;
_n = n; _InC = InC; _h = h; _w = w; _d = d, _winSzW = winSzW; _winSzH = winSzH; _winSzD = winSzD, _padW = padW, _padH = padH, _padD = padD;
_strideD = strideD; _strideW = strideW; _strideH = strideH; _OutC = OutC; _dilationW = dilationW; _dilationH = dilationH; _dilationD = dilationD, _group = group;
_OutD = 1; _d = 1; _winSzD = 1;
int dimSrc[4] = { n,InC,h,w };
int strideSrc[4] = { h * w * InC, h * w, w,1 };
checkCUDNN(cudnnSetTensorNdDescriptor(_srcTensorDesc, _dataType, 4, dimSrc, strideSrc));
int filterA[4] = { _OutC, _InC / _group,_winSzH,_winSzW };
checkCUDNN(cudnnSetFilterNdDescriptor(_filterDesc, _dataType, CUDNN_TENSOR_NCHW, 4, filterA));
int padA[2] = { _padH,_padW };
int strideA[2] = { _strideH,_strideW };
int dilationA[2] = { _dilationH,_dilationW };
cudnnDataType_t convType = _dataType;
if (convType == CUDNN_DATA_BFLOAT16) {
convType = CUDNN_DATA_FLOAT;
}
checkCUDNN(cudnnSetConvolutionNdDescriptor(_convDesc, 2, padA, strideA, dilationA, CUDNN_CROSS_CORRELATION, convType));
#if CUDNN_VERSION > 7000
if (_dataType == CUDNN_DATA_HALF || _dataType == CUDNN_DATA_BFLOAT16) {
checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_TENSOR_OP_MATH));
}
else if (_dataType == CUDNN_DATA_FLOAT) {
checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
}
else {
checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_DEFAULT_MATH));
}
#endif
cudnnSetConvolutionGroupCount(_convDesc, group);
//if (group > 1) { cudnnSetConvolutionGroupCount(_convDesc, group); };
//cudaDeviceSynchronize();
int outDim[4] = { 0 };
checkCUDNN(cudnnGetConvolutionNdForwardOutputDim(_convDesc, _srcTensorDesc, _filterDesc, 4, outDim));
n = outDim[0];
_OutC = outDim[1];
_OutH = outDim[2];
_OutW = outDim[3];
int dimDst[4] = { n,OutC,_OutH,_OutW };
int strideDst[4] = { _OutH * _OutW * _OutC, _OutH * _OutW,_OutW,1 };
checkCUDNN(cudnnSetTensorNdDescriptor(_dstTensorDesc, _dataType, 4, dimDst, strideDst));
if ((OutH != _OutH && OutH != 0) || (OutW != _OutW && OutW != 0)) { printf("err in convDim"); }
cudaDeviceSynchronize();
if (srcData) { cudaFreeAsync(srcData,_cudaStream); }
if (dstData) { cudaFreeAsync(dstData,_cudaStream); }
if (filterData) { cudaFreeAsync(filterData,_cudaStream); }
szSrc = n * InC * h * w;
szDst = n * OutC * _OutH * _OutW;
szfilter = InC * OutC / group * winSzH * winSzW;
cudaMallocAsync(&srcData, szSrc * sizeof(mtype),_cudaStream);
cudaMallocAsync(&dstData, szDst * sizeof(mtype) , _cudaStream);
cudaMallocAsync(&filterData, szfilter * sizeof(mtype), _cudaStream);
cudnnConvolutionFwdAlgoPerf_t perfs[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
int retAlgoCount = -1;
checkCUDNN(cudnnFindConvolutionForwardAlgorithmEx(_cudnnHandle,
_srcTensorDesc, srcData, _filterDesc, filterData, _convDesc, _dstTensorDesc,
dstData, CUDNN_CONVOLUTION_FWD_ALGO_COUNT, &retAlgoCount, perfs, buff, MAX_Buffsz));
_algoFwd = perfs[getBestAlgoIndex(perfs, retAlgoCount, MAX_Buffsz, _dataType)];
}
void createHandle(int i) {
cudaSetDevice(i);
cudnnCreate(&_cudnnHandle);
cudaStreamCreate(&_cudaStream);
cudnnSetStream(_cudnnHandle, _cudaStream);
}
void createTensor(int i )
{
checkCUDNN(cudnnCreateTensorDescriptor(&_srcTensorDesc));
checkCUDNN(cudnnCreateTensorDescriptor(&_dstTensorDesc));
checkCUDNN(cudnnCreateFilterDescriptor(&_filterDesc));
checkCUDNN(cudnnCreateConvolutionDescriptor(&_convDesc));
}
void setTensor() {
int n = 1, c = 256, h = 256, w = 256, oc = 256, winSz = 5, stride = 1;
int group = 1;// oc / 32;
setConvolutionTensor(n, c, 1, h, w, 1, winSz, winSz, 0, (winSz - 1) / 2, (winSz - 1) / 2, 1,
stride, stride, oc, 1, 0, 0, 1, 1, 1, group, 0, datatype);
}
void destroyTensor()
{
checkCUDNN(cudnnDestroyConvolutionDescriptor(_convDesc));
checkCUDNN(cudnnDestroyFilterDescriptor(_filterDesc));
checkCUDNN(cudnnDestroyTensorDescriptor(_dstTensorDesc));
checkCUDNN(cudnnDestroyTensorDescriptor(_srcTensorDesc));
cudaStreamDestroy(_cudaStream);
cudnnDestroy(_cudnnHandle);
}
int mymain(int i) {
#define NEWLEN 10000
//#define Front_New
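// Front_New is NOT defined: the same large host allocation instead happens AFTER cuDNN setup (see the #ifndef block below).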
#ifdef Front_New
int** tbf = new int* [NEWLEN];
for (int i = 0; i < NEWLEN; i++) {
tbf[i] = new int[5000];
memset(tbf[i], 1, sizeof(int) * 5000);
}
#endif
size_t wsSize;
size_t sss = 0;
cudaSetDevice(i);
createTensor(i);
cudaMallocAsync(&buff, MAX_Buffsz,_cudaStream);
srcData = 0; dstData = 0; filterData = 0;
setTensor();
mtype* x = new mtype[szSrc];
mtype* y = new mtype[szfilter];
mtype* z = new mtype[szDst];
for (int i = 0; i < szSrc; i++) {
x[i] = 1.0;
}
for (int i = 0; i < szfilter; i++) {
y[i] = 1.0;
}
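// Upload the test data, then zero the device buffers; the numerical result is irrelevant, only the timing matters here.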
cudaMemcpyAsync(srcData, x, szSrc * sizeof(mtype), cudaMemcpyHostToDevice, _cudaStream);
cudaMemcpyAsync(filterData, y, szfilter * sizeof(mtype), cudaMemcpyHostToDevice, _cudaStream);
cudaMemsetAsync(srcData, 0, szSrc * sizeof(mtype), _cudaStream);
cudaMemsetAsync(dstData, 0, szDst * sizeof(mtype), _cudaStream);
cudaMemsetAsync(filterData, 0, szfilter * sizeof(mtype), _cudaStream);
float alpha = 1, beta = 0;
cudaStreamSynchronize(_cudaStream);
#ifndef Front_New
int** tbf = new int* [NEWLEN];
for (int i = 0; i < NEWLEN; i++) {
tbf[i] = (int*)malloc(sizeof(int) * 5000);
memset(tbf[i], 1, sizeof(int) * 5000);
//HeapAlloc()
}
#endif
int s1 = clock();
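// Benchmark: run the same convolution forever and print the elapsed clock() time per 1000 iterations.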
while (1) {
checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(_cudnnHandle, _srcTensorDesc, _filterDesc, _convDesc,
_dstTensorDesc, _algoFwd.algo, &wsSize));
checkCUDNN(cudnnConvolutionForward(_cudnnHandle, &alpha, _srcTensorDesc, srcData, _filterDesc, filterData, _convDesc, _algoFwd.algo,
buff, MAX_Buffsz, &beta, _dstTensorDesc, dstData));
cudaStreamSynchronize(_cudaStream);
sss++;
if (sss % 1000 == 0) {
int s2 = clock();
printf("GPU ID:%d time:%d\n",i, s2 - s1);
s1 = s2;
}
}
return 0;
}
};
int main() {
//8 GPUS
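// Requires OpenMP (/openmp in Visual Studio) so all 8 GPUs run concurrently;
// without it, only GPU 0 would ever run since mymain() never returns.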
myTest A[8];
#pragma omp parallel for
for (int i = 0; i < 8; i++) {
A[i].createHandle(i);
}
#pragma omp parallel for
for (int i = 0; i < 8; i++) {
A[i].mymain(i);
}
return 0;
}
Running screenshot (image not included): the running time gets slower and slower; it may be related to zlib. Keep in mind that the program runs on all 8 GPUs at once. The bug disappears when using cuDNN 8.2.2.
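To double-check which libraries are actually loaded, a minimal sketch (using the standard cudnnGetVersion() and cudaRuntimeGetVersion() APIs) can print the versions at run time:

#include <cudnn.h>
#include <cuda_runtime.h>
#include <stdio.h>
// Prints the cuDNN and CUDA runtime versions linked at run time, to confirm
// whether the slowdown tracks the cuDNN version (8.2.2 fast vs 8.4.0 slow).
int main() {
    int rt = 0;
    cudaRuntimeGetVersion(&rt);
    printf("cuDNN %zu, CUDA runtime %d\n", (size_t)cudnnGetVersion(), rt);
    return 0;
}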
My environment:
CUDA 11.4
cuDNN 8.4.0
GPU: 8x RTX 3090
GPU driver: 517.40-desktop-win10-win11-64bit-international-nsd-dch-whql
RAM: 768 GB
Visual Studio 2019
Windows 10
If my code snippets have a bug, please point it out.
Thanks!