Every algorithm returned by cudnnFindConvolutionForwardAlgorithmEx has status CUDNN_STATUS_NOT_SUPPORTED.
GPU: RTX 2070 Super (compute capability 7.5)
CUDA 11.4
cuDNN 8.2.2
Visual Studio 2019
This is my code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_fp16.h>
#include <cudnn.h>
#include <stdio.h>
#include <time.h>
typedef half mtype;
#define datatype CUDNN_DATA_HALF
#define MAX_Buffsz 1000000000
void handleErr(int t, const char* file, int line)
{
if (t == 0) { return; }
printf("err in cudnn %d %s %d", t, file, line);
getchar();
}
#define checkCUDNN(status) { \
if (status != CUDNN_STATUS_SUCCESS) { \
handleErr(status,__FILE__,__LINE__);} \
}
class myTest {
public:
cudnnHandle_t _cudnnHandle;
cudnnDataType_t _dataType;
int _n, _InC, _d, _h, _w, _winSzD, _winSzH, _winSzW, _padD, _padH, _padW, _strideD, _strideH, _strideW, _OutC, _OutD, _OutH, _OutW, _dilationD, _dilationH, _dilationW, _group, _padMode;
cudnnTensorDescriptor_t _srcTensorDesc, _dstTensorDesc;
cudnnFilterDescriptor_t _filterDesc;
cudnnConvolutionDescriptor_t _convDesc;
cudnnConvolutionFwdAlgoPerf_t _algoFwd;
mtype* srcData, * filterData, * dstData, * buff;
size_t szSrc, szfilter, szDst;
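// Pick the first algorithm whose status is CUDNN_STATUS_SUCCESS, preferring Tensor Core math
// for half/bfloat16, deterministic results and a workspace below limitMem, then progressively
// relaxing those criteria if nothing matches.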
template <typename algoPerf_t>
int getBestAlgoIndex(algoPerf_t* perfs, int retAlgoCount, size_t limitMem, cudnnDataType_t mType) {
int algoFlag = 0;
int bestPerfIndex = 0;
int flag = (mType == CUDNN_DATA_HALF) || (mType == CUDNN_DATA_BFLOAT16);
for (int i = 0; i < retAlgoCount; i++) {
if (perfs[i].status == CUDNN_STATUS_SUCCESS &&
(flag ? perfs[i].mathType == CUDNN_TENSOR_OP_MATH : 1) &&
perfs[i].determinism == CUDNN_DETERMINISTIC && (!limitMem || perfs[i].memory < limitMem)) {
algoFlag = 1; bestPerfIndex = i; break;
}
}
if (algoFlag == 0) {
for (int i = 0; i < retAlgoCount; i++) {
if (perfs[i].status == CUDNN_STATUS_SUCCESS &&
(flag ? perfs[i].mathType == CUDNN_TENSOR_OP_MATH : 1) &&
(!limitMem || perfs[i].memory < limitMem)) {
algoFlag = 1; bestPerfIndex = i; break;
}
}
if (algoFlag == 1) { printf(" algo found but NOT DETERMINISTIC "); }
}
if (algoFlag == 0) {
for (int i = 0; i < retAlgoCount; i++) {
if (perfs[i].status == CUDNN_STATUS_SUCCESS) {
algoFlag = 1; bestPerfIndex = i; break;
}
}
if (algoFlag == 1) { printf(" algo found but not enough memory"); }
}
if (algoFlag == 0) {
printf("ERR: algo not found");
//system("pause");
}
return bestPerfIndex;
}
void setConvolutionTensor(int n, int InC, int d, int h, int w, int winSzD, int winSzH, int winSzW, int padD, int padH, int padW, int strideD,
int strideH, int strideW, int OutC, int OutD, int OutH, int OutW, int dilationD, int dilationH, int dilationW, int group, int padMode, cudnnDataType_t comtype)
{
_dataType = comtype;
_n = n; _InC = InC; _h = h; _w = w; _d = d, _winSzW = winSzW; _winSzH = winSzH; _winSzD = winSzD, _padW = padW, _padH = padH, _padD = padD;
_strideD = strideD; _strideW = strideW; _strideH = strideH; _OutC = OutC; _dilationW = dilationW; _dilationH = dilationH; _dilationD = dilationD, _group = group;
int dimSrc[5] = { n,InC,d,h,w };
int strideSrc[5] = { d * h * w * InC,d * h * w, h * w, w,1 };
checkCUDNN(cudnnSetTensorNdDescriptor(_srcTensorDesc, _dataType, 5, dimSrc, strideSrc));
int filterA[5] = { _OutC, _InC / _group, _winSzD,_winSzH,_winSzW };
checkCUDNN(cudnnSetFilterNdDescriptor(_filterDesc, _dataType, CUDNN_TENSOR_NCHW, 5, filterA));
int padA[3] = { _padD,_padH,_padW };
int strideA[3] = { _strideD,_strideH,_strideW };
int dilationA[3] = { _dilationD,_dilationH,_dilationW };
cudnnDataType_t convType = _dataType;
if (convType == CUDNN_DATA_BFLOAT16) {
convType = CUDNN_DATA_FLOAT;
}
checkCUDNN(cudnnSetConvolutionNdDescriptor(_convDesc, 3, padA, strideA, dilationA, CUDNN_CROSS_CORRELATION, convType));
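// Request Tensor Core math: TENSOR_OP_MATH for half/bfloat16, TENSOR_OP_MATH_ALLOW_CONVERSION
// for float, default math otherwise.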
#if CUDNN_VERSION > 7000
if (_dataType == CUDNN_DATA_HALF || _dataType == CUDNN_DATA_BFLOAT16) {
checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_TENSOR_OP_MATH));
}
else if (_dataType == CUDNN_DATA_FLOAT) {
checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
}
else {
checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_DEFAULT_MATH));
}
#endif
cudnnSetConvolutionGroupCount(_convDesc, group);
int outDim[5] = { 0 };
checkCUDNN(cudnnGetConvolutionNdForwardOutputDim(_convDesc, _srcTensorDesc, _filterDesc, 5, outDim));
n = outDim[0];
_OutC = outDim[1];
_OutD = outDim[2];
_OutH = outDim[3];
_OutW = outDim[4];
int dimDst[5] = { n,OutC,_OutD,_OutH,_OutW };
int strideDst[5] = { _OutH * _OutW * _OutD * _OutC, _OutH * _OutW * _OutD, _OutH * _OutW,_OutW,1 };
checkCUDNN(cudnnSetTensorNdDescriptor(_dstTensorDesc, _dataType, 5, dimDst, strideDst));
if ((OutH != _OutH && OutH != 0) || (OutW != _OutW && OutW != 0) || (OutD != _OutD && OutD != 0)) { printf("err in convDim"); }
if (srcData) { cudaFree(srcData); }
if (dstData) { cudaFree(dstData); }
if (filterData) { cudaFree(filterData); }
szSrc = n * InC * d * h * w;
szDst = n * _OutC * _OutD * _OutH * _OutW;
szfilter = InC * OutC / group * winSzD * winSzH * winSzW;
cudaMalloc(&srcData, szSrc * sizeof(mtype));
cudaMalloc(&dstData, szDst * sizeof(mtype));
cudaMalloc(&filterData, szfilter * sizeof(mtype));
cudnnConvolutionFwdAlgoPerf_t perfs[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
int retAlgoCount = -1;
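// Benchmark every forward algorithm against the real device buffers and the preallocated
// workspace, then keep the best candidate that satisfies our constraints.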
checkCUDNN(cudnnFindConvolutionForwardAlgorithmEx(_cudnnHandle,
_srcTensorDesc, srcData, _filterDesc, filterData, _convDesc, _dstTensorDesc,
dstData, CUDNN_CONVOLUTION_FWD_ALGO_COUNT, &retAlgoCount, perfs, buff, MAX_Buffsz));
_algoFwd = perfs[getBestAlgoIndex(perfs, retAlgoCount, MAX_Buffsz, _dataType)];
}
int mymain(int i) {
cudaSetDevice(i);
srcData = 0; dstData = 0; filterData = 0;
checkCUDNN(cudnnCreate(&_cudnnHandle));
checkCUDNN(cudnnCreateTensorDescriptor(&_srcTensorDesc));
checkCUDNN(cudnnCreateTensorDescriptor(&_dstTensorDesc));
checkCUDNN(cudnnCreateFilterDescriptor(&_filterDesc));
checkCUDNN(cudnnCreateConvolutionDescriptor(&_convDesc));
//Malloc workspace
cudaMalloc(&buff, MAX_Buffsz);
int n = 1, d = 32, c = 3, h = 224, w = 224, oc = 32, winSz = 3, stride = 2;
int group = 1;
setConvolutionTensor(n, c, d, h, w, winSz, winSz, winSz, (winSz - 1) / 2, (winSz - 1) / 2, (winSz - 1) / 2, 1,
stride, stride, oc, 0, 0, 0, 1, 1, 1, group, 0, datatype);
return 0;
}
};
int main() {
//#pragma omp parallel for
for (int i = 0; i < 1; i++) {
myTest A;
A.mymain(i);
}
}
This is the log file:
I! CuDNN (v8202) function cudnnCreate() called:
i! handle: location=host; addr=0000000AA64FFD50;
i! Time: 2022-09-08T10:15:47.373233 (0d+0h+0m+0s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.
I! CuDNN (v8202) function cudnnCreateTensorDescriptor() called:
i! Time: 2022-09-08T10:15:48.060786 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.
I! CuDNN (v8202) function cudnnCreateTensorDescriptor() called:
i! Time: 2022-09-08T10:15:48.060786 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.
I! CuDNN (v8202) function cudnnCreateFilterDescriptor() called:
i! Time: 2022-09-08T10:15:48.060786 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.
I! CuDNN (v8202) function cudnnCreateConvolutionDescriptor() called:
i! convDesc: location=host; addr=0000000AA64FFDD0;
i! Time: 2022-09-08T10:15:48.091920 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.
I! CuDNN (v8202) function cudnnSetTensorNdDescriptor() called:
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! nbDims: type=int; val=5;
i! dimA: type=int; val=[1,3,32,224,224];
i! strideA: type=int; val=[4816896,1605632,50176,224,1];
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.
I! CuDNN (v8202) function cudnnSetFilterNdDescriptor() called:
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! format: type=cudnnTensorFormat_t; val=CUDNN_TENSOR_NCHW (0);
i! nbDims: type=int; val=5;
i! filterDimA: type=int; val=[32,3,3,3,3];
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.
I! CuDNN (v8202) function cudnnSetConvolutionNdDescriptor() called:
i! convDesc: location=host; addr=0000017435BFC6A0;
i! arrayLength: type=int; val=3;
i! padA: type=int; val=[1,1,1];
i! strideA: type=int; val=[1,2,2];
i! dilationA: type=int; val=[1,1,1];
i! mode: type=cudnnConvolutionMode_t; val=CUDNN_CROSS_CORRELATION (1);
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.
I! CuDNN (v8202) function cudnnSetConvolutionMathType() called:
i! convDesc: location=host; addr=0000017435BFC6A0;
i! mathType: type=cudnnMathType_t; val=CUDNN_TENSOR_OP_MATH (1);
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.
I! CuDNN (v8202) function cudnnSetConvolutionGroupCount() called:
i! convDesc: location=host; addr=0000017435BFC6A0;
i! groupCount: type=int; val=1;
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.
I! CuDNN (v8202) function cudnnGetConvolutionNdForwardOutputDim() called:
i! convDesc: type=cudnnConvolutionDescriptor_t:
i! mode: type=cudnnConvolutionMode_t; val=CUDNN_CROSS_CORRELATION (1);
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! mathType: type=cudnnMathType_t; val=CUDNN_TENSOR_OP_MATH (1);
i! reorderType: type=int; val=0;
i! arrayLength: type=int; val=3;
i! padA: type=int; val=[1,1,1];
i! strideA: type=int; val=[1,2,2];
i! dilationA: type=int; val=[1,1,1];
i! groupCount: type=int; val=1;
i! inputTensorDesc: type=cudnnTensorDescriptor_t:
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! nbDims: type=int; val=5;
i! dimA: type=int; val=[1,3,32,224,224];
i! strideA: type=int; val=[4816896,1605632,50176,224,1];
i! filterDesc: type=cudnnFilterDescriptor_t:
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! vect: type=int; val=0;
i! nbDims: type=int; val=5;
i! dimA: type=int; val=[32,3,3,3,3];
i! format: type=cudnnTensorFormat_t; val=CUDNN_TENSOR_NCHW (0);
i! nbDims: type=int; val=5;
i! tensorOuputDimA: location=host; addr=0000000AA64FF9C8;
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.
I! CuDNN (v8202) function cudnnSetTensorNdDescriptor() called:
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! nbDims: type=int; val=5;
i! dimA: type=int; val=[1,32,32,112,112];
i! strideA: type=int; val=[12845056,401408,12544,112,1];
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.
I! CuDNN (v8202) function cudnnFindConvolutionForwardAlgorithmEx() called:
i! handle: type=cudnnHandle_t; streamId=0000000000000000 (defaultStream);
i! srcDesc: type=cudnnTensorDescriptor_t:
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! nbDims: type=int; val=5;
i! dimA: type=int; val=[1,3,32,224,224];
i! strideA: type=int; val=[4816896,1605632,50176,224,1];
i! srcData: location=dev; addr=000000074DE00000;
i! filterDesc: type=cudnnFilterDescriptor_t:
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! vect: type=int; val=0;
i! nbDims: type=int; val=5;
i! dimA: type=int; val=[32,3,3,3,3];
i! format: type=cudnnTensorFormat_t; val=CUDNN_TENSOR_NCHW (0);
i! filterData: location=dev; addr=0000000711A27C00;
i! convDesc: type=cudnnConvolutionDescriptor_t:
i! mode: type=cudnnConvolutionMode_t; val=CUDNN_CROSS_CORRELATION (1);
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! mathType: type=cudnnMathType_t; val=CUDNN_TENSOR_OP_MATH (1);
i! reorderType: type=int; val=0;
i! arrayLength: type=int; val=3;
i! padA: type=int; val=[1,1,1];
i! strideA: type=int; val=[1,2,2];
i! dilationA: type=int; val=[1,1,1];
i! groupCount: type=int; val=1;
i! destDesc: type=cudnnTensorDescriptor_t:
i! dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! nbDims: type=int; val=5;
i! dimA: type=int; val=[1,32,32,112,112];
i! strideA: type=int; val=[12845056,401408,12544,112,1];
i! destData: location=dev; addr=000000074E800000;
i! requestedAlgoCount: type=int; val=8;
i! returnedAlgoCount: location=host; addr=0000000AA64FF9C0;
i! perfResults: location=host; addr=0000000AA64FFA90;
i! workSpace: location=dev; addr=0000000712400000;
i! workSpaceSizeInBytes: type=unsigned long long; val=1000000000;
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=0; Handle=000001747B964CD0; StreamId=0000000000000000 (defaultStream).
I! CuDNN (v8202) function cudnnGetConvolutionForwardAlgorithmMaxCount() called:
i! handle: type=cudnnHandle_t; streamId=0000000000000000 (defaultStream);
i! count: location=host; addr=0000000AA64FB620;
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.
I also get the same result on an RTX 3080 Laptop GPU with cuDNN 8.4.1 and CUDA 11.4.
An algorithm can be found when I use the PSEUDO_HALF_CONFIG mode (FP16 tensors with an FP32 compute type in the convolution descriptor).
According to the documentation, TRUE_HALF_CONFIG is supported on architectures with true FP16 support, meaning compute capability 5.3 and later, and both the RTX 2070 Super and the RTX 3080 Laptop GPU meet that requirement, yet every algorithm is still reported as not supported.
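To make the two configurations concrete, the only difference on my side is the compute type passed to cudnnSetConvolutionNdDescriptor; a minimal sketch, assuming _convDesc, padA, strideA and dilationA are the same ones built in setConvolutionTensor above:

// TRUE_HALF_CONFIG: FP16 tensors and FP16 compute type -- this is the variant where
// cudnnFindConvolutionForwardAlgorithmEx reports CUDNN_STATUS_NOT_SUPPORTED for every algorithm.
checkCUDNN(cudnnSetConvolutionNdDescriptor(_convDesc, 3, padA, strideA, dilationA,
    CUDNN_CROSS_CORRELATION, CUDNN_DATA_HALF));

// PSEUDO_HALF_CONFIG: FP16 tensors but FP32 compute type -- with this change the find call
// does return usable algorithms. Tensor Core math can still be requested here.
checkCUDNN(cudnnSetConvolutionNdDescriptor(_convDesc, 3, padA, strideA, dilationA,
    CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_TENSOR_OP_MATH));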