All algorithms are CUDNN_STATUS_NOT_SUPPORTED

Every algorithm returned by cudnnFindConvolutionForwardAlgorithmEx has status CUDNN_STATUS_NOT_SUPPORTED when I use a 3D convolution in half (FP16) mode.
input N:1 C:3 D:32 H:224 W:224
output N:1 C:32 D:32 H:112 W:112
Ksize (3,3,3) stride(1,2,2) padding(1,1,1)

cuda 11.4
cudnn version 8.2.2
GPU RTX2070 Super
windows 10

Hi,

Are you sure that CUDNN_STATUS_NOT_SUPPORTED is returned by cudnnFindConvolutionForwardAlgorithmEx?
Please refer to the following doc which tells about inputs and return info for the above function.
Please make sure, you’re sending the right input parameters.

Also, could you please share with us the output logs and, if possible, a minimal issue repro.

Thank you.

Every algorithm returned by cudnnFindConvolutionForwardAlgorithmEx has status CUDNN_STATUS_NOT_SUPPORTED.

GPU : RTX 2070 super compute capability 7.5
CUDA 11.4
CUDNN 8.2.2
Visual studio 2019

This is my code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_fp16.h>
#include <cudnn.h>
#include <stdio.h>
#include <time.h>
typedef half mtype;
#define datatype CUDNN_DATA_HALF
#define MAX_Buffsz 1000000000

// Report a non-success cuDNN status code together with its source location,
// then wait for a keypress so the message stays visible in the console.
// A status of zero (CUDNN_STATUS_SUCCESS) is silently ignored.
void handleErr(int t, const char* file, int line)
{
	if (t != 0) {
		printf("err in cudnn %d %s %d", t, file, line);
		getchar();
	}
}
// Check a cuDNN status value and report failures with the call site.
// Wrapped in do { ... } while (0) so the macro expands to exactly one
// statement: the original bare-brace form made
//   if (cond) checkCUDNN(x); else ...
// ill-formed (the semicolon after the block detaches the else).
#define checkCUDNN(status) do {										\
	if (status != CUDNN_STATUS_SUCCESS) {							\
	 handleErr(status,__FILE__,__LINE__);}							\
} while (0)
class myTest {
public:
	cudnnHandle_t _cudnnHandle;
	cudnnDataType_t _dataType;
	int _n, _InC, _d, _h, _w, _winSzD, _winSzH, _winSzW, _padD, _padH, _padW, _strideD, _strideH, _strideW, _OutC, _OutD, _OutH, _OutW, _dilationD, _dilationH, _dilationW, _group, _padMode;
	cudnnTensorDescriptor_t _srcTensorDesc, _dstTensorDesc;
	cudnnFilterDescriptor_t _filterDesc;
	cudnnConvolutionDescriptor_t _convDesc;
	cudnnConvolutionFwdAlgoPerf_t _algoFwd;
	mtype* srcData, * filterData, * dstData, * buff;
	size_t szSrc, szfilter, szDst;

	template <typename algoPerf_t>
	int getBestAlgoIndex(algoPerf_t* perfs, int retAlgoCount, size_t limitMem, cudnnDataType_t mType) {
		int algoFlag = 0;
		int bestPerfIndex = 0;
		int flag = (mType == CUDNN_DATA_HALF) || (mType == CUDNN_DATA_BFLOAT16);
		for (int i = 0; i < retAlgoCount; i++) {
			if (perfs[i].status == CUDNN_STATUS_SUCCESS &&
				(flag ? perfs[i].mathType == CUDNN_TENSOR_OP_MATH : 1) &&
				perfs[i].determinism == CUDNN_DETERMINISTIC && (!limitMem || perfs[i].memory < limitMem)) {
				algoFlag = 1; bestPerfIndex = i; break;
			}
		}
		if (algoFlag == 0) {
			for (int i = 0; i < retAlgoCount; i++) {
				if (perfs[i].status == CUDNN_STATUS_SUCCESS &&
					(flag ? perfs[i].mathType == CUDNN_TENSOR_OP_MATH : 1) &&
					(!limitMem || perfs[i].memory < limitMem)) {
					algoFlag = 1; bestPerfIndex = i; break;
				}
			}
			if (algoFlag == 1) { printf(" algo found but NOT DETERMINISTIC "); }
		}
		if (algoFlag == 0) {
			for (int i = 0; i < retAlgoCount; i++) {
				if (perfs[i].status == CUDNN_STATUS_SUCCESS) {
					algoFlag = 1; bestPerfIndex = i; break;
				}
			}
			if (algoFlag == 1) { printf(" algo found but not enough memory"); }
		}
		if (algoFlag == 0) {
			printf("ERR: algo not found");
			//system("pause"); 
		}
		return bestPerfIndex;
	}
	void setConvolutionTensor(int n, int InC, int d, int h, int w, int winSzD, int winSzH, int winSzW, int padD, int padH, int padW, int strideD,
		int strideH, int strideW, int OutC, int OutD, int OutH, int OutW, int dilationD, int dilationH, int dilationW, int group, int padMode, cudnnDataType_t comtype)
	{
		_dataType = comtype;
		_n = n; _InC = InC; _h = h; _w = w; _d = d, _winSzW = winSzW; _winSzH = winSzH; _winSzD = winSzD, _padW = padW, _padH = padH, _padD = padD;
		_strideD = strideD; _strideW = strideW; _strideH = strideH; _OutC = OutC; _dilationW = dilationW; _dilationH = dilationH; _dilationD = dilationD, _group = group;

		int dimSrc[5] = { n,InC,d,h,w };
		int strideSrc[5] = { d * h * w * InC,d * h * w, h * w, w,1 };
		checkCUDNN(cudnnSetTensorNdDescriptor(_srcTensorDesc, _dataType, 5, dimSrc, strideSrc));
		int filterA[5] = { _OutC, _InC / _group, _winSzD,_winSzH,_winSzW };
		checkCUDNN(cudnnSetFilterNdDescriptor(_filterDesc, _dataType, CUDNN_TENSOR_NCHW, 5, filterA));

		int padA[3] = { _padD,_padH,_padW };
		int strideA[3] = { _strideD,_strideH,_strideW };
		int dilationA[3] = { _dilationD,_dilationH,_dilationW };
		cudnnDataType_t convType = _dataType;
		if (convType == CUDNN_DATA_BFLOAT16) {
			convType = CUDNN_DATA_FLOAT;
		}
		checkCUDNN(cudnnSetConvolutionNdDescriptor(_convDesc, 3, padA, strideA, dilationA, CUDNN_CROSS_CORRELATION, convType));

#if CUDNN_VERSION > 7000
		if (_dataType == CUDNN_DATA_HALF || _dataType == CUDNN_DATA_BFLOAT16) {
			checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_TENSOR_OP_MATH));
		}
		else if (_dataType == CUDNN_DATA_FLOAT) {
			checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
		}
		else {
			checkCUDNN(cudnnSetConvolutionMathType(_convDesc, CUDNN_DEFAULT_MATH));
		}
#endif
		cudnnSetConvolutionGroupCount(_convDesc, group);

		int outDim[5] = { 0 };
		checkCUDNN(cudnnGetConvolutionNdForwardOutputDim(_convDesc, _srcTensorDesc, _filterDesc, 5, outDim));
		n = outDim[0];
		_OutC = outDim[1];
		_OutD = outDim[2];
		_OutH = outDim[3];
		_OutW = outDim[4];

		int dimDst[5] = { n,OutC,_OutD,_OutH,_OutW };
		int strideDst[5] = { _OutH * _OutW * _OutD * _OutC, _OutH * _OutW * _OutD, _OutH * _OutW,_OutW,1 };
		checkCUDNN(cudnnSetTensorNdDescriptor(_dstTensorDesc, _dataType, 5, dimDst, strideDst));
		if ((OutH != _OutH && OutH != 0) || (OutW != _OutW && OutW != 0) || (OutD != _OutD && OutD != 0)) { printf("err in comvDim"); }
		if (srcData) { cudaFree(srcData); }
		if (dstData) { cudaFree(dstData); }
		if (filterData) { cudaFree(filterData); }
		szSrc = n * InC * d * h * w;
		szDst = n * _OutC * _OutD * _OutH * _OutW;
		szfilter = InC * OutC / group * winSzD * winSzH * winSzW;
		cudaMalloc(&srcData, szSrc * sizeof(mtype));
		cudaMalloc(&dstData, szDst * sizeof(mtype));
		cudaMalloc(&filterData, szfilter * sizeof(mtype));

		cudnnConvolutionFwdAlgoPerf_t perfs[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
		int retAlgoCount = -1;
		checkCUDNN(cudnnFindConvolutionForwardAlgorithmEx(_cudnnHandle,
			_srcTensorDesc, srcData, _filterDesc, filterData, _convDesc, _dstTensorDesc,
			dstData, CUDNN_CONVOLUTION_FWD_ALGO_COUNT, &retAlgoCount, perfs, buff, MAX_Buffsz));
		_algoFwd = perfs[getBestAlgoIndex(perfs, retAlgoCount, MAX_Buffsz, _dataType)];
	}

	int mymain(int i) {
		cudaSetDevice(i);
		srcData = 0; dstData = 0; filterData = 0;
		cudnnCreate(&_cudnnHandle);
		checkCUDNN(cudnnCreateTensorDescriptor(&_srcTensorDesc));
		checkCUDNN(cudnnCreateTensorDescriptor(&_dstTensorDesc));
		checkCUDNN(cudnnCreateFilterDescriptor(&_filterDesc));
		checkCUDNN(cudnnCreateConvolutionDescriptor(&_convDesc));
		//Malloc workspace
		cudaMalloc(&buff, MAX_Buffsz);
		int n = 1, d = 32, c = 3, h = 224, w = 224, oc = 32, winSz = 3, stride = 2;
		int group = 1;
		setConvolutionTensor(n, c, d, h, w, winSz, winSz, winSz, (winSz - 1) / 2, (winSz - 1) / 2, (winSz - 1) / 2, 1,
			stride, stride, oc, 0, 0, 0, 1, 1, 1, group, 0, datatype);
		return 0;
	}

};

// Entry point: run the repro on device 0. Uses `int main` — `void main()`
// is ill-formed in standard C++ (main must return int).
int main() {
//#pragma omp parallel for
	for (int i = 0; i < 1; i++) {
		myTest A;
		A.mymain(i);
	}
	return 0;
}

This is the log file:


I! CuDNN (v8202) function cudnnCreate() called:
i!     handle: location=host; addr=0000000AA64FFD50;
i! Time: 2022-09-08T10:15:47.373233 (0d+0h+0m+0s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.


I! CuDNN (v8202) function cudnnCreateTensorDescriptor() called:
i! Time: 2022-09-08T10:15:48.060786 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.


I! CuDNN (v8202) function cudnnCreateTensorDescriptor() called:
i! Time: 2022-09-08T10:15:48.060786 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.


I! CuDNN (v8202) function cudnnCreateFilterDescriptor() called:
i! Time: 2022-09-08T10:15:48.060786 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.


I! CuDNN (v8202) function cudnnCreateConvolutionDescriptor() called:
i!     convDesc: location=host; addr=0000000AA64FFDD0;
i! Time: 2022-09-08T10:15:48.091920 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.


I! CuDNN (v8202) function cudnnSetTensorNdDescriptor() called:
i!     dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!     nbDims: type=int; val=5;
i!     dimA: type=int; val=[1,3,32,224,224];
i!     strideA: type=int; val=[4816896,1605632,50176,224,1];
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.


I! CuDNN (v8202) function cudnnSetFilterNdDescriptor() called:
i!     dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!     format: type=cudnnTensorFormat_t; val=CUDNN_TENSOR_NCHW (0);
i!     nbDims: type=int; val=5;
i!     filterDimA: type=int; val=[32,3,3,3,3];
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.


I! CuDNN (v8202) function cudnnSetConvolutionNdDescriptor() called:
i!     convDesc: location=host; addr=0000017435BFC6A0;
i!     arrayLength: type=int; val=3;
i!     padA: type=int; val=[1,1,1];
i!     strideA: type=int; val=[1,2,2];
i!     dilationA: type=int; val=[1,1,1];
i!     mode: type=cudnnConvolutionMode_t; val=CUDNN_CROSS_CORRELATION (1);
i!     dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.


I! CuDNN (v8202) function cudnnSetConvolutionMathType() called:
i!     convDesc: location=host; addr=0000017435BFC6A0;
i!     mathType: type=cudnnMathType_t; val=CUDNN_TENSOR_OP_MATH (1);
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.


I! CuDNN (v8202) function cudnnSetConvolutionGroupCount() called:
i!     convDesc: location=host; addr=0000017435BFC6A0;
i!     groupCount: type=int; val=1;
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.


I! CuDNN (v8202) function cudnnGetConvolutionNdForwardOutputDim() called:
i!     convDesc: type=cudnnConvolutionDescriptor_t:
i!         mode: type=cudnnConvolutionMode_t; val=CUDNN_CROSS_CORRELATION (1);
i!         dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!         mathType: type=cudnnMathType_t; val=CUDNN_TENSOR_OP_MATH (1);
i!         reorderType: type=int; val=0;
i!         arrayLength: type=int; val=3;
i!         padA: type=int; val=[1,1,1];
i!         strideA: type=int; val=[1,2,2];
i!         dilationA: type=int; val=[1,1,1];
i!         groupCount: type=int; val=1;
i!     inputTensorDesc: type=cudnnTensorDescriptor_t:
i!         dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!         nbDims: type=int; val=5;
i!         dimA: type=int; val=[1,3,32,224,224];
i!         strideA: type=int; val=[4816896,1605632,50176,224,1];
i!     filterDesc: type=cudnnFilterDescriptor_t:
i!         dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!         vect: type=int; val=0;
i!         nbDims: type=int; val=5;
i!         dimA: type=int; val=[32,3,3,3,3];
i!         format: type=cudnnTensorFormat_t; val=CUDNN_TENSOR_NCHW (0);
i!     nbDims: type=int; val=5;
i!     tensorOuputDimA: location=host; addr=0000000AA64FF9C8;
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.


I! CuDNN (v8202) function cudnnSetTensorNdDescriptor() called:
i!     dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!     nbDims: type=int; val=5;
i!     dimA: type=int; val=[1,32,32,112,112];
i!     strideA: type=int; val=[12845056,401408,12544,112,1];
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.


I! CuDNN (v8202) function cudnnFindConvolutionForwardAlgorithmEx() called:
i!     handle: type=cudnnHandle_t; streamId=0000000000000000 (defaultStream);
i!     srcDesc: type=cudnnTensorDescriptor_t:
i!         dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!         nbDims: type=int; val=5;
i!         dimA: type=int; val=[1,3,32,224,224];
i!         strideA: type=int; val=[4816896,1605632,50176,224,1];
i!     srcData: location=dev; addr=000000074DE00000;
i!     filterDesc: type=cudnnFilterDescriptor_t:
i!         dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!         vect: type=int; val=0;
i!         nbDims: type=int; val=5;
i!         dimA: type=int; val=[32,3,3,3,3];
i!         format: type=cudnnTensorFormat_t; val=CUDNN_TENSOR_NCHW (0);
i!     filterData: location=dev; addr=0000000711A27C00;
i!     convDesc: type=cudnnConvolutionDescriptor_t:
i!         mode: type=cudnnConvolutionMode_t; val=CUDNN_CROSS_CORRELATION (1);
i!         dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!         mathType: type=cudnnMathType_t; val=CUDNN_TENSOR_OP_MATH (1);
i!         reorderType: type=int; val=0;
i!         arrayLength: type=int; val=3;
i!         padA: type=int; val=[1,1,1];
i!         strideA: type=int; val=[1,2,2];
i!         dilationA: type=int; val=[1,1,1];
i!         groupCount: type=int; val=1;
i!     destDesc: type=cudnnTensorDescriptor_t:
i!         dataType: type=cudnnDataType_t; val=CUDNN_DATA_HALF (2);
i!         nbDims: type=int; val=5;
i!         dimA: type=int; val=[1,32,32,112,112];
i!         strideA: type=int; val=[12845056,401408,12544,112,1];
i!     destData: location=dev; addr=000000074E800000;
i!     requestedAlgoCount: type=int; val=8;
i!     returnedAlgoCount: location=host; addr=0000000AA64FF9C0;
i!     perfResults: location=host; addr=0000000AA64FFA90;
i!     workSpace: location=dev; addr=0000000712400000;
i!     workSpaceSizeInBytes: type=unsigned long long; val=1000000000;
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=0; Handle=000001747B964CD0; StreamId=0000000000000000 (defaultStream).


I! CuDNN (v8202) function cudnnGetConvolutionForwardAlgorithmMaxCount() called:
i!     handle: type=cudnnHandle_t; streamId=0000000000000000 (defaultStream);
i!     count: location=host; addr=0000000AA64FB620;
i! Time: 2022-09-08T10:15:48.107462 (0d+0h+0m+1s since start)
i! Process=25596; Thread=16148; GPU=NULL; Handle=NULL; StreamId=NULL.


I also get same result in RTX 3080 Laptop GPU. cudnn 8.4.1,cuda 11.4.

An algorithm can be found when I use PSEUDO_HALF_CONFIG mode.
TRUE_HALF_CONFIG is supported on architectures with true FP16 support, i.e., compute capability 5.3 and later — and both the RTX 2070 Super and the RTX 3080 Laptop GPU meet that requirement.

Does anyone know?