Convolutional backward filter algorithm returns no supported results

Hello,

My question is based on the following two assumptions:

  • the tensor format NHWC is faster than NCHW
  • half precision is preferable to float when Tensor Core operations are to be used.

So my question is: why does cudnnGetConvolutionBackwardFilterAlgorithm_v7 not return a single result with the status CUDNN_STATUS_SUCCESS?

I use:
Windows 10
CUDA: 10.2
Code Generation: compute_75,sm_75
Nvidia Titan RTX
Visual Studio 2019

I hope someone can help me change the parameters to a better-supported configuration. However, I would still like to keep half precision and a data format that works without much conversion overhead.

I am thankful for any response; code for reproducing the problem is appended below.
Yours thankfully, Harald Schweiger

#include <stdlib.h>
#include <stdio.h>
#include <cudnn.h>

#define checkCU(status) _cu_exit(status, __FILE__ , __LINE__)

void _cu_exit(cudnnStatus_t status, const char* file, int line) {
    if (status != CUDNN_STATUS_SUCCESS) {
        fprintf(stderr, "CudnnError %s:%d: %s\n", file, line, cudnnGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

const char* getMathTypeName(cudnnMathType_t math) {
    switch (math) {
    case CUDNN_DEFAULT_MATH:
        return "CUDNN_DEFAULT_MATH";
    case CUDNN_TENSOR_OP_MATH:
        return "CUDNN_TENSOR_OP_MATH";
    case CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION:
        return "CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION";
    default:
        exit(EXIT_FAILURE);
    }
}

const char* getBwdFilterAlgoName(cudnnConvolutionBwdFilterAlgo_t algo) {
    switch (algo) {
    case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0:
        return "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0";
    case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1:
        return "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1";
    case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT:
        return "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT";
    case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3:
        return "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3";
    case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD:
        return "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD";
    case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED:
        return "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED";
    case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING:
        return "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING";
    case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT:
        return "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT";
    default:
        exit(EXIT_FAILURE);
    }
}

const char* getTensorFormatName(cudnnTensorFormat_t format) {
    switch (format) {
    case CUDNN_TENSOR_NCHW:
        return "CUDNN_TENSOR_NCHW";
    case CUDNN_TENSOR_NHWC:
        return "CUDNN_TENSOR_NHWC";
    case CUDNN_TENSOR_NCHW_VECT_C:
        return "CUDNN_TENSOR_NCHW_VECT_C";
    default:
        exit(EXIT_FAILURE);
    }
}

const char* getDataTypeName(cudnnDataType_t type) {
    switch (type) {
    case CUDNN_DATA_FLOAT:
        return "CUDNN_DATA_FLOAT";
    case CUDNN_DATA_DOUBLE:
        return "CUDNN_DATA_DOUBLE";
    case CUDNN_DATA_HALF:
        return "CUDNN_DATA_HALF";
    case CUDNN_DATA_INT8:
        return "CUDNN_DATA_INT8";
    case CUDNN_DATA_INT32:
        return "CUDNN_DATA_INT32";
    case CUDNN_DATA_INT8x4:
        return "CUDNN_DATA_INT8x4";
    case CUDNN_DATA_UINT8:
        return "CUDNN_DATA_UINT8";
    case CUDNN_DATA_UINT8x4:
        return "CUDNN_DATA_UINT8x4";
    case CUDNN_DATA_INT8x32:
        return "CUDNN_DATA_INT8x32";
    default:
        exit(EXIT_FAILURE);
    }
}

int main()
{
    const int n = 8;		// Batch Size
    const int c = 8;		// Input Channel
    const int k = 8;		// Output Channel
    const int h = 128;		// Image Height
    const int w = 128;		// Image Width
    const int r = 3;		// Filter Height
    const int s = 3;		// Filter Width
    const int u = 1;		// Vertical Subsample Stride
    const int v = 1;		// Horizontal  Subsample Stride
    const int pad_h = 1;	// Vertical Padding
    const int pad_w = 1;	// Horizontal Padding
    const int dil_h = 1;	// Vertical  Dilation
    const int dil_w = 1;	// Horizontal Dilation

    cudnnTensorFormat_t cudnnTensorFormat = CUDNN_TENSOR_NHWC;
    cudnnDataType_t cudnnDataType = CUDNN_DATA_HALF;

    cudnnHandle_t cudnn;
    cudnnTensorDescriptor_t xDesc, dyDesc;
    cudnnConvolutionDescriptor_t conv;
    cudnnFilterDescriptor_t dfDesc;

    checkCU(cudnnCreate(&cudnn));

    checkCU(cudnnCreateTensorDescriptor(&xDesc));
    checkCU(cudnnSetTensor4dDescriptor(xDesc, cudnnTensorFormat, cudnnDataType, n, c, h, w));

    checkCU(cudnnCreateTensorDescriptor(&dyDesc));
    checkCU(cudnnSetTensor4dDescriptor(dyDesc, cudnnTensorFormat, cudnnDataType, n, k, h, w));

    checkCU(cudnnCreateConvolutionDescriptor(&conv));
    checkCU(cudnnSetConvolutionMathType(conv, CUDNN_TENSOR_OP_MATH));
    checkCU(cudnnSetConvolution2dDescriptor(conv, pad_h, pad_w, u, v, dil_h, dil_w, CUDNN_CROSS_CORRELATION, cudnnDataType));

    checkCU(cudnnCreateFilterDescriptor(&dfDesc));
    checkCU(cudnnSetFilter4dDescriptor(dfDesc, cudnnDataType, cudnnTensorFormat, k, c, r, s));

    int count = 20;
    const int maxCount = 20;

    cudnnConvolutionBwdFilterAlgoPerf_t perfResults[maxCount];

    checkCU(cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnn, xDesc, dyDesc, conv, dfDesc, maxCount, &count, perfResults));

    printf(" |----------------------------------------------------------------------------------------------------------|\n");
    printf(" |   %-51s   %-20s   %26s|\n", getTensorFormatName(cudnnTensorFormat), getDataTypeName(cudnnDataType), "");
    printf(" |----------------------------------------------------------------------------------------------------------|\n");
    for (int i = 0; i < count; i++) {
        cudnnConvolutionBwdFilterAlgoPerf_t algoPref = perfResults[i];
        printf(" |%d: %-51s | %-20s | %-26s|\n", i, getBwdFilterAlgoName(algoPref.algo), getMathTypeName(algoPref.mathType), cudnnGetErrorString(algoPref.status));
    }
    printf(" |----------------------------------------------------------------------------------------------------------|\n");

    checkCU(cudnnDestroyFilterDescriptor(dfDesc));
    checkCU(cudnnDestroyConvolutionDescriptor(conv));
    checkCU(cudnnDestroyTensorDescriptor(dyDesc));
    checkCU(cudnnDestroyTensorDescriptor(xDesc));
    checkCU(cudnnDestroy(cudnn));

    return 0;
}

Have you tried CUDNN_DATA_FLOAT instead? Tensor ops are half in/out with single-precision compute.
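If I read that suggestion correctly, it means keeping the tensor and filter descriptors above at CUDNN_DATA_HALF / NHWC and passing CUDNN_DATA_FLOAT only as the compute type of the convolution descriptor. A minimal sketch of that change, which I have not verified on this exact setup:

/* Everything else stays as in the code above; only the compute type of the
   convolution descriptor changes from CUDNN_DATA_HALF to CUDNN_DATA_FLOAT,
   i.e. half-precision data in/out with single-precision accumulation. */
checkCU(cudnnSetConvolution2dDescriptor(conv, pad_h, pad_w, u, v, dil_h, dil_w, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
checkCU(cudnnSetConvolutionMathType(conv, CUDNN_TENSOR_OP_MATH));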

Hi,

Can you try setting the "returnedAlgoCount" value of the cudnnGetConvolutionBackwardFilterAlgorithm_v7 API to a non-zero value?

Thanks

I updated the code to provide more insight into my current situation.
These are the results:

 |----------------------------------------------------------------------------------------------------------|
 |   CUDNN_TENSOR_NHWC                                     CUDNN_DATA_HALF                                  |
 |----------------------------------------------------------------------------------------------------------|
 |0: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0                 | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_NOT_SUPPORTED|
 |1: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1                 | CUDNN_TENSOR_OP_MATH | CUDNN_STATUS_NOT_SUPPORTED|
 |2: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1                 | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_NOT_SUPPORTED|
 |3: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING        | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_NOT_SUPPORTED|
 |4: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT               | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_NOT_SUPPORTED|
 |5: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3                 | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_NOT_SUPPORTED|
 |6: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD          | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_NOT_SUPPORTED|
 |7: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_NOT_SUPPORTED|
 |8: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED | CUDNN_TENSOR_OP_MATH | CUDNN_STATUS_NOT_SUPPORTED|
 |----------------------------------------------------------------------------------------------------------|

 |----------------------------------------------------------------------------------------------------------|
 |   CUDNN_TENSOR_NHWC                                     CUDNN_DATA_FLOAT                                 |
 |----------------------------------------------------------------------------------------------------------|
 |0: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0                 | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_SUCCESS      |
 |1: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1                 | CUDNN_TENSOR_OP_MATH | CUDNN_STATUS_SUCCESS      |
 |2: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1                 | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_SUCCESS      |
 |3: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING        | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_NOT_SUPPORTED|
 |4: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT               | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_NOT_SUPPORTED|
 |5: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3                 | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_NOT_SUPPORTED|
 |6: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD          | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_NOT_SUPPORTED|
 |7: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_NOT_SUPPORTED|
 |8: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED | CUDNN_TENSOR_OP_MATH | CUDNN_STATUS_NOT_SUPPORTED|
 |----------------------------------------------------------------------------------------------------------|

 |----------------------------------------------------------------------------------------------------------|
 |   CUDNN_TENSOR_NCHW                                     CUDNN_DATA_HALF                                  |
 |----------------------------------------------------------------------------------------------------------|
 |0: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_SUCCESS      |
 |1: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED | CUDNN_TENSOR_OP_MATH | CUDNN_STATUS_SUCCESS      |
 |2: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1                 | CUDNN_TENSOR_OP_MATH | CUDNN_STATUS_SUCCESS      |
 |3: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1                 | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_SUCCESS      |
 |4: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING        | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_NOT_SUPPORTED|
 |5: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT               | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_NOT_SUPPORTED|
 |6: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3                 | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_NOT_SUPPORTED|
 |7: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD          | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_NOT_SUPPORTED|
 |8: CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0                 | CUDNN_DEFAULT_MATH   | CUDNN_STATUS_NOT_SUPPORTED|
 |----------------------------------------------------------------------------------------------------------|

The big question is: why is NCHW supported for half precision, while the format with less overhead according to the developer guide (NHWC) is not? Is this a bug, or am I confused somewhere?

You mean like this?

int count = 20;
const int maxCount = 20;

cudnnConvolutionBwdFilterAlgoPerf_t perfResults[maxCount];

checkCU(cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnn, xDesc, dyDesc, conv, dfDesc, maxCount, &count, perfResults));

I updated my code with these lines, but there was no difference in the outcome.

I suppose you mean half data with half-precision compute, a.k.a. "true half", rather than half in/out with single-precision compute, a.k.a. "pseudo half".

When the developer guide recommends NHWC for half-precision data, it is talking about pseudo half.
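With the tensors left in CUDNN_DATA_HALF / NHWC and the convolution compute type set to CUDNN_DATA_FLOAT as suggested above, you can re-run the heuristic query and pick the first entry that is actually supported. A sketch only, reusing the descriptors from the code above; the names returned, results, and chosen are just illustrative, and I have not run this on your exact cuDNN/driver combination:

/* Re-query the heuristic list for the pseudo-half configuration and take the
   first algorithm that reports CUDNN_STATUS_SUCCESS. */
int returned = 0;
cudnnConvolutionBwdFilterAlgoPerf_t results[CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT];
checkCU(cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnn, xDesc, dyDesc, conv, dfDesc, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT, &returned, results));

cudnnConvolutionBwdFilterAlgo_t chosen = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;  /* fallback */
for (int i = 0; i < returned; i++) {
    if (results[i].status == CUDNN_STATUS_SUCCESS) {
        chosen = results[i].algo;
        break;
    }
}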
