cuDNN v6.0: backward algorithm and workspace size queries fail for 3D convolution (CUDNN_STATUS_NOT_SUPPORTED)

Hello,

I'm trying to use cuDNN v6.0 with CUDA 8.0 for 3D convolution, but every API call that selects a backward filter or backward data algorithm, or queries the corresponding workspace size, fails with CUDNN_STATUS_NOT_SUPPORTED. This error code is not documented as a possible return value of the failing functions (cudnnGetConvolutionBackwardFilterAlgorithm, cudnnGetConvolutionBackwardFilterWorkspaceSize, cudnnGetConvolutionBackwardDataAlgorithm, cudnnGetConvolutionBackwardDataWorkspaceSize) in the cuDNN v6.0 user guide (DU-06702-001_v6.0 | February 2017).

Here is a complete minimal example:

#include <iostream>
#include <cuda_runtime.h>
#include <cudnn.h>
#include <thrust/device_ptr.h>
#include <cmath>
#include <algorithm>

typedef double FloatType;

template<class FloatType>
class CUDNN_DATATYPE
{
public:
    static const int val = 1;
};

template<>
class CUDNN_DATATYPE<float>
{
public:
    static const int val = CUDNN_DATA_FLOAT;
};

template<>
class CUDNN_DATATYPE<double>
{
public:
    static const int val = CUDNN_DATA_DOUBLE;
};

inline const char* cudnnGetErrorString(cudnnStatus_t status) {
  switch (status) {
    case CUDNN_STATUS_SUCCESS:
      return "CUDNN_STATUS_SUCCESS";
    case CUDNN_STATUS_NOT_INITIALIZED:
      return "CUDNN_STATUS_NOT_INITIALIZED";
    case CUDNN_STATUS_ALLOC_FAILED:
      return "CUDNN_STATUS_ALLOC_FAILED";
    case CUDNN_STATUS_BAD_PARAM:
      return "CUDNN_STATUS_BAD_PARAM";
    case CUDNN_STATUS_INTERNAL_ERROR:
      return "CUDNN_STATUS_INTERNAL_ERROR";
    case CUDNN_STATUS_INVALID_VALUE:
      return "CUDNN_STATUS_INVALID_VALUE";
    case CUDNN_STATUS_ARCH_MISMATCH:
      return "CUDNN_STATUS_ARCH_MISMATCH";
    case CUDNN_STATUS_MAPPING_ERROR:
      return "CUDNN_STATUS_MAPPING_ERROR";
    case CUDNN_STATUS_EXECUTION_FAILED:
      return "CUDNN_STATUS_EXECUTION_FAILED";
    case CUDNN_STATUS_NOT_SUPPORTED:
      return "CUDNN_STATUS_NOT_SUPPORTED";
    case CUDNN_STATUS_LICENSE_ERROR:
      return "CUDNN_STATUS_LICENSE_ERROR";
  }
  return "Unknown cudnn status";
}

#ifndef CUDNN_CALL
//evaluate the argument only once so a failing cuDNN call is not executed twice
#define CUDNN_CALL(x) do { const cudnnStatus_t cudnn_status = (x); \
    if(cudnn_status != CUDNN_STATUS_SUCCESS) { std::cout << "cuDNN Error in " << __FILE__ << " at line " << __LINE__ << ": " << cudnnGetErrorString(cudnn_status) << std::endl; } } while(0)
#endif

#define CUDNN_CONVOLUTION_FWD_STRATEGY CUDNN_CONVOLUTION_FWD_PREFER_FASTEST
#define CUDNN_CONVOLUTION_BWD_FILTER_STRATEGY CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST
#define CUDNN_CONVOLUTION_BWD_DATA_STRATEGY CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST

const int num_images = 10;
const int num_features_in = 10;
const int num_features_out = 10;
const int depth = 10;
const int rows = 10;
const int cols = 10;

const int kernelSizeX = 5;
const int kernelSizeY = 5;
const int kernelSizeZ = 5;
const int strideX = 1;
const int strideY = 1;
const int strideZ = 1;
const int paddingX = 0;
const int paddingY = 0;
const int paddingZ = 0;
const int upscaleX = 1;
const int upscaleY = 1;
const int upscaleZ = 1;

cudnnHandle_t cudnnHandle;
cudnnTensorDescriptor_t inputTensorDesc;
cudnnTensorDescriptor_t biasTensorDesc;
cudnnTensorDescriptor_t outputTensorDesc;
cudnnFilterDescriptor_t filterDesc;
cudnnConvolutionDescriptor_t convolutionDesc;
cudnnConvolutionFwdAlgo_t forward_algo;
cudnnConvolutionBwdFilterAlgo_t backward_falgo;
cudnnConvolutionBwdDataAlgo_t backward_dalgo;

size_t workspaceSizeForward;
size_t workspaceSizeBackwardData;
size_t workspaceSizeBackwardFilter;

using namespace std;

int main(int argc, char *argv[])
{
    //create descriptors
    CUDNN_CALL(cudnnCreate(&cudnnHandle));
    CUDNN_CALL(cudnnCreateTensorDescriptor(&inputTensorDesc));
    CUDNN_CALL(cudnnCreateTensorDescriptor(&biasTensorDesc));
    CUDNN_CALL(cudnnCreateTensorDescriptor(&outputTensorDesc));
    CUDNN_CALL(cudnnCreateFilterDescriptor(&filterDesc));
    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&convolutionDesc));

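    //input tensor: 5D, NCDHW-style layout (images, features, depth, rows, cols)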
    const int dimInput[] = { num_images, num_features_in, depth, rows, cols };
    const int strideInput[] = { 1, 1, 1, 1, 1 };

    CUDNN_CALL(cudnnSetTensorNdDescriptor(inputTensorDesc,
                                          (cudnnDataType_t)CUDNN_DATATYPE<FloatType>::val,
                                          5,
                                          dimInput,
                                          strideInput));

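    //bias tensor: one bias value per output feature map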
    const int dimBias[] = { 1, num_features_out, 1, 1, 1 };
    const int strideBias[] = { 1, 1, 1, 1, 1};

    CUDNN_CALL(cudnnSetTensorNdDescriptor(biasTensorDesc,
                                          (cudnnDataType_t)CUDNN_DATATYPE<FloatType>::val,
                                          5,
                                          dimBias,
                                          strideBias));

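    //filter: (output features, input features, kZ, kY, kX)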
    const int dimFilter[] = { num_features_out, num_features_in, kernelSizeZ, kernelSizeY, kernelSizeX };
    CUDNN_CALL(cudnnSetFilterNdDescriptor(filterDesc,
                                          (cudnnDataType_t)CUDNN_DATATYPE<FloatType>::val,
                                          CUDNN_TENSOR_NCHW,
                                          5,
                                          dimFilter));

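    //3D convolution parameters per spatial dimension (Z, Y, X)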
    const int padding[] = { paddingZ, paddingY, paddingX };
    const int stride[] = { strideZ, strideY, strideX };
    const int upscale[] = { upscaleZ, upscaleY, upscaleX };

    CUDNN_CALL(cudnnSetConvolutionNdDescriptor(convolutionDesc,
                                               3,
                                               padding,
                                               stride,
                                               upscale,
                                               CUDNN_CROSS_CORRELATION,
                                               (cudnnDataType_t)CUDNN_DATATYPE<FloatType>::val));

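    //query the output dimensions of the forward convolution and describe the output tensor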
    int dimOutput[5];

    CUDNN_CALL(cudnnGetConvolutionNdForwardOutputDim(convolutionDesc,
                                                     inputTensorDesc,
                                                     filterDesc,
                                                     5,
                                                     dimOutput));

    CUDNN_CALL(cudnnSetTensorNdDescriptor(outputTensorDesc,
                                          (cudnnDataType_t)CUDNN_DATATYPE<FloatType>::val,
                                          5,
                                          dimOutput,
                                          strideInput));

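    //forward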
    CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
                                                   inputTensorDesc,
                                                   filterDesc,
                                                   convolutionDesc,
                                                   outputTensorDesc,
                                                   CUDNN_CONVOLUTION_FWD_STRATEGY,
                                                   0,
                                                   &forward_algo));

    CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
                                                       inputTensorDesc,
                                                       filterDesc,
                                                       convolutionDesc,
                                                       outputTensorDesc,
                                                       forward_algo,
                                                       &workspaceSizeForward));

    //backward filter
    CUDNN_CALL(cudnnGetConvolutionBackwardFilterAlgorithm(cudnnHandle,
                                                          inputTensorDesc,
                                                          outputTensorDesc,
                                                          convolutionDesc,
                                                          filterDesc,
                                                          CUDNN_CONVOLUTION_BWD_FILTER_STRATEGY,
                                                          0,
                                                          &backward_falgo));

    CUDNN_CALL(cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle,
                                                              inputTensorDesc,
                                                              outputTensorDesc,
                                                              convolutionDesc,
                                                              filterDesc,
                                                              backward_falgo,
                                                              &workspaceSizeBackwardFilter));

    //backward data
    CUDNN_CALL(cudnnGetConvolutionBackwardDataAlgorithm(cudnnHandle,
                                                        filterDesc,
                                                        outputTensorDesc,
                                                        convolutionDesc,
                                                        inputTensorDesc,
                                                        CUDNN_CONVOLUTION_BWD_DATA_STRATEGY,
                                                        0,
                                                        &backward_dalgo));

    CUDNN_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle,
                                                            filterDesc,
                                                            outputTensorDesc,
                                                            convolutionDesc,
                                                            inputTensorDesc,
                                                            backward_dalgo,
                                                            &workspaceSizeBackwardData));

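    //maximum workspace size over forward, backward filter and backward data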
    const size_t maxWorkSpaceSize = std::max(std::max(workspaceSizeForward, workspaceSizeBackwardFilter), workspaceSizeBackwardData);

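    //clean up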
    CUDNN_CALL(cudnnDestroy(cudnnHandle));
    CUDNN_CALL(cudnnDestroyTensorDescriptor(inputTensorDesc));
    CUDNN_CALL(cudnnDestroyTensorDescriptor(biasTensorDesc));
    CUDNN_CALL(cudnnDestroyTensorDescriptor(outputTensorDesc));
    CUDNN_CALL(cudnnDestroyFilterDescriptor(filterDesc));
    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(convolutionDesc));

    return 0;
}

I'm using a Quadro K5000 and compile with the compute_30 switch. Is this functionality simply not available yet, is it a hardware limitation, or is there a workaround?
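
For reference, I build the example roughly like this (the file name is just a placeholder):

nvcc -arch=compute_30 -o conv3d_test conv3d_test.cu -lcudnn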

Thanks,