CUDNN Status Not Supported when trying to use FFT convolution but works for Winograd.

I am trying to use the cuDNN library to do a FFT convolution. The code runs when I use the Winograd convolution / the cuDNN method that selects the fastest convolution method, but when I tried to run using the FFT convolution method it does not work.

I set the forward method to FFT convolution myself.

I checked the documents and my input is in NCHW format as required for the FFT convolution. From the docs:

CUDNN_CONVOLUTION_FWD_ALGO_FFT
    xDesc Format Support: NCHW HW-packed
    yDesc Format Support: NCHW HW-packed

The error “CUDNN_STATUS_NOT_SUPPORTED” happens during the cudnnGetConvolutionForwardWorkspaceSize function call.

What is happening that causes this error when I use FFT convolution VS best or Winograd?

For reference I am using cuda 9.1, cuDNN 7. I compile with the following command on Ubuntu 16.04: nvcc -arch=sm_35 -std=c++11 -O2 -lcudnn FFT_cuDNN.cu -o conv pkg-config --cflags --libs opencv; ./conv TF.png

#include <cudnn.h>
    #include <cassert>
    #include <cstdlib>
    #include <iostream>
    #include <opencv2/opencv.hpp>
    #include <opencv2/dnn.hpp>
    
    using namespace cv;
    using namespace cv::dnn;
    
    #define checkCUDNN(expression)                               \
      {                                                          \
        cudnnStatus_t status = (expression);                     \
        if (status != CUDNN_STATUS_SUCCESS) {                    \
          std::cerr << "Error on line " << __LINE__ << ": "      \
                    << cudnnGetErrorString(status) << std::endl; \
          std::exit(EXIT_FAILURE);                               \
        }                                                        \
      }
    
    cv::Mat load_image_NCHW(const char* image_path)
    {
    	cv::Mat image = cv::imread(image_path, cv::IMREAD_COLOR);
    	image.convertTo(image, CV_32FC3);
    	cv::normalize(image,image,0,1, cv::NORM_MINMAX);
    
    	cv::Mat inputBlob = blobFromImage(image, 1.0f, cv::Size(image.rows,image.cols), cv::Scalar(0,0,0));
    	return inputBlob;
    }
    
    void save_image(const char* output_filename,
                    float* buffer,
                    int height,
                    int width) {
      cv::Mat output_image(height, width, CV_32FC3, buffer);
      // Make negative values zero.
      cv::threshold(output_image,
                    output_image,
                    /*threshold=*/0,
                    /*maxval=*/0,
                    cv::THRESH_TOZERO);
      cv::normalize(output_image, output_image, 0.0, 255.0, cv::NORM_MINMAX);
      output_image.convertTo(output_image, CV_8UC3);
      cv::imwrite(output_filename, output_image);
      std::cerr << "Wrote output to " << output_filename << std::endl;
    }
    
    int main(int argc, const char* argv[]) {
      if (argc < 2) {
        std::cerr << "usage: conv <image> [gpu=0] [sigmoid=0]" << std::endl;
        std::exit(EXIT_FAILURE);
      }
    
      int gpu_id = (argc > 2) ? std::atoi(argv[2]) : 0;
      std::cerr << "GPU: " << gpu_id << std::endl;
    
      bool with_sigmoid = (argc > 3) ? std::atoi(argv[3]) : 0;
      std::cerr << "With sigmoid: " << std::boolalpha << with_sigmoid << std::endl;
    
      // Load the image
      cv::Mat image = load_image_NCHW(argv[1]);
    
      int imgH = 600;
      int imgW = 561;
      int inC = 3;
    
      // Set GPU to use
      cudaSetDevice(gpu_id);
    
      // Create the cudnn Handle
      cudnnHandle_t cudnn;
      checkCUDNN(cudnnCreate(&cudnn));
    
      // Need a descriptor for
      // The input, kernel, and convolution
    
      cudnnTensorDescriptor_t input_descriptor;
      checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
      checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
                                            /*format=*/CUDNN_TENSOR_NCHW,
                                            /*dataType=*/CUDNN_DATA_FLOAT,
                                            /*batch_size=*/1,
                                            /*channels=*/inC,
                                            /*image_height=*/imgH,
                                            /*image_width=*/imgW));
    
      cudnnFilterDescriptor_t kernel_descriptor;
      checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
      checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
                                            /*dataType=*/CUDNN_DATA_FLOAT,
                                            /*format=*/CUDNN_TENSOR_NCHW,
                                            /*out_channels=*/3,
                                            /*in_channels=*/inC,
                                            /*kernel_height=*/3,
                                            /*kernel_width=*/3));
    
      cudnnConvolutionDescriptor_t convolution_descriptor;
      checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
      checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
                                                 /*pad_height=*/1,
                                                 /*pad_width=*/1,
                                                 /*vertical_stride=*/1,
                                                 /*horizontal_stride=*/1,
                                                 /*dilation_height=*/1,
                                                 /*dilation_width=*/1,
                                                 /*mode=*/CUDNN_CROSS_CORRELATION,
                                                 /*computeType=*/CUDNN_DATA_FLOAT));
    
      // Need to compute the output size
      int batch_size{0}, channels{0}, height{0}, width{0};
      checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convolution_descriptor,
                                                       input_descriptor,
                                                       kernel_descriptor,
                                                       &batch_size,
                                                       &channels,
                                                       &height,
                                                       &width));
    
      std::cerr << "Output Image: " << height << " x " << width << " x " << channels
                << std::endl;
    
      // Need an output descriptor
      cudnnTensorDescriptor_t output_descriptor;
      checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
      checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
                                            /*format=*/CUDNN_TENSOR_NCHW,
                                            /*dataType=*/CUDNN_DATA_FLOAT,
                                            /*batch_size=*/1,
                                            /*channels=*/3,
                                            /*image_height=*/imgH,
                                            /*image_width=*/imgW));
    
      // Need to define the forward algorithm
      cudnnConvolutionFwdAlgo_t convolution_algorithm = CUDNN_CONVOLUTION_FWD_ALGO_FFT;
    
      // Have to compute the workspace size
      size_t workspace_bytes{0};
      checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn,
                                                         input_descriptor,
                                                         kernel_descriptor,
                                                         convolution_descriptor,
                                                         output_descriptor,
                                                         convolution_algorithm,
                                                         &workspace_bytes));
    
      std::cerr << "Workspace size: " << (workspace_bytes / 1048576.0) << "MB"
                << std::endl;
      assert(workspace_bytes > 0);
    
      // Allocate the memory needed for the workspace
      void* d_workspace{nullptr};
      cudaMalloc(&d_workspace, workspace_bytes);
    
      // Allocate memory for the batch of images
      // and copy from host to device
      int image_bytes = batch_size * channels * height * width * sizeof(float);
    
      float* d_input{nullptr};
      cudaMalloc(&d_input, image_bytes);
      cudaMemcpy(d_input, image.ptr<float>(0), image_bytes, cudaMemcpyHostToDevice);
    
      // Allocate memory for the output images
      // Copy from host to device
      float* d_output{nullptr};
      cudaMalloc(&d_output, image_bytes);
      cudaMemset(d_output, 0, image_bytes);
    
      // clang-format off
      const float kernel_template[3][3] = {
        {1, 1, 1},
        {1, -8, 1},
        {1, 1, 1}
      };
      // clang-format on
    
      float h_kernel[3][3][3][3];
      for (int kernel = 0; kernel < 3; ++kernel) {
        for (int channel = 0; channel < 3; ++channel) {
          for (int row = 0; row < 3; ++row) {
            for (int column = 0; column < 3; ++column) {
              h_kernel[kernel][channel][row][column] = kernel_template[row][column];
            }
          }
        }
      }
    
      float* d_kernel{nullptr};
      cudaMalloc(&d_kernel, sizeof(h_kernel));
      cudaMemcpy(d_kernel, h_kernel, sizeof(h_kernel), cudaMemcpyHostToDevice);
    
      // Perform actual convolution
      const float alpha = 1.0f, beta = 0.0f;
    
      checkCUDNN(cudnnConvolutionForward(cudnn,
                                         &alpha,
                                         input_descriptor,
                                         d_input,
                                         kernel_descriptor,
                                         d_kernel,
                                         convolution_descriptor,
                                         convolution_algorithm,
                                         d_workspace,
                                         workspace_bytes,
                                         &beta,
                                         output_descriptor,
                                         d_output));
    
      // If wish to use sigmoid activation
      if (with_sigmoid) {
        cudnnActivationDescriptor_t activation_descriptor;
        checkCUDNN(cudnnCreateActivationDescriptor(&activation_descriptor));
        checkCUDNN(cudnnSetActivationDescriptor(activation_descriptor,
                                                CUDNN_ACTIVATION_SIGMOID,
                                                CUDNN_PROPAGATE_NAN,
                                                /*relu_coef=*/0));
        checkCUDNN(cudnnActivationForward(cudnn,
                                          activation_descriptor,
                                          &alpha,
                                          output_descriptor,
                                          d_output,
                                          &beta,
                                          output_descriptor,
                                          d_output));
        cudnnDestroyActivationDescriptor(activation_descriptor);
      }
    
      // Move results to host
      float* h_output = new float[image_bytes];
      cudaMemcpy(h_output, d_output, image_bytes, cudaMemcpyDeviceToHost);
    
      save_image("cudnn-out.png", h_output, height, width);
    
      // Free memory
      delete[] h_output;
      cudaFree(d_kernel);
      cudaFree(d_input);
      cudaFree(d_output);
      cudaFree(d_workspace);
    
      cudnnDestroyTensorDescriptor(input_descriptor);
      cudnnDestroyTensorDescriptor(output_descriptor);
      cudnnDestroyFilterDescriptor(kernel_descriptor);
      cudnnDestroyConvolutionDescriptor(convolution_descriptor);
    
      cudnnDestroy(cudnn);
    }

Figured it out, the docs state:

xDesc’s feature map height + 2 * convDesc’s zero-padding height must equal 256 or less
xDesc’s feature map width + 2 * convDesc’s zero-padding width must equal 256 or less

and my image is far too large. If it’s resized to meet these constraints, it works fine.