I am trying to use the cuDNN library to do a FFT convolution. The code runs when I use the Winograd convolution / the cuDNN method that selects the fastest convolution method, but when I tried to run using the FFT convolution method it does not work.
I set the forward method to FFT convolution myself.
I checked the documents and my input is in NCHW format as required for the FFT convolution. From the docs:
CUDNN_CONVOLUTION_FWD_ALGO_FFT
xDesc Format Support: NCHW HW-packed
yDesc Format Support: NCHW HW-packed
The error “CUDNN_STATUS_NOT_SUPPORTED” happens during the cudnnGetConvolutionForwardWorkspaceSize function call.
What is happening that causes this error when I use FFT convolution VS best or Winograd?
For reference I am using cuda 9.1, cuDNN 7. I compile with the following command on Ubuntu 16.04: nvcc -arch=sm_35 -std=c++11 -O2 -lcudnn FFT_cuDNN.cu -o conv pkg-config --cflags --libs opencv
; ./conv TF.png
#include <cudnn.h>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
using namespace cv;
using namespace cv::dnn;
#define checkCUDNN(expression) \
{ \
cudnnStatus_t status = (expression); \
if (status != CUDNN_STATUS_SUCCESS) { \
std::cerr << "Error on line " << __LINE__ << ": " \
<< cudnnGetErrorString(status) << std::endl; \
std::exit(EXIT_FAILURE); \
} \
}
cv::Mat load_image_NCHW(const char* image_path)
{
cv::Mat image = cv::imread(image_path, cv::IMREAD_COLOR);
image.convertTo(image, CV_32FC3);
cv::normalize(image,image,0,1, cv::NORM_MINMAX);
cv::Mat inputBlob = blobFromImage(image, 1.0f, cv::Size(image.rows,image.cols), cv::Scalar(0,0,0));
return inputBlob;
}
void save_image(const char* output_filename,
float* buffer,
int height,
int width) {
cv::Mat output_image(height, width, CV_32FC3, buffer);
// Make negative values zero.
cv::threshold(output_image,
output_image,
/*threshold=*/0,
/*maxval=*/0,
cv::THRESH_TOZERO);
cv::normalize(output_image, output_image, 0.0, 255.0, cv::NORM_MINMAX);
output_image.convertTo(output_image, CV_8UC3);
cv::imwrite(output_filename, output_image);
std::cerr << "Wrote output to " << output_filename << std::endl;
}
int main(int argc, const char* argv[]) {
if (argc < 2) {
std::cerr << "usage: conv <image> [gpu=0] [sigmoid=0]" << std::endl;
std::exit(EXIT_FAILURE);
}
int gpu_id = (argc > 2) ? std::atoi(argv[2]) : 0;
std::cerr << "GPU: " << gpu_id << std::endl;
bool with_sigmoid = (argc > 3) ? std::atoi(argv[3]) : 0;
std::cerr << "With sigmoid: " << std::boolalpha << with_sigmoid << std::endl;
// Load the image
cv::Mat image = load_image_NCHW(argv[1]);
int imgH = 600;
int imgW = 561;
int inC = 3;
// Set GPU to use
cudaSetDevice(gpu_id);
// Create the cudnn Handle
cudnnHandle_t cudnn;
checkCUDNN(cudnnCreate(&cudnn));
// Need a descriptor for
// The input, kernel, and convolution
cudnnTensorDescriptor_t input_descriptor;
checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1,
/*channels=*/inC,
/*image_height=*/imgH,
/*image_width=*/imgW));
cudnnFilterDescriptor_t kernel_descriptor;
checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
/*dataType=*/CUDNN_DATA_FLOAT,
/*format=*/CUDNN_TENSOR_NCHW,
/*out_channels=*/3,
/*in_channels=*/inC,
/*kernel_height=*/3,
/*kernel_width=*/3));
cudnnConvolutionDescriptor_t convolution_descriptor;
checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
/*pad_height=*/1,
/*pad_width=*/1,
/*vertical_stride=*/1,
/*horizontal_stride=*/1,
/*dilation_height=*/1,
/*dilation_width=*/1,
/*mode=*/CUDNN_CROSS_CORRELATION,
/*computeType=*/CUDNN_DATA_FLOAT));
// Need to compute the output size
int batch_size{0}, channels{0}, height{0}, width{0};
checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convolution_descriptor,
input_descriptor,
kernel_descriptor,
&batch_size,
&channels,
&height,
&width));
std::cerr << "Output Image: " << height << " x " << width << " x " << channels
<< std::endl;
// Need an output descriptor
cudnnTensorDescriptor_t output_descriptor;
checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1,
/*channels=*/3,
/*image_height=*/imgH,
/*image_width=*/imgW));
// Need to define the forward algorithm
cudnnConvolutionFwdAlgo_t convolution_algorithm = CUDNN_CONVOLUTION_FWD_ALGO_FFT;
// Have to compute the workspace size
size_t workspace_bytes{0};
checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn,
input_descriptor,
kernel_descriptor,
convolution_descriptor,
output_descriptor,
convolution_algorithm,
&workspace_bytes));
std::cerr << "Workspace size: " << (workspace_bytes / 1048576.0) << "MB"
<< std::endl;
assert(workspace_bytes > 0);
// Allocate the memory needed for the workspace
void* d_workspace{nullptr};
cudaMalloc(&d_workspace, workspace_bytes);
// Allocate memory for the batch of images
// and copy from host to device
int image_bytes = batch_size * channels * height * width * sizeof(float);
float* d_input{nullptr};
cudaMalloc(&d_input, image_bytes);
cudaMemcpy(d_input, image.ptr<float>(0), image_bytes, cudaMemcpyHostToDevice);
// Allocate memory for the output images
// Copy from host to device
float* d_output{nullptr};
cudaMalloc(&d_output, image_bytes);
cudaMemset(d_output, 0, image_bytes);
// clang-format off
const float kernel_template[3][3] = {
{1, 1, 1},
{1, -8, 1},
{1, 1, 1}
};
// clang-format on
float h_kernel[3][3][3][3];
for (int kernel = 0; kernel < 3; ++kernel) {
for (int channel = 0; channel < 3; ++channel) {
for (int row = 0; row < 3; ++row) {
for (int column = 0; column < 3; ++column) {
h_kernel[kernel][channel][row][column] = kernel_template[row][column];
}
}
}
}
float* d_kernel{nullptr};
cudaMalloc(&d_kernel, sizeof(h_kernel));
cudaMemcpy(d_kernel, h_kernel, sizeof(h_kernel), cudaMemcpyHostToDevice);
// Perform actual convolution
const float alpha = 1.0f, beta = 0.0f;
checkCUDNN(cudnnConvolutionForward(cudnn,
&alpha,
input_descriptor,
d_input,
kernel_descriptor,
d_kernel,
convolution_descriptor,
convolution_algorithm,
d_workspace,
workspace_bytes,
&beta,
output_descriptor,
d_output));
// If wish to use sigmoid activation
if (with_sigmoid) {
cudnnActivationDescriptor_t activation_descriptor;
checkCUDNN(cudnnCreateActivationDescriptor(&activation_descriptor));
checkCUDNN(cudnnSetActivationDescriptor(activation_descriptor,
CUDNN_ACTIVATION_SIGMOID,
CUDNN_PROPAGATE_NAN,
/*relu_coef=*/0));
checkCUDNN(cudnnActivationForward(cudnn,
activation_descriptor,
&alpha,
output_descriptor,
d_output,
&beta,
output_descriptor,
d_output));
cudnnDestroyActivationDescriptor(activation_descriptor);
}
// Move results to host
float* h_output = new float[image_bytes];
cudaMemcpy(h_output, d_output, image_bytes, cudaMemcpyDeviceToHost);
save_image("cudnn-out.png", h_output, height, width);
// Free memory
delete[] h_output;
cudaFree(d_kernel);
cudaFree(d_input);
cudaFree(d_output);
cudaFree(d_workspace);
cudnnDestroyTensorDescriptor(input_descriptor);
cudnnDestroyTensorDescriptor(output_descriptor);
cudnnDestroyFilterDescriptor(kernel_descriptor);
cudnnDestroyConvolutionDescriptor(convolution_descriptor);
cudnnDestroy(cudnn);
}