I’m trying to perform some simple convolution with cuDNN, but am having trouble getting satisfactory results. My convolution parameters are as such:
inputs: 1000 x 256 x 7 x 7 (NCHW)
kernel: 1024 x 256 x 7 x 7 (KCHW)
outputs: 1000 x 1024 x 1 x 1 (NCHW)
I’m aiming for a speed of about 0.01s for the operation. Currently, with NHWC format I’m getting about 0.13s. I thought that using NCHW would be faster, but I never got to find out as doing so would result in a ridiculous amount of workspace size required. While using NHWC only requires 12 bytes, NCHW is asking for about 19 GB of device memory, which I don’t have. Is there any reason why it is asking for so much memory? And is it actually faster to use NCHW compared to NHWC?
Here’s the code, pared down to just query the required workspace size.
#include <iostream>
#include <opencv2/opencv.hpp>
#include <cuda_runtime_api.h>
#include <cudnn.h>
// #define FORMAT CUDNN_TENSOR_NCHW
#define FORMAT CUDNN_TENSOR_NHWC
// NHWC will result in workspace size 12
// NCHW will result in workspace size 21285044224
// Evaluate a cuDNN call and abort with a file-line diagnostic on failure.
// Wrapped in do { ... } while (0) so the macro expands to a single
// statement: the original bare { } block would break when used as the
// body of an unbraced if/else (the trailing ';' at the call site becomes
// an empty statement and detaches a following 'else').
#define CheckCUDNN(expression) \
do { \
const cudnnStatus_t status = (expression); \
if (status != CUDNN_STATUS_SUCCESS) { \
std::cerr << "Error on line " << __LINE__ << ": " \
<< cudnnGetErrorString(status) << std::endl; \
std::exit(EXIT_FAILURE); \
} \
} while (0)
int main()
{
cudnnHandle_t CUDNN;
cudnnTensorDescriptor_t InputDesc;
cudnnTensorDescriptor_t OutputDesc;
cudnnFilterDescriptor_t KernelDesc;
cudnnConvolutionDescriptor_t ConvDesc;
cudnnConvolutionFwdAlgo_t ConvAlgo;
const int Batch = 9;
cv::Vec4i InDims(1000, 256, 7, 7);
cv::Vec4i KernelDims(1024, 256, 7, 7);
cv::Vec4i OutDims(1000, 1024, 1, 1);
CheckCUDNN(cudnnCreate(&CUDNN));
CheckCUDNN(cudnnCreateTensorDescriptor(&InputDesc));
CheckCUDNN(cudnnSetTensor4dDescriptor(InputDesc, FORMAT, CUDNN_DATA_FLOAT, Batch*InDims.val[0], InDims.val[1], InDims.val[2], InDims.val[3]));
CheckCUDNN(cudnnCreateTensorDescriptor(&OutputDesc));
CheckCUDNN(cudnnSetTensor4dDescriptor(OutputDesc, FORMAT, CUDNN_DATA_FLOAT, Batch*OutDims.val[0], OutDims.val[1], OutDims.val[2], OutDims.val[3]));
CheckCUDNN(cudnnCreateFilterDescriptor(&KernelDesc));
CheckCUDNN(cudnnSetFilter4dDescriptor(KernelDesc, CUDNN_DATA_FLOAT, FORMAT, KernelDims.val[0], KernelDims.val[1], KernelDims.val[2], KernelDims.val[3]));
int Padding = 0;
CheckCUDNN(cudnnCreateConvolutionDescriptor(&ConvDesc));
CheckCUDNN(cudnnSetConvolution2dDescriptor(ConvDesc, Padding, Padding, 1, 1, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
CheckCUDNN(cudnnGetConvolutionForwardAlgorithm(CUDNN, InputDesc, KernelDesc, ConvDesc, OutputDesc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &ConvAlgo));
size_t WorkspaceBytes = 0;
CheckCUDNN(cudnnGetConvolutionForwardWorkspaceSize(CUDNN, InputDesc, KernelDesc, ConvDesc, OutputDesc, ConvAlgo, &WorkspaceBytes));
std::cout << "ws size " << WorkspaceBytes << std::endl;
return 0;
}
edit: forgot to mention that I am trying to convolve a batch of 9 each time, so the effective N for the inputs and outputs is 9000. With N = 1000 the size required for NCHW is only about 2 GB, but I still find that unreasonably high, since the sum of 9×1000×256×7×7×4 bytes of inputs and 9×1000×1024×1×1×4 bytes of outputs isn’t even 1 GB.