NHWC vs NCHW convolution

I’m trying to perform some simple convolution with cuDNN, but am having trouble getting satisfactory results. My convolution parameters are as such:

inputs: 1000 x 256 x 7 x 7 (NCHW)
kernel: 1024 x 256 x 7 x 7 (KCHW)
outputs: 1000 x 1024 x 1 x 1 (NCHW)

I’m aiming for a speed of about 0.01s for the operation. Currently, with NHWC format I’m getting about 0.13s. I thought that using NCHW would be faster, but I never got to find out as doing so would result in a ridiculous amount of workspace size required. While using NHWC only requires 12 bytes, NCHW is asking for about 19 GB of device memory, which I don’t have. Is there any reason why it is asking for so much memory? And is it actually faster to use NCHW compared to NHWC?

Here’s the code that just shows the workspace size required.

#include <iostream>
#include <opencv2/opencv.hpp>
#include <cuda_runtime_api.h>
#include <cudnn.h>


// NHWC will result in workspace size 12
// NCHW will result in workspace size 21285044224

// Abort with the cuDNN error string and failing line number whenever an
// expression does not return CUDNN_STATUS_SUCCESS.
// The do { ... } while (0) wrapper (a) closes the brace the original paste
// was missing and (b) makes the macro expand to a single statement, so it
// is safe inside an un-braced if/else.
#define CheckCUDNN(expression)                               \
  do {                                                       \
    cudnnStatus_t status = (expression);                     \
    if (status != CUDNN_STATUS_SUCCESS) {                    \
      std::cerr << "Error on line " << __LINE__ << ": "      \
                << cudnnGetErrorString(status) << std::endl; \
      std::exit(EXIT_FAILURE);                               \
    }                                                        \
  } while (0)

int main()
    cudnnHandle_t CUDNN;
    cudnnTensorDescriptor_t InputDesc;
    cudnnTensorDescriptor_t OutputDesc;
    cudnnFilterDescriptor_t KernelDesc;
    cudnnConvolutionDescriptor_t ConvDesc;
    cudnnConvolutionFwdAlgo_t ConvAlgo;

    const int Batch = 9;
    cv::Vec4i InDims(1000, 256, 7, 7);
    cv::Vec4i KernelDims(1024, 256, 7, 7);
    cv::Vec4i OutDims(1000, 1024, 1, 1);

    CheckCUDNN(cudnnSetTensor4dDescriptor(InputDesc, FORMAT, CUDNN_DATA_FLOAT, Batch*InDims.val[0], InDims.val[1], InDims.val[2], InDims.val[3]));

    CheckCUDNN(cudnnSetTensor4dDescriptor(OutputDesc, FORMAT, CUDNN_DATA_FLOAT, Batch*OutDims.val[0], OutDims.val[1], OutDims.val[2], OutDims.val[3]));

    CheckCUDNN(cudnnSetFilter4dDescriptor(KernelDesc, CUDNN_DATA_FLOAT, FORMAT, KernelDims.val[0], KernelDims.val[1], KernelDims.val[2], KernelDims.val[3]));

    int Padding = 0;
    CheckCUDNN(cudnnSetConvolution2dDescriptor(ConvDesc, Padding, Padding, 1, 1, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));

    CheckCUDNN(cudnnGetConvolutionForwardAlgorithm(CUDNN, InputDesc, KernelDesc, ConvDesc, OutputDesc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &ConvAlgo));

    size_t WorkspaceBytes = 0;
    CheckCUDNN(cudnnGetConvolutionForwardWorkspaceSize(CUDNN, InputDesc, KernelDesc, ConvDesc, OutputDesc, ConvAlgo, &WorkspaceBytes));
    std::cout << "ws size " << WorkspaceBytes << std::endl;

    return 0;

edit: forgot to mention that I am trying to convolve a batch of 9 each time, so the inputs and outputs N is effectively 9000. With 1000 the size required for NCHW is only 2GB, but I still find it unreasonably high, since the sum of 9x1000x256x7x7x4 bytes of inputs and 9x1000x1024x1x1x4 bytes of outputs isn’t even 1GB.


On GPUs with Tensor Cores (NVIDIA Volta and Turing), NHWC is (generally) preferred. On GPUs without Tensor Cores, NCHW is preferred.


Hi @SunilJB, thanks for the clarification. I’m running on a GeForce GTX 1080 so there shouldn’t be tensor cores right, is the workspace size required supposed to be that huge though?


Can you try using “CUDNN_CONVOLUTION_FWD_NO_WORKSPACE” or “CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT” as the cuDNN convolution forward preference?
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT returns the fastest algorithm whose workspace fits within the memory limit you specify (it is CUDNN_CONVOLUTION_FWD_PREFER_FASTEST that returns the fastest algorithm regardless of how much workspace it needs).

Please find below link for your reference:


That worked, but it still takes about 0.13 seconds to perform the convolution, which is not much different from the NHWC performance. Anyway, this is part of a TensorRT plugin I’m writing, so maybe I’ll just have to use int8 precision to speed it up. Thanks though!