Why is my 'trivial' convolution kernel faster than cuDNN?

I implemented a basic CUDA kernel for convolution:

template<typename T>
__global__ void convNoTensorKernel(int imgM, int imgN, int krnlM, int krnlN, const T *img, const T *krnl, T *out) {

    const int outM = imgM - krnlM + 1;
    const int outN = imgN - krnlN + 1;
    const int totalIdxs = outM * outN;

    const int idx = threadIdx.x + blockIdx.x * MAX_CUDA_THREAD_COUNT;
    if (idx >= totalIdxs) {
        return;
    }

    const int outBlockX = idx % outN;
    const int outBlockY = (idx / outN) % outM;
    const int imgStartIdx = outBlockX + outBlockY * imgN;

    T sum = 0;
    for (int i = 0; i < krnlM * krnlN; ++i) {
        const int krnlX = i % krnlN;
        const int krnlY = (i / krnlN) % krnlM;
        const int rowOffset = krnlY * imgN;
        const int imgIdx = imgStartIdx + rowOffset + krnlX;

        const T imgValue = img[imgIdx];
        const T krnlValue = krnl[(krnlM * krnlN) - 1 - i];

        sum += imgValue * krnlValue;
    }

    out[outBlockX + outBlockY * outN] = sum;
}

For a convolution of a 2048x2048 image with a 128x128 kernel, this takes 0.481s.

With cuDNN this takes around 7.4s:

template<typename T>
void cuda_conv(Matrix<T> &img, Matrix<T> &krnl, Matrix<T> &out) {

    const float alpha = 1.0f;
    const float beta = 0.0f;

    // Create a cuDNN handle:
    cudnnHandle_t handle;
    cudnnCreate(&handle);

    // Create your tensor descriptors:
    cudnnTensorDescriptor_t cudnnIdesc;
    cudnnFilterDescriptor_t cudnnFdesc;
    cudnnTensorDescriptor_t cudnnOdesc;
    cudnnConvolutionDescriptor_t cudnnConvDesc;
    cudnnCreateTensorDescriptor( &cudnnIdesc );
    cudnnCreateFilterDescriptor( &cudnnFdesc );
    cudnnCreateTensorDescriptor( &cudnnOdesc );
    cudnnCreateConvolutionDescriptor( &cudnnConvDesc );

    checkCUDAError( "SetImgDescriptor failed", cudnnSetTensor4dDescriptor(cudnnIdesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, img.B(), img.C(), img.M(), img.N()) );

    checkCUDAError( "SetFilterDescriptor failed", cudnnSetFilter4dDescriptor(cudnnFdesc, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, krnl.B(), krnl.C(), krnl.M(), krnl.N()) );

    checkCUDAError( "SetOutDescriptor failed", cudnnSetTensor4dDescriptor(cudnnOdesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, out.B(), out.C(), out.M(), out.N()) );

    checkCUDAError( "SetConvDescriptor failed", cudnnSetConvolution2dDescriptor(cudnnConvDesc, 0, 0, 1, 1, 1, 1, CUDNN_CONVOLUTION, CUDNN_DATA_HALF) );

    // Set the math type to allow cuDNN to use Tensor Cores:
    checkCUDAError( "SetConvMathType failed", cudnnSetConvolutionMathType(cudnnConvDesc, CUDNN_TENSOR_OP_MATH) );

    // Choose a supported algorithm:
    int algoCount = 0;
    cudnnConvolutionFwdAlgoPerf_t algoPerf;
    checkCUDAError( "GetConvForwardAlgo failed", cudnnFindConvolutionForwardAlgorithm(handle, cudnnIdesc, cudnnFdesc, cudnnConvDesc, cudnnOdesc, 1, &algoCount, &algoPerf) );
    checkCUDAError( "ConvolutionForwardAlgorithm failed", algoPerf.status );

    // Allocate your workspace:
    uint8_t *workSpace = nullptr;
    size_t workSpaceSize = 0;
    checkCUDAError( "WorkspaceSize failed", cudnnGetConvolutionForwardWorkspaceSize(handle, cudnnIdesc, cudnnFdesc, cudnnConvDesc, cudnnOdesc, algoPerf.algo, &workSpaceSize) );
    if (workSpaceSize > 0) {
        checkCUDAError( "Workspace malloc failed", cudaMalloc((void**)&workSpace, workSpaceSize) );
    }

    checkCUDAError( "Conv failed", cudnnConvolutionForward(handle, (void*)(&alpha), cudnnIdesc, img.dataOnGPU,
                                        cudnnFdesc, krnl.dataOnGPU, cudnnConvDesc, algoPerf.algo,
                                        workSpace, workSpaceSize, (void*)(&beta),
                                        cudnnOdesc, out.dataOnGPU) );
}

Compiled with nvcc --std=c++17 -lineinfo -g -O3 -arch=sm_75 on RTX2080.

Even after skipping cudnnFindConvolutionForwardAlgorithm and using CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM it still takes around 1s.

Are there errors in my cuDNN code that I am not aware of?

I am very grateful for any hints or tips.

Can you provide the complete code, with timing?

I measure time with std::chrono::high_resolution_clock.
There really is not much to my code. Matrix is just a templated class holding the data in an array dataOnGPU which is allocated via cudaMalloc( (void**)&dataOnGPU, sizeof(T) * B() * C() * M() * N() ) and filled with data beforehand.

Well, I do not see std::chrono::high_resolution_clock in the code you posted. Without knowing what exactly you are measuring it is hard to help. That is why I asked for the complete code.

This is the function that launches the cuDNN code:

    Matrix<T> convcudnn(Matrix<T> &K) {
        Matrix<T> out(_B, K.B(), _M - K.M() + 1, _N - K.N() + 1);
        out.allocateGPU();

        K.toGPU();
        this->toGPU();

        cuda_conv<T>(*this, K, out);

        out.toCPU();

        return out;
    }

where this is the image and K is the kernel.

I measure

    const auto begin = std::chrono::high_resolution_clock::now();
    Matrix<half> out = IMG.convcudnn(KRNL);
    const auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now() - begin);

And this is how I launch my custom kernel function:

    Matrix<T> convNoTensor(Matrix<T> &kernel) {
        const int outM = M() - kernel.M() + 1;
        const int outN = N() - kernel.N() + 1;
        const int threads = outM * outN;
        const int blocks = std::ceil(threads / (float)MAX_CUDA_THREAD_COUNT);

        Matrix<T> out(outM, outN);
        out.allocateGPU();

        this->toGPU();
        kernel.toGPU();

        std::cout << "Launching " << threads << " threads for convolution." << std::endl;
        convNoTensorKernel<T><<<blocks, MAX_CUDA_THREAD_COUNT>>>(M(), N(), kernel.M(), kernel.N(), dataOnGPU, kernel.dataOnGPU, out.dataOnGPU);

        checkCUDAError( "PeekAtLastError for convNoTensor failed", cudaPeekAtLastError() );
        checkCUDAError( "DeviceSync for convNoTensor failed", cudaDeviceSynchronize() );

        out.toCPU();
        return out;
    }

Measuring is the same, just exchange convcudnn with convNoTensor.