cuDNN runs pretty slow

I wrote a simple convolution program in cuDNN, but the elapsed time I measured is always ~280 ms, while my own implementation based on cuFFT takes only ~10 ms. I am using cuDNN 8.8.1.3 on CUDA 11.8, GeForce 1080 Ti.

One thing worth mentioning: the elapsed time did not change much across the different convolution algorithms I specified, and it did not even change when the batch size was reduced from 10 to 1. I suspect the timer is not working correctly due to some concurrency issue.

Here is my code:


#include <cstdlib>
#include <iostream>
#include <vector>

#include <cudnn.h>

// Abort with a diagnostic if a cuDNN call does not return CUDNN_STATUS_SUCCESS.
// Wrapped in do { ... } while (0) so the macro behaves as a single statement
// and is safe inside unbraced if/else bodies (a bare { ... } block would not be).
#define CHECK_CUDNN(expression)                                     \
do {                                                                \
    cudnnStatus_t check_cudnn_status_ = (expression);               \
    if (check_cudnn_status_ != CUDNN_STATUS_SUCCESS) {              \
        std::cerr << "Error on line " << __LINE__ << ": "           \
                  << cudnnGetErrorString(check_cudnn_status_)       \
                  << std::endl;                                     \
        std::exit(EXIT_FAILURE);                                    \
    }                                                               \
} while (0)

int main() {
    cudnnHandle_t cudnn;
    CHECK_CUDNN(cudnnCreate(&cudnn));

    // Set convolution parameters
    const int batchSize = 10;
    const int inputChannels = 3;
    const int inputHeight = 256;
    const int inputWidth = 256;
    const int kernelHeight = 5;
    const int kernelWidth = 5;
    const int padHeight = 0;
    const int padWidth = 0;
    const int outputChannels = 10;
    const int outputHeight = (inputHeight + 2 * padHeight - kernelHeight) + 1;
    const int outputWidth = (inputWidth + 2 * padWidth - kernelWidth) + 1;

    // Create tensor descriptors
    cudnnTensorDescriptor_t input_descriptor;
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
                                           CUDNN_TENSOR_NCHW,
                                           CUDNN_DATA_FLOAT,
                                           batchSize,
                                           inputChannels,
                                           inputHeight,
                                           inputWidth));

    cudnnFilterDescriptor_t kernel_descriptor;
    CHECK_CUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
    CHECK_CUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
                                           CUDNN_DATA_FLOAT,
                                           CUDNN_TENSOR_NCHW,
                                           outputChannels,
                                           inputChannels,
                                           kernelHeight,
                                           kernelWidth));

    cudnnConvolutionDescriptor_t convolution_descriptor;
    CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
    CHECK_CUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
                                                 padHeight, padWidth, 1, 1, 1, 1,
                                                 CUDNN_CONVOLUTION,
                                                 CUDNN_DATA_FLOAT));

    cudnnTensorDescriptor_t output_descriptor;
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
                                           CUDNN_TENSOR_NCHW,
                                           CUDNN_DATA_FLOAT,
                                           batchSize,
                                           outputChannels,
                                           outputHeight,
                                           outputWidth));

    // Allocate memory on the device
    float *d_input, *d_kernel, *d_output, *d_workspace;
    size_t workspace_size;

    CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn,
                                                        input_descriptor,
                                                        kernel_descriptor,
                                                        convolution_descriptor,
                                                        output_descriptor,
                                                        CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
                                                        &workspace_size));

    cudaMalloc(&d_input, batchSize * inputChannels * inputHeight * inputWidth * sizeof(float));
    cudaMalloc(&d_kernel, outputChannels * inputChannels * kernelHeight * kernelWidth * sizeof(float));
    cudaMalloc(&d_output, batchSize * outputChannels * outputHeight * outputWidth * sizeof(float));
    cudaMalloc(&d_workspace, workspace_size);

    // Initialize memory on the host
    std::vector<float> h_input(batchSize * inputChannels * inputHeight * inputWidth);
    std::vector<float> h_kernel(outputChannels * inputChannels * kernelHeight * kernelWidth);
    std::vector<float> h_output(batchSize * outputChannels * outputHeight * outputWidth);
    
    // Fill input and kernel with random values
    const float a = 5.0;
    const float b = 1.0;
    for (int i = 0; i < h_input.size(); i++) {
        h_input[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX) * a;
    }
    for (int i = 0; i < h_kernel.size(); i++) {
        h_kernel[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX) * b;
    }

    // Copy input and kernel to device
    cudaMemcpy(d_input, h_input.data(), batchSize * inputChannels * inputHeight * inputWidth * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_kernel, h_kernel.data(), outputChannels * inputChannels * kernelHeight * kernelWidth * sizeof(float), cudaMemcpyHostToDevice);

    // Perform convolution and time it
    const float alpha = 1.0f;
    const float beta = 0.0f;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    //Start timer
    cudaDeviceSynchronize();
    cudaEventRecord(start);

    CHECK_CUDNN(cudnnConvolutionForward(cudnn,
                                        &alpha,
                                        input_descriptor, d_input,
                                        kernel_descriptor, d_kernel,
                                        convolution_descriptor,
                                        CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
                                        d_workspace, workspace_size,
                                        &beta,
                                        output_descriptor, d_output));

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    //Stop timer

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "Elapsed time: " << milliseconds << " ms" << std::endl;

    // Copy output to host and print it
    cudaMemcpy(h_output.data(), d_output, batchSize * outputChannels * outputHeight * outputWidth * sizeof(float), cudaMemcpyDeviceToHost);

    // Cleanup
    cudaFree(d_input);
    cudaFree(d_kernel);
    cudaFree(d_output);
    cudaFree(d_workspace);

    cudnnDestroyTensorDescriptor(input_descriptor);
    cudnnDestroyFilterDescriptor(kernel_descriptor);
    cudnnDestroyConvolutionDescriptor(convolution_descriptor);
    cudnnDestroyTensorDescriptor(output_descriptor);

    cudnnDestroy(cudnn);

    return 0;

}

Problem solved. The answer is here: cuDNN8: extreamly slow first iteration of CNN training or inference

Just call cudnnCnnInferVersionCheck() to preload the kernels before the timer starts, so the timer correctly measures only the convolution time. (Equivalently, run one untimed warm-up convolution before the timed one.)

1 Like

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.