I wrote a simple convolution program with cuDNN, but the elapsed time I measure is always ~280 ms, while my own cuFFT-based implementation takes only ~10 ms. I am using cuDNN 8.8.1.3 on CUDA 11.8, GeForce 1080 Ti.
Something worth mentioning: the elapsed time barely changed across the different convolution algorithms I specified, and it did not even change when the batch size was reduced from 10 to 1. I suspect the timer is not working correctly due to some concurrency issue.
Here is my code:
#include <iostream>
#include <vector>
#include <cudnn.h>
// Checks the cudnnStatus_t returned by a cuDNN API call and aborts with a
// diagnostic (source line + cuDNN error string) on failure.
// Wrapped in do { ... } while (0) so the macro expands to a single statement
// and composes safely with if/else at the call site (the original bare-brace
// form breaks "if (cond) CHECK_CUDNN(e); else ...").
#define CHECK_CUDNN(expression)                                   \
    do {                                                          \
        cudnnStatus_t status = (expression);                      \
        if (status != CUDNN_STATUS_SUCCESS) {                     \
            std::cerr << "Error on line " << __LINE__ << ": "     \
                      << cudnnGetErrorString(status) << std::endl;\
            std::exit(EXIT_FAILURE);                              \
        }                                                         \
    } while (0)
int main() {
cudnnHandle_t cudnn;
CHECK_CUDNN(cudnnCreate(&cudnn));
// Set convolution parameters
const int batchSize = 10;
const int inputChannels = 3;
const int inputHeight = 256;
const int inputWidth = 256;
const int kernelHeight = 5;
const int kernelWidth = 5;
const int padHeight = 0;
const int padWidth = 0;
const int outputChannels = 10;
const int outputHeight = (inputHeight + 2 * padHeight - kernelHeight) + 1;
const int outputWidth = (inputWidth + 2 * padWidth - kernelWidth) + 1;
// Create tensor descriptors
cudnnTensorDescriptor_t input_descriptor;
CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
CHECK_CUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
CUDNN_TENSOR_NCHW,
CUDNN_DATA_FLOAT,
batchSize,
inputChannels,
inputHeight,
inputWidth));
cudnnFilterDescriptor_t kernel_descriptor;
CHECK_CUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
CHECK_CUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
CUDNN_DATA_FLOAT,
CUDNN_TENSOR_NCHW,
outputChannels,
inputChannels,
kernelHeight,
kernelWidth));
cudnnConvolutionDescriptor_t convolution_descriptor;
CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
CHECK_CUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
padHeight, padWidth, 1, 1, 1, 1,
CUDNN_CONVOLUTION,
CUDNN_DATA_FLOAT));
cudnnTensorDescriptor_t output_descriptor;
CHECK_CUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
CHECK_CUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
CUDNN_TENSOR_NCHW,
CUDNN_DATA_FLOAT,
batchSize,
outputChannels,
outputHeight,
outputWidth));
// Allocate memory on the device
float *d_input, *d_kernel, *d_output, *d_workspace;
size_t workspace_size;
CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn,
input_descriptor,
kernel_descriptor,
convolution_descriptor,
output_descriptor,
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
&workspace_size));
cudaMalloc(&d_input, batchSize * inputChannels * inputHeight * inputWidth * sizeof(float));
cudaMalloc(&d_kernel, outputChannels * inputChannels * kernelHeight * kernelWidth * sizeof(float));
cudaMalloc(&d_output, batchSize * outputChannels * outputHeight * outputWidth * sizeof(float));
cudaMalloc(&d_workspace, workspace_size);
// Initialize memory on the host
std::vector<float> h_input(batchSize * inputChannels * inputHeight * inputWidth);
std::vector<float> h_kernel(outputChannels * inputChannels * kernelHeight * kernelWidth);
std::vector<float> h_output(batchSize * outputChannels * outputHeight * outputWidth);
// Fill input and kernel with random values
const float a = 5.0;
const float b = 1.0;
for (int i = 0; i < h_input.size(); i++) {
h_input[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX) * a;
}
for (int i = 0; i < h_kernel.size(); i++) {
h_kernel[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX) * b;
}
// Copy input and kernel to device
cudaMemcpy(d_input, h_input.data(), batchSize * inputChannels * inputHeight * inputWidth * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, h_kernel.data(), outputChannels * inputChannels * kernelHeight * kernelWidth * sizeof(float), cudaMemcpyHostToDevice);
// Perform convolution and time it
const float alpha = 1.0f;
const float beta = 0.0f;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
//Start timer
cudaDeviceSynchronize();
cudaEventRecord(start);
CHECK_CUDNN(cudnnConvolutionForward(cudnn,
&alpha,
input_descriptor, d_input,
kernel_descriptor, d_kernel,
convolution_descriptor,
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
d_workspace, workspace_size,
&beta,
output_descriptor, d_output));
cudaEventRecord(stop);
cudaEventSynchronize(stop);
//Stop timer
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
std::cout << "Elapsed time: " << milliseconds << " ms" << std::endl;
// Copy output to host and print it
cudaMemcpy(h_output.data(), d_output, batchSize * outputChannels * outputHeight * outputWidth * sizeof(float), cudaMemcpyDeviceToHost);
// Cleanup
cudaFree(d_input);
cudaFree(d_kernel);
cudaFree(d_output);
cudaFree(d_workspace);
cudnnDestroyTensorDescriptor(input_descriptor);
cudnnDestroyFilterDescriptor(kernel_descriptor);
cudnnDestroyConvolutionDescriptor(convolution_descriptor);
cudnnDestroyTensorDescriptor(output_descriptor);
cudnnDestroy(cudnn);
return 0;
}