I implemented a basic CUDA kernel for convolution:
template<typename T>
__global__ void convNoTensorKernel(int imgM, int imgN, int krnlM, int krnlN,
                                   const T *img, const T *krnl, T *out) {
    // One thread per output element of the "valid" convolution.
    const int outM = imgM - krnlM + 1;
    const int outN = imgN - krnlN + 1;
    const int totalIdxs = outM * outN;
    // Derive the flat index from blockDim.x so it works for any launch configuration.
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= totalIdxs) {
        return;
    }
    // Map the flat index to 2D output coordinates.
    const int outBlockX = idx % outN;
    const int outBlockY = idx / outN;
    const int imgStartIdx = outBlockX + outBlockY * imgN;
    T sum = 0;
    for (int i = 0; i < krnlM * krnlN; ++i) {
        const int krnlX = i % krnlN;
        const int krnlY = i / krnlN;
        const int rowOffset = krnlY * imgN;
        const int imgIdx = imgStartIdx + rowOffset + krnlX;
        const T imgValue = img[imgIdx];
        // Read the kernel back-to-front: a flipped kernel gives a true
        // convolution rather than a cross-correlation.
        const T krnlValue = krnl[(krnlM * krnlN) - 1 - i];
        sum += imgValue * krnlValue;
    }
    out[outBlockX + outBlockY * outN] = sum;
}
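I launch one thread per output element. A minimal launch sketch (the helper name launchConvNoTensor and the block size of 256 are my choices here; the kernel itself only relies on blockDim.x):
template<typename T>
void launchConvNoTensor(int imgM, int imgN, int krnlM, int krnlN,
                        const T *img, const T *krnl, T *out) {
    const int outM = imgM - krnlM + 1;
    const int outN = imgN - krnlN + 1;
    const int totalIdxs = outM * outN;
    const int threads = 256;                                  // assumed block size
    const int blocks = (totalIdxs + threads - 1) / threads;   // ceil(totalIdxs / threads)
    convNoTensorKernel<<<blocks, threads>>>(imgM, imgN, krnlM, krnlN, img, krnl, out);
}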
For a convolution of a 2048x2048 image with a 128x128 kernel, this takes 0.481s.
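The measurement is a sketch like the following (assumptions: d_img, d_krnl, d_out are placeholder device pointers already resident on the GPU, and the kernel is timed with CUDA events):
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
launchConvNoTensor(2048, 2048, 128, 128, d_img, d_krnl, d_out);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // ~481 ms for these sizes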
With cuDNN, the same convolution takes around 7.4s:
template<typename T>
void cuda_conv(Matrix<T> &img, Matrix<T> &krnl, Matrix<T> &out) {
    const float alpha = 1.0f;
    const float beta = 0.0f;
    // Create a cuDNN handle:
    cudnnHandle_t handle;
    cudnnCreate(&handle);
    // Create the tensor, filter, and convolution descriptors:
    cudnnTensorDescriptor_t cudnnIdesc;
    cudnnFilterDescriptor_t cudnnFdesc;
    cudnnTensorDescriptor_t cudnnOdesc;
    cudnnConvolutionDescriptor_t cudnnConvDesc;
    cudnnCreateTensorDescriptor( &cudnnIdesc );
    cudnnCreateFilterDescriptor( &cudnnFdesc );
    cudnnCreateTensorDescriptor( &cudnnOdesc );
    cudnnCreateConvolutionDescriptor( &cudnnConvDesc );
    checkCUDAError( "SetImgDescriptor failed", cudnnSetTensor4dDescriptor(cudnnIdesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, img.B(), img.C(), img.M(), img.N()) );
    checkCUDAError( "SetFilterDescriptor failed", cudnnSetFilter4dDescriptor(cudnnFdesc, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, krnl.B(), krnl.C(), krnl.M(), krnl.N()) );
    checkCUDAError( "SetOutDescriptor failed", cudnnSetTensor4dDescriptor(cudnnOdesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, out.B(), out.C(), out.M(), out.N()) );
    // No padding, unit stride, unit dilation; CUDNN_CONVOLUTION flips the kernel,
    // matching my own implementation:
    checkCUDAError( "SetConvDescriptor failed", cudnnSetConvolution2dDescriptor(cudnnConvDesc, 0, 0, 1, 1, 1, 1, CUDNN_CONVOLUTION, CUDNN_DATA_HALF) );
    // Set the math type to allow cuDNN to use Tensor Cores:
    checkCUDAError( "SetConvMathType failed", cudnnSetConvolutionMathType(cudnnConvDesc, CUDNN_TENSOR_OP_MATH) );
    // Choose a supported algorithm (note: this call benchmarks candidate algorithms):
    int algoCount = 0;
    cudnnConvolutionFwdAlgoPerf_t algoPerf;
    checkCUDAError( "GetConvForwardAlgo failed", cudnnFindConvolutionForwardAlgorithm(handle, cudnnIdesc, cudnnFdesc, cudnnConvDesc, cudnnOdesc, 1, &algoCount, &algoPerf) );
    checkCUDAError( "ConvolutionForwardAlgorithm failed", algoPerf.status );
    // Allocate the workspace:
    uint8_t *workSpace = nullptr;
    size_t workSpaceSize = 0;
    checkCUDAError( "WorkspaceSize failed", cudnnGetConvolutionForwardWorkspaceSize(handle, cudnnIdesc, cudnnFdesc, cudnnConvDesc, cudnnOdesc, algoPerf.algo, &workSpaceSize) );
    if (workSpaceSize > 0) {
        checkCUDAError( "Workspace malloc failed", cudaMalloc((void**)&workSpace, workSpaceSize) );
    }
    checkCUDAError( "Conv failed", cudnnConvolutionForward(handle, (void*)(&alpha), cudnnIdesc, img.dataOnGPU,
                                                           cudnnFdesc, krnl.dataOnGPU, cudnnConvDesc, algoPerf.algo,
                                                           workSpace, workSpaceSize, (void*)(&beta),
                                                           cudnnOdesc, out.dataOnGPU) );
    // Release everything this function created:
    if (workSpace) {
        cudaFree(workSpace);
    }
    cudnnDestroyConvolutionDescriptor(cudnnConvDesc);
    cudnnDestroyTensorDescriptor(cudnnOdesc);
    cudnnDestroyFilterDescriptor(cudnnFdesc);
    cudnnDestroyTensorDescriptor(cudnnIdesc);
    cudnnDestroy(handle);
}
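For completeness, this is roughly how it is called — a sketch, assuming Matrix<T> takes (B, C, M, N) in its constructor, manages its own dataOnGPU device pointer, and is instantiated with T = __half to match the CUDNN_DATA_HALF descriptors:
#include <cuda_fp16.h>

// Call sketch (the Matrix constructor signature is an assumption):
Matrix<__half> img(1, 1, 2048, 2048);                       // B, C, M, N
Matrix<__half> krnl(1, 1, 128, 128);
Matrix<__half> out(1, 1, 2048 - 128 + 1, 2048 - 128 + 1);   // "valid" output size
cuda_conv(img, krnl, out);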
Compiled with nvcc --std=c++17 -lineinfo -g -O3 -arch=sm_75 and run on an RTX 2080.
Even after skipping cudnnFindConvolutionForwardAlgorithm and hard-coding CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, it still takes around 1s.
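Concretely, that variant replaces the algorithm search with something like this (a sketch; the rest of cuda_conv stays the same):
// Pick the algorithm directly instead of calling cudnnFindConvolutionForwardAlgorithm:
const cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
size_t workSpaceSize = 0;
checkCUDAError( "WorkspaceSize failed",
                cudnnGetConvolutionForwardWorkspaceSize(handle, cudnnIdesc, cudnnFdesc,
                                                        cudnnConvDesc, cudnnOdesc, algo, &workSpaceSize) );
// ...and pass `algo` instead of algoPerf.algo to cudnnConvolutionForward.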
Are there errors in my cuDNN code that I am not aware of?
Any hints or tips would be greatly appreciated.