I was trying to optimize the execution of Convolution->Bias->ReLU sequences by calling the cudnnConvolutionBiasActivationForward() function instead of cudnnConvolutionForward() followed by a custom kernel for Bias+ReLU. This worked well for non-depthwise convolutions (execution time dropped), but it backfired for depthwise ones.
For depthwise convolutions, cudnnConvolutionBiasActivationForward() is at least two times slower (in most cases far more), no matter which algorithm I choose. Is this expected?
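(To substantiate the "no matter which algorithm" part: per-algorithm timings for the plain convolution can be gathered with cudnnFindConvolutionForwardAlgorithmEx. A minimal sketch, assuming the handle, descriptors and device buffers from init() in the benchmarking code below; algorithms that need more workspace than was allocated simply show up as failed:)
void listForwardAlgorithms()
{
    cudnnConvolutionFwdAlgoPerf_t perf[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
    int returned = 0;
    //runs every forward algorithm on the actual buffers and reports time/memory per algo
    ASSERT(cudnnFindConvolutionForwardAlgorithmEx(
        handle,
        inDataDesc, inputDevice,
        kernelDesc, kernelDevice,
        convDesc,
        outDataDesc, outputDevice,
        CUDNN_CONVOLUTION_FWD_ALGO_COUNT, &returned, perf,
        workspace, workspaceSize
    ));
    for (int i = 0; i < returned; ++i)
    {
        if (perf[i].status == CUDNN_STATUS_SUCCESS)
            std::cout << "algo " << perf[i].algo << ": " << perf[i].time << " ms" << std::endl;
    }
}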
My specs:
Windows with CUDA 10.2 and cuDNN 7.6.5.
GTX 1060 6GB (driver: 441.66)
My benchmarking code:
main.cpp
#include "kernels.h"
#include "cuda_runtime_api.h"
#include "cudnn.h"
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <vector>
#include <string>
#include <iostream>
void Assert(cudaError_t error, const char* file, int line)
{
    if (error != cudaSuccess)
    {
        std::cerr << cudaGetErrorName(error) << " " << file << ":" << std::to_string(line) << std::endl;
        std::abort();
    }
}
void Assert(cudnnStatus_t status, const char* file, int line)
{
    if (status != CUDNN_STATUS_SUCCESS)
    {
        std::cerr << cudnnGetErrorString(status) << " " << file << ":" << std::to_string(line) << std::endl;
        std::abort();
    }
}
std::vector<float> createRandomData(size_t count)
{
    std::vector<float> data(count);
    std::generate(begin(data), end(data), [](){
        return -1.f + 2.f * float(std::rand()) / RAND_MAX;
    });
    return data;
}
#define ASSERT(expr) Assert((expr), __FILE__, __LINE__)
constexpr auto batchSize = 50;
constexpr auto inTileSize = 112;
constexpr auto inChannels = 32;
constexpr auto inDataCount = batchSize * inTileSize * inTileSize * inChannels;
constexpr auto inDataBytes = inDataCount * sizeof(float);
constexpr auto outTileSize = 112;
constexpr auto outChannels = 32;
constexpr auto outDataCount = batchSize * outTileSize * outTileSize * outChannels;
constexpr auto outDataBytes = outDataCount * sizeof(float);
constexpr auto padding = 1;
constexpr auto stride = 1;
constexpr auto dilation = 1;
constexpr auto kernelSize = 3;
//kernelCount is sized for the regular (non-depthwise) filter; the depthwise filter set up in init() uses only the first outChannels * 1 * kernelSize * kernelSize values
constexpr auto kernelCount = inChannels * outChannels * kernelSize * kernelSize;
constexpr auto kernelBytes = kernelCount * sizeof(float);
constexpr auto groupCount = outChannels; //one group per channel, i.e. a depthwise convolution
constexpr auto biasBytes = outChannels * sizeof(float);
constexpr auto tensorFormat = CUDNN_TENSOR_NCHW;
constexpr auto dataType = CUDNN_DATA_FLOAT;
constexpr auto algorithm = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
constexpr auto alpha = 1.f;
constexpr auto beta = 0.f;
constexpr auto reluMaxValue = 6.f;
constexpr auto repeats = 100;
static cudnnHandle_t handle;
static cudnnTensorDescriptor_t inDataDesc;
static cudnnFilterDescriptor_t kernelDesc;
static cudnnTensorDescriptor_t biasDesc;
static cudnnTensorDescriptor_t outDataDesc;
static cudnnActivationDescriptor_t reluDesc;
static cudnnConvolutionDescriptor_t convDesc;
static size_t workspaceSize;
static float* inputDevice;
static float* kernelDevice;
static float* outputDevice;
static float* biasDevice;
static void* workspace;
void init()
{
    ASSERT(cudnnCreate(&handle));
    ASSERT(cudnnCreateTensorDescriptor(&inDataDesc));
    ASSERT(cudnnSetTensor4dDescriptor(inDataDesc, tensorFormat, dataType, batchSize, inChannels, inTileSize, inTileSize));
    ASSERT(cudaMalloc(&inputDevice, inDataBytes));
    ASSERT(cudaMemcpy(inputDevice, createRandomData(inDataCount).data(), inDataBytes, cudaMemcpyHostToDevice));
    ASSERT(cudnnCreateFilterDescriptor(&kernelDesc));
    //regular (non-depthwise) filter:
    //ASSERT(cudnnSetFilter4dDescriptor(kernelDesc, dataType, tensorFormat, outChannels, inChannels, kernelSize, kernelSize));
    //depthwise filter: one input channel per group
    ASSERT(cudnnSetFilter4dDescriptor(kernelDesc, dataType, tensorFormat, outChannels, 1, kernelSize, kernelSize));
    ASSERT(cudaMalloc(&kernelDevice, kernelBytes));
    ASSERT(cudaMemcpy(kernelDevice, createRandomData(kernelCount).data(), kernelBytes, cudaMemcpyHostToDevice));
    ASSERT(cudnnCreateConvolutionDescriptor(&convDesc));
    ASSERT(cudnnSetConvolution2dDescriptor(convDesc,
        padding, padding,
        stride, stride,
        dilation, dilation,
        CUDNN_CROSS_CORRELATION,
        dataType
    ));
    ASSERT(cudnnSetConvolutionGroupCount(convDesc, groupCount));
    ASSERT(cudnnCreateTensorDescriptor(&outDataDesc));
    ASSERT(cudnnSetTensor4dDescriptor(outDataDesc, tensorFormat, dataType, batchSize, outChannels, outTileSize, outTileSize));
    ASSERT(cudnnGetConvolutionForwardWorkspaceSize(
        handle,
        inDataDesc,
        kernelDesc,
        convDesc,
        outDataDesc,
        algorithm,
        &workspaceSize
    ));
    ASSERT(cudaMalloc(&workspace, workspaceSize));
    ASSERT(cudnnCreateTensorDescriptor(&biasDesc));
    ASSERT(cudnnSetTensor4dDescriptor(biasDesc, tensorFormat, dataType, 1, outChannels, 1, 1));
    ASSERT(cudaMalloc(&biasDevice, biasBytes));
    ASSERT(cudaMemcpy(biasDevice, createRandomData(outChannels).data(), biasBytes, cudaMemcpyHostToDevice));
    ASSERT(cudnnCreateActivationDescriptor(&reluDesc));
    ASSERT(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, double(reluMaxValue)));
    ASSERT(cudaMalloc(&outputDevice, outDataBytes));
}
void cleanup()
{
    ASSERT(cudaFree(inputDevice));
    ASSERT(cudaFree(kernelDevice));
    ASSERT(cudaFree(outputDevice));
    ASSERT(cudaFree(workspace));
    ASSERT(cudaFree(biasDevice));
    ASSERT(cudnnDestroyTensorDescriptor(outDataDesc));
    ASSERT(cudnnDestroyActivationDescriptor(reluDesc));
    ASSERT(cudnnDestroyTensorDescriptor(biasDesc));
    ASSERT(cudnnDestroyConvolutionDescriptor(convDesc));
    ASSERT(cudnnDestroyFilterDescriptor(kernelDesc));
    ASSERT(cudnnDestroyTensorDescriptor(inDataDesc));
    ASSERT(cudnnDestroy(handle));
}
int main()
{
    init();
    const auto start = std::chrono::high_resolution_clock::now();
    for (auto i = 0; i < repeats; ++i)
    {
#if 0
        //conv separated from bias+relu
        ASSERT(cudnnConvolutionForward(
            handle,
            &alpha,
            inDataDesc, inputDevice,
            kernelDesc, kernelDevice,
            convDesc,
            algorithm,
            workspace, workspaceSize,
            &beta,
            outDataDesc, outputDevice
        ));
        ASSERT(Bias_ReLU(outputDevice, biasDevice, batchSize, outChannels, outTileSize, outTileSize, reluMaxValue));
#else
        //conv fused with bias and relu: y = relu(alpha * conv(x) + beta * z + bias)
        //beta is 0 here, so the z term (the output buffer passed a second time) is ignored
        ASSERT(cudnnConvolutionBiasActivationForward(
            handle,
            &alpha,
            inDataDesc, inputDevice,
            kernelDesc, kernelDevice,
            convDesc,
            algorithm,
            workspace, workspaceSize,
            &beta,
            outDataDesc, outputDevice,
            biasDesc, biasDevice,
            reluDesc,
            outDataDesc, outputDevice
        ));
#endif
    }
    ASSERT(cudaDeviceSynchronize());
    const auto duration = std::chrono::high_resolution_clock::now() - start;
    std::cout << "Total:" << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count() << std::endl;
    std::cout << "Avg:" << std::chrono::duration_cast<std::chrono::milliseconds>(duration / repeats).count() << std::endl;
    cleanup();
}
kernels.h
#pragma once
#include <stdint.h>
#include <cuda_runtime_api.h>
cudaError_t Bias_ReLU(float* data, const float* bias, int n, int c, int h, int w, float reluMaxValue);
kernels.cu
#include "kernels.h"
#include <stdint.h>
#include <math.h>
constexpr auto BlockSize = 256u;
__global__ void Bias_ReLU_k(float* data, int64_t N, const float* bias, int64_t planeArea, int channels, float reluMaxValue)
{
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;
    //grid-stride loop over all elements of the NCHW tensor
    for (int64_t i = index; i < N; i += stride)
    {
        const auto effectiveChannelIdx = i / planeArea;
        const auto channelIdx = effectiveChannelIdx % channels;
        //bias add followed by a clipped ReLU (clamp to [0, reluMaxValue])
        auto tmp = data[i] + bias[channelIdx];
        tmp = fminf(tmp, reluMaxValue);
        data[i] = fmaxf(tmp, 0.f);
    }
}
cudaError_t Bias_ReLU(float* data, const float* bias, int n, int c, int h, int w, float reluMaxValue)
{
    const auto imageArea = h * w;
    const auto N = n * c * imageArea;
    const auto blocksCount = (N + BlockSize - 1) / BlockSize;
    Bias_ReLU_k<<<blocksCount, BlockSize>>>(data, N, bias, imageArea, c, reluMaxValue);
    return cudaGetLastError();
}