Slow cudnn Convolution Bias Activation Forward for grouped convolution

I was trying to optimize execution of Convolution->Bias->ReLU sequences by calling the cudnnConvolutionBiasActivationForward() function instead of cudnnConvolutionForward() followed by a custom kernel for Bias+ReLU. It went well for non-depthwise convolutions (execution time decreased) but backfired for depthwise ones.

Calling cudnnConvolutionBiasActivationForward() is at least two times slower (in most cases far more) no matter which algorithm I choose. Is this expected?
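
In case it is useful, this is roughly how I checked what cuDNN itself reports for the plain grouped convolution before picking an algorithm (just a sketch, not part of the benchmark below; listForwardAlgorithms is a made-up helper that reuses the handle, descriptors and ASSERT macro from init()):

void listForwardAlgorithms()
{
	// Let cuDNN benchmark every forward algorithm for the current descriptors
	// and report its own timing and workspace requirement per algorithm.
	cudnnConvolutionFwdAlgoPerf_t perf[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
	int returned = 0;
	ASSERT(cudnnFindConvolutionForwardAlgorithm(
		handle,
		inDataDesc, kernelDesc, convDesc, outDataDesc,
		CUDNN_CONVOLUTION_FWD_ALGO_COUNT, &returned, perf));

	for (int i = 0; i < returned; ++i)
	{
		if (perf[i].status == CUDNN_STATUS_SUCCESS)
			std::cout << "algo " << perf[i].algo << ": " << perf[i].time << " ms, "
				<< perf[i].memory << " bytes of workspace" << std::endl;
	}
}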

My specs:
Windows with CUDA 10.2 and cuDNN 7.6.5.
GTX 1060 6GB (driver: 441.66)

My benchmarking code:

main.cpp
#include "kernels.h"
#include "cuda_runtime_api.h"
#include "cudnn.h"
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <exception>
#include <vector>
#include <string>
#include <iostream>

void Assert(cudaError_t error, const char* file, int line)
{
	if (error != cudaSuccess)
	{
		std::cerr << cudaGetErrorName(error) << " " << file << ":" << std::to_string(line) << std::endl;
		std::abort();
	}
}

void Assert(cudnnStatus_t status, const char* file, int line)
{
	if (status != CUDNN_STATUS_SUCCESS)
	{
		std::cerr << cudnnGetErrorString(status) << " " << file << ":" << std::to_string(line) << std::endl;
		std::abort();
	}
}

std::vector<float> createRandomData(size_t count)
{
	std::vector<float> data(count);
	std::generate(begin(data), end(data), [](){
		return -1.f + 2.f * float(std::rand()) / RAND_MAX;
	});
	return data;
}

#define ASSERT(expr) Assert((expr), __FILE__, __LINE__)

constexpr auto batchSize = 50;

constexpr auto inTileSize = 112;
constexpr auto inChannels = 32;
constexpr auto inDataCount = batchSize * inTileSize * inTileSize * inChannels;
constexpr auto inDataBytes = inDataCount * sizeof(float);

constexpr auto outTileSize = 112;
constexpr auto outChannels = 32;
constexpr auto outDataCount = batchSize * outTileSize * outTileSize * outChannels;
constexpr auto outDataBytes = outDataCount * sizeof(float);

constexpr auto padding = 1;
constexpr auto stride = 1;
constexpr auto dilation = 1;

constexpr auto kernelSize = 3;
constexpr auto kernelCount = inChannels * outChannels * kernelSize * kernelSize; // sized for the non-grouped filter; more than enough for the depthwise (outChannels x 1) filter
constexpr auto kernelBytes = kernelCount * sizeof(float);

constexpr auto groupCount = outChannels;

constexpr auto biasBytes = outChannels * sizeof(float);

constexpr auto tensorFormat = CUDNN_TENSOR_NCHW;
constexpr auto dataType = CUDNN_DATA_FLOAT;

constexpr auto algorithm = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;

constexpr auto alpha = 1.f;
constexpr auto beta = 0.f;

constexpr auto reluMaxValue = 6.f;

constexpr auto repeats = 100;

static cudnnHandle_t handle;
static cudnnTensorDescriptor_t inDataDesc;
static cudnnFilterDescriptor_t kernelDesc;
static cudnnTensorDescriptor_t biasDesc;
static cudnnTensorDescriptor_t outDataDesc;
static cudnnActivationDescriptor_t reluDesc;
static cudnnConvolutionDescriptor_t convDesc;
static size_t workspaceSize;

static float* inputDevice;
static float* kernelDevice;
static float* outputDevice;
static float* biasDevice;
static void* workspace;

void init()
{
	ASSERT(cudnnCreate(&handle));

	ASSERT(cudnnCreateTensorDescriptor(&inDataDesc));
	ASSERT(cudnnSetTensor4dDescriptor(inDataDesc, tensorFormat, dataType, batchSize, inChannels, inTileSize, inTileSize));

	ASSERT(cudaMalloc(&inputDevice, inDataBytes));
	ASSERT(cudaMemcpy(inputDevice, createRandomData(inDataCount).data(), inDataBytes, cudaMemcpyHostToDevice));

	ASSERT(cudnnCreateFilterDescriptor(&kernelDesc));
	//ASSERT(cudnnSetFilter4dDescriptor(kernelDesc, dataType, tensorFormat, outChannels, inChannels, kernelSize, kernelSize));
	ASSERT(cudnnSetFilter4dDescriptor(kernelDesc, dataType, tensorFormat, outChannels, 1, kernelSize, kernelSize));

	ASSERT(cudaMalloc(&kernelDevice, kernelBytes));
	ASSERT(cudaMemcpy(kernelDevice, createRandomData(kernelCount).data(), kernelBytes, cudaMemcpyHostToDevice));

	ASSERT(cudnnCreateConvolutionDescriptor(&convDesc));
	ASSERT(cudnnSetConvolution2dDescriptor(convDesc,
		padding, padding,
		stride, stride,
		dilation, dilation,
		CUDNN_CROSS_CORRELATION,
		dataType
	));
	ASSERT(cudnnSetConvolutionGroupCount(convDesc, groupCount));

	ASSERT(cudnnCreateTensorDescriptor(&outDataDesc));
	ASSERT(cudnnSetTensor4dDescriptor(outDataDesc, tensorFormat, dataType, batchSize, outChannels, outTileSize, outTileSize));

	ASSERT(cudnnGetConvolutionForwardWorkspaceSize(
		handle,
		inDataDesc,
		kernelDesc,
		convDesc,
		outDataDesc,
		algorithm,
		&workspaceSize
	));
	ASSERT(cudaMalloc(&workspace, workspaceSize));

	ASSERT(cudnnCreateTensorDescriptor(&biasDesc));
	ASSERT(cudnnSetTensor4dDescriptor(biasDesc, tensorFormat, dataType, 1, outChannels, 1, 1));

	ASSERT(cudaMalloc(&biasDevice, biasBytes));
	ASSERT(cudaMemcpy(biasDevice, createRandomData(outChannels).data(), biasBytes, cudaMemcpyHostToDevice));

	ASSERT(cudnnCreateActivationDescriptor(&reluDesc));
	ASSERT(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, double(reluMaxValue)));

	ASSERT(cudaMalloc(&outputDevice, outDataBytes));
}

void cleanup()
{
	ASSERT(cudaFree(inputDevice));
	ASSERT(cudaFree(kernelDevice));
	ASSERT(cudaFree(outputDevice));
	ASSERT(cudaFree(workspace));
	ASSERT(cudaFree(biasDevice));

	ASSERT(cudnnDestroyTensorDescriptor(outDataDesc));
	ASSERT(cudnnDestroyActivationDescriptor(reluDesc));
	ASSERT(cudnnDestroyTensorDescriptor(biasDesc));
	ASSERT(cudnnDestroyConvolutionDescriptor(convDesc));
	ASSERT(cudnnDestroyFilterDescriptor(kernelDesc));
	ASSERT(cudnnDestroyTensorDescriptor(inDataDesc));
	ASSERT(cudnnDestroy(handle));
}

int main()
{
	init();

	const auto start = std::chrono::high_resolution_clock::now();
	for (auto i = 0; i < repeats; ++i)
	{
#if 0
		//conv separated from bias+relu
		ASSERT(cudnnConvolutionForward(
			handle,
			&alpha,
			inDataDesc, inputDevice,
			kernelDesc, kernelDevice,
			convDesc,
			algorithm,
			workspace, workspaceSize,
			&beta,
			outDataDesc, outputDevice
		));

		ASSERT(Bias_ReLU(outputDevice, biasDevice, batchSize, outChannels, outTileSize, outTileSize, reluMaxValue));
#else
		//conv fused with bias and relu
		ASSERT(cudnnConvolutionBiasActivationForward(
			handle,
			&alpha,
			inDataDesc, inputDevice,
			kernelDesc, kernelDevice,
			convDesc,
			algorithm,
			workspace, workspaceSize,
			&beta,
			outDataDesc, outputDevice,
			biasDesc, biasDevice,
			reluDesc,
			outDataDesc, outputDevice
		));
#endif
	}
	ASSERT(cudaDeviceSynchronize());
	const auto duration = std::chrono::high_resolution_clock::now() - start;
	std::cout << "Total:" << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count() << std::endl;
	std::cout << "Avg:" << std::chrono::duration_cast<std::chrono::milliseconds>(duration / repeats).count() << std::endl;

	cleanup();
}

kernels.h
#pragma once
#include <stdint.h>
#include <cuda_runtime_api.h>

cudaError_t Bias_ReLU(float* data, const float* bias, int n, int c, int h, int w, float reluMaxValue);

kernels.cu
#include "kernels.h"
#include <stdint.h>

#include <math.h>

constexpr auto BlockSize = 256u;

// Grid-stride loop: add the per-channel bias, then clamp to [0, reluMaxValue].
__global__ void Bias_ReLU_k(float* data, int64_t N, const float* bias, int64_t planeArea, int channels, float reluMaxValue)
{
	int index = blockIdx.x * blockDim.x + threadIdx.x;
	int stride = blockDim.x * gridDim.x;
	for (auto i = index; i < N; i += stride)
	{
		int effectiveChannelIdx = i / planeArea;
		auto channelIdx = effectiveChannelIdx % channels;
		auto tmp = data[i] + bias[channelIdx];
		tmp = fmin(tmp, reluMaxValue);
		data[i] = fmax(tmp, 0.f);
	}
}

cudaError_t Bias_ReLU(float* data, const float* bias, int n, int c, int h, int w, float reluMaxValue)
{
	const auto imageArea = h * w;
	const auto N = n * c * imageArea;
	auto blocksCount = (N + BlockSize - 1) / BlockSize;
	Bias_ReLU_k<<<blocksCount, BlockSize>>>(data, N, bias, imageArea, c, reluMaxValue);
	return cudaGetLastError();
}

Hi,

I would consider this expected.
cudnnConvolutionBiasActivationForward is likely selecting another depthwise kernel that is slower.
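
If you want to check this on your side: as far as I know there is no Find*-style helper for the fused call, so you would have to time cudnnConvolutionBiasActivationForward per algorithm yourself, roughly as below (only a sketch that reuses the descriptors, buffers and ASSERT macro from your benchmark; timeFusedCall is just an illustrative name). A negative return value means the algorithm was rejected for this configuration.

float timeFusedCall(cudnnConvolutionFwdAlgo_t algo, int iterations)
{
	size_t ws = 0;
	if (cudnnGetConvolutionForwardWorkspaceSize(handle, inDataDesc, kernelDesc,
			convDesc, outDataDesc, algo, &ws) != CUDNN_STATUS_SUCCESS)
		return -1.f; // algorithm not available for this configuration

	void* wsPtr = nullptr;
	ASSERT(cudaMalloc(&wsPtr, ws));

	cudaEvent_t start, stop;
	ASSERT(cudaEventCreate(&start));
	ASSERT(cudaEventCreate(&stop));
	ASSERT(cudaEventRecord(start));

	auto status = CUDNN_STATUS_SUCCESS;
	for (int i = 0; i < iterations && status == CUDNN_STATUS_SUCCESS; ++i)
	{
		// Same argument order as the fused call in your benchmark loop.
		status = cudnnConvolutionBiasActivationForward(
			handle, &alpha,
			inDataDesc, inputDevice,
			kernelDesc, kernelDevice,
			convDesc, algo, wsPtr, ws,
			&beta,
			outDataDesc, outputDevice,
			biasDesc, biasDevice,
			reluDesc,
			outDataDesc, outputDevice);
	}

	ASSERT(cudaEventRecord(stop));
	ASSERT(cudaEventSynchronize(stop));
	float ms = 0.f;
	ASSERT(cudaEventElapsedTime(&ms, start, stop));

	ASSERT(cudaEventDestroy(start));
	ASSERT(cudaEventDestroy(stop));
	ASSERT(cudaFree(wsPtr));

	return status == CUDNN_STATUS_SUCCESS ? ms / iterations : -1.f;
}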

Thanks

Well, that is disappointing.

Thank you for your time.