cudnnBatchNormalizationForwardTrainingEx is returning CUDNN_STATUS_NOT_SUPPORTED even though, as far as I can see, everything is correct and supported. I'm using cuDNN 8.7 on a Titan V.
ConvLayer.h:
#pragma once
#include <cudnn.h>
#include <cublas_v2.h>
// Convolution + batch-norm + ReLU layer: FP16 data/weights, FP32 BN parameters.
// Buffers live on the device; the class owns them (see destructor).
class ConvLayer{
public:
// width/height/inputSize are in-out: they enter as the input dimensions and
// leave as this layer's output dimensions / element count.
// NOTE(review): "bitchSize" is presumably a typo for batchSize; the member
// name is kept so the .cpp stays consistent — rename in one sweep later.
ConvLayer(cudnnHandle_t cudnnHandle, cublasHandle_t cublasHandle, int batchSize, int inputChannels, int outputChannels, int kernelSize, int stride, int padding, int* width, int* height, int* inputSize);
~ConvLayer();
// Non-copyable: members hold raw device allocations and cuDNN descriptors
// that the destructor frees; a copy would cause double-free (rule of three).
ConvLayer(const ConvLayer&) = delete;
ConvLayer& operator=(const ConvLayer&) = delete;
__half* forward(const __half* input);
__half* backward(const __half* gradInput) const;
void updateParameters(float learningRate) const;
cudnnHandle_t cudnnHandle_;
cublasHandle_t cublasHandle_;
cudnnTensorDescriptor_t inputDesc_, outputDesc_;
cudnnFilterDescriptor_t filterDesc_;
cudnnConvolutionDescriptor_t convDesc_;
cudnnActivationDescriptor_t activDesc_;
cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc_;  // derived 1 x C x 1 x 1 (spatial BN)
cudnnConvolutionFwdAlgo_t fwdAlgo_;
cudnnConvolutionBwdFilterAlgo_t bwdFilterAlgo_;
cudnnConvolutionBwdDataAlgo_t bwdDataAlgo_;
int bitchSize_;
int outputChannels_;
size_t outputSize_;            // N*C*H*W element count of the output tensor
size_t numWeights_ = 0;        // outputChannels * inputChannels * k * k
const __half* input_;          // cached by forward() for use in backward()
__half* output_;
__half* weights_;
__half* gradOutput_;           // dx buffer, sized to the *input* tensor
__half* gradWeights_;
__half* gradBias_;
float* bnScale_;
float* bnBias_;
float* gradBnScale_;
float* gradBnBias_;
float* bnRunningMean_;
float* bnRunningVar_;
float* bnSavedMean_;           // per-batch mean cached for BN backward
float* bnSavedInvVariance_;    // per-batch 1/sqrt(var+eps) cached for BN backward
float epsilon_ = 1e-5f;        // float literal: avoids double->float narrowing
size_t workspaceSize_ = 0;
void* workspace_;
size_t reserveSpaceSize_ = 0;  // reserve space for the fused BN+activation Ex API
void* reserveSpace_;
};
ConvLayer.cpp:
#include "ConvLayer.h"
#include "common.h"
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>
// Builds one conv + BN + ReLU layer. *width/*height are rewritten in place to
// the convolution output dimensions and *inputSize to this layer's output
// element count, so the caller can chain layers.
ConvLayer::ConvLayer(cudnnHandle_t cudnnHandle, cublasHandle_t cublasHandle, int batchSize, int inputChannels, int outputChannels, int kernelSize, int stride, int padding, int* width, int* height, int* inputSize) : cudnnHandle_(
cudnnHandle), cublasHandle_(cublasHandle), fwdAlgo_(), bwdFilterAlgo_(), bwdDataAlgo_(), bitchSize_(batchSize), outputChannels_(outputChannels), input_(nullptr), output_(nullptr), weights_(nullptr), bnScale_(nullptr), bnBias_(nullptr){
checkCUDNN(cudnnCreateTensorDescriptor(&inputDesc_));
checkCUDNN(cudnnCreateTensorDescriptor(&outputDesc_));
checkCUDNN(cudnnCreateFilterDescriptor(&filterDesc_));
checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc_));
checkCUDNN(cudnnCreateActivationDescriptor(&activDesc_));
checkCUDNN(cudnnCreateTensorDescriptor(&bnScaleBiasMeanVarDesc_)); // Batch normalization tensor descriptor
checkCUDNN(cudnnSetActivationDescriptor(activDesc_, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0));
// NOTE(review): the fused CUDNN_BATCHNORM_OPS_BN_ACTIVATION path of the
// *TrainingEx/*BackwardEx APIs is only supported for NHWC-packed __half
// tensors (channels a multiple of 4). With these NCHW descriptors the
// forward call fails with CUDNN_STATUS_NOT_SUPPORTED — either switch the
// tensor/filter layouts to CUDNN_TENSOR_NHWC, or use CUDNN_BATCHNORM_OPS_BN
// plus a separate cudnnActivationForward.
checkCUDNN(cudnnSetTensor4dDescriptor(inputDesc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, bitchSize_, inputChannels, *height, *width));
checkCUDNN(cudnnSetFilter4dDescriptor(filterDesc_, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, outputChannels, inputChannels, kernelSize, kernelSize));
checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc_, padding, padding, stride, stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_HALF));
int n, c;
checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc_, inputDesc_, filterDesc_, &n, &c, height, width));
checkCUDNN(cudnnSetTensor4dDescriptor(outputDesc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, n, c, *height, *width));
checkCUDNN(cudnnDeriveBNTensorDescriptor(bnScaleBiasMeanVarDesc_, outputDesc_, CUDNN_BATCHNORM_SPATIAL));
outputSize_ = n * c * *height * *width;
auto outSizeBytes = outputSize_ * sizeof(__half);
const auto fanIn = inputChannels * kernelSize * kernelSize;
numWeights_ = outputChannels * fanIn;
checkCUDA(cudaMalloc(&weights_, numWeights_ * sizeof(__half)));
// BUG FIX: was cudaMalloc(&gradWeights_, numWeights_) — missing the element
// size, so cudnnConvolutionBackwardFilter wrote past the allocation.
checkCUDA(cudaMalloc(&gradWeights_, numWeights_ * sizeof(__half)));
// NOTE(review): a conv bias gradient is one value per output channel
// (outputChannels * sizeof(__half)); kept at output size only because
// backward() currently (incorrectly) uses outputDesc_ as the db descriptor.
checkCUDA(cudaMalloc(&gradBias_, outSizeBytes));
// NOTE(review): with CUDNN_BATCHNORM_SPATIAL the BN parameter tensor is
// 1 x C x 1 x 1, so only outputChannels floats per buffer are meaningful.
// The allocations are kept oversized so code that iterates outputSize_
// elements stays in bounds; shrink them once all users agree on C.
auto bnSizeBytes = outputSize_ * sizeof(float);
checkCUDA(cudaMalloc(&bnScale_, bnSizeBytes));
checkCUDA(cudaMalloc(&bnBias_, bnSizeBytes));
checkCUDA(cudaMalloc(&gradBnScale_, bnSizeBytes));
checkCUDA(cudaMalloc(&gradBnBias_, bnSizeBytes));
checkCUDA(cudaMalloc(&bnRunningMean_, bnSizeBytes));
checkCUDA(cudaMalloc(&bnRunningVar_, bnSizeBytes));
checkCUDA(cudaMalloc(&bnSavedMean_, bnSizeBytes));
checkCUDA(cudaMalloc(&bnSavedInvVariance_, bnSizeBytes));
checkCUDA(cudaMalloc(&output_, outSizeBytes));
// dx buffer has the *input* tensor's element count (read before the
// overwrite of *inputSize below).
checkCUDA(cudaMalloc(&gradOutput_, *inputSize * sizeof(__half)));
HeInit(weights_, numWeights_, sqrtf(2.0f / fanIn));
// BN scale and the running variance start at 1, everything else at 0.
const std::vector<__half> osOnes(outputSize_, __float2half(1.0f));
checkCUDA(cudaMemcpy(bnScale_, osOnes.data(), outSizeBytes, cudaMemcpyHostToDevice));
checkCUDA(cudaMemcpy(bnRunningVar_, osOnes.data(), outSizeBytes, cudaMemcpyHostToDevice));
checkCUDA(cudaMemset(bnBias_, 0, outSizeBytes));
checkCUDA(cudaMemset(bnRunningMean_, 0, outSizeBytes));
checkCUDA(cudaMemset(bnSavedMean_, 0, outSizeBytes));
checkCUDA(cudaMemset(bnSavedInvVariance_, 0, outSizeBytes));
checkCUDA(cudaMemset(gradOutput_, 0, *inputSize * sizeof(__half)));
cudnnConvolutionFwdAlgoPerf_t fwdAlgoPerf[10];
cudnnConvolutionBwdFilterAlgoPerf_t bwdFilterAlgoPerf[10];
cudnnConvolutionBwdDataAlgoPerf_t bwdDataAlgoPerf[10];
// Select forward algorithm (heuristic _v7 query; [0] is the best candidate).
int returnedAlgoCount;
checkCUDNN(cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_, inputDesc_, filterDesc_, convDesc_, outputDesc_, 10, &returnedAlgoCount, fwdAlgoPerf));
fwdAlgo_ = fwdAlgoPerf[0].algo;
const size_t fwdWorkspaceSize = fwdAlgoPerf[0].memory;
// Select backward filter algorithm
checkCUDNN(cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_, inputDesc_, outputDesc_, convDesc_, filterDesc_, 10, &returnedAlgoCount, bwdFilterAlgoPerf));
bwdFilterAlgo_ = bwdFilterAlgoPerf[0].algo;
const size_t bwdFilterWorkspaceSize = bwdFilterAlgoPerf[0].memory;
// Select backward data algorithm
checkCUDNN(cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_, filterDesc_, outputDesc_, convDesc_, inputDesc_, 10, &returnedAlgoCount, bwdDataAlgoPerf));
bwdDataAlgo_ = bwdDataAlgoPerf[0].algo;
const size_t bwdDataWorkspaceSize = bwdDataAlgoPerf[0].memory;
// One shared workspace sized for the worst of the three conv algorithms.
workspaceSize_ = std::max(fwdWorkspaceSize, std::max(bwdFilterWorkspaceSize, bwdDataWorkspaceSize));
checkCUDA(cudaMalloc(&workspace_, workspaceSize_));
// BUG FIX: the reserve space must be queried for the tensor BN actually
// normalizes — the convolution *output* — not the layer input. An
// undersized reserve space makes the TrainingEx call fail.
checkCUDNN(cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_, CUDNN_BATCHNORM_SPATIAL, CUDNN_BATCHNORM_OPS_BN_ACTIVATION, activDesc_, outputDesc_, &reserveSpaceSize_));
checkCUDA(cudaMalloc(&reserveSpace_, reserveSpaceSize_));
*inputSize = outputSize_;
}
// Releases all owned device memory, then the cuDNN descriptors.
// cudaFree(nullptr) is a no-op, so a partially-constructed layer is safe.
ConvLayer::~ConvLayer(){
// Activations and gradients.
cudaFree(output_);
cudaFree(gradOutput_);
// Convolution parameters.
cudaFree(weights_);
cudaFree(gradWeights_);
cudaFree(gradBias_);
// Batch-norm parameters and statistics.
cudaFree(bnScale_);
cudaFree(bnBias_);
cudaFree(gradBnScale_);
cudaFree(gradBnBias_);
cudaFree(bnRunningMean_);
cudaFree(bnRunningVar_);
cudaFree(bnSavedMean_);
cudaFree(bnSavedInvVariance_);
// Scratch space shared by the cuDNN calls.
cudaFree(workspace_);
cudaFree(reserveSpace_);
// Descriptors last, after every call that might use them is done.
checkCUDNN(cudnnDestroyTensorDescriptor(inputDesc_));
checkCUDNN(cudnnDestroyTensorDescriptor(outputDesc_));
checkCUDNN(cudnnDestroyFilterDescriptor(filterDesc_));
checkCUDNN(cudnnDestroyConvolutionDescriptor(convDesc_));
checkCUDNN(cudnnDestroyActivationDescriptor(activDesc_));
checkCUDNN(cudnnDestroyTensorDescriptor(bnScaleBiasMeanVarDesc_));
}
// Forward pass: convolution, then fused batch-norm + ReLU (training mode).
// Caches the input pointer for backward(); returns the device output buffer.
__half* ConvLayer::forward(const __half* input){
input_ = input;
const float alpha = 1.0f;
const float beta0 = 0.0f;
// BUG FIX: beta must be 0 here. output_ is never zero-initialized, so the
// original beta = 1 accumulated the convolution result onto uninitialized
// device memory, producing garbage activations.
checkCUDNN(cudnnConvolutionForward(cudnnHandle_, &alpha, inputDesc_, input, filterDesc_, weights_, convDesc_, fwdAlgo_, workspace_, workspaceSize_, &beta0, outputDesc_, output_));
// NOTE(review): cuDNN supports the fused CUDNN_BATCHNORM_OPS_BN_ACTIVATION
// path of this Ex API only for NHWC-packed __half tensors; with the NCHW
// descriptors built in the constructor this call returns
// CUDNN_STATUS_NOT_SUPPORTED — the error the layer currently hits.
// NOTE(review): BN runs in place (x == y == output_), which destroys the
// pre-BN conv output that cudnnBatchNormalizationBackwardEx needs as its x
// argument; backward() cannot produce correct gradients until x is kept in
// a separate buffer.
checkCUDNN(cudnnBatchNormalizationForwardTrainingEx(cudnnHandle_, CUDNN_BATCHNORM_SPATIAL, CUDNN_BATCHNORM_OPS_BN_ACTIVATION, &alpha, &beta0, outputDesc_, output_, nullptr, nullptr, outputDesc_, output_, bnScaleBiasMeanVarDesc_,
bnScale_, bnBias_, 1.0, bnRunningMean_, bnRunningVar_, epsilon_, bnSavedMean_, bnSavedInvVariance_, activDesc_, workspace_, workspaceSize_, reserveSpace_, reserveSpaceSize_));
return output_;
}
// Backward pass: BN+ReLU backward, then the three convolution backward calls.
// Returns dx (gradient w.r.t. the layer input) in gradOutput_.
__half* ConvLayer::backward(const __half* gradInput) const{
const float alpha = 1.0f;
const float beta = 0.0f;
// BUG FIX: the BN mode must match forward() and the descriptor derived in
// the constructor — CUDNN_BATCHNORM_SPATIAL. The original passed
// CUDNN_BATCHNORM_PER_ACTIVATION here, which disagrees with the 1 x C x 1 x 1
// scale/bias descriptor and is rejected by cuDNN.
checkCUDNN(cudnnBatchNormalizationBackwardEx(
cudnnHandle_,
CUDNN_BATCHNORM_SPATIAL,
CUDNN_BATCHNORM_OPS_BN_ACTIVATION,
&alpha, &beta, &alpha, &beta,
// NOTE(review): x must be the *pre-BN convolution output*
// (outputDesc_-shaped), not the layer input. forward() also overwrote
// the real x by running BN in place — keep a separate pre-BN buffer.
inputDesc_, input_, // x (input to the forward pass)
outputDesc_, output_, // y (output from forward pass)
outputDesc_, gradInput, // dy (backpropagated gradient input)
nullptr, nullptr, // z and dz not used
// NOTE(review): BN's dx is outputDesc_-shaped, yet it is written here
// through inputDesc_ and then overwritten by cudnnConvolutionBackwardData
// below — the BN gradient never reaches the convolution backward calls.
inputDesc_, gradOutput_,// dx (gradient output)
bnScaleBiasMeanVarDesc_,// dBnScaleBiasDesc
bnScale_, // bnScaleData (batch normalization scale parameter)
bnBias_, // bnBiasData (batch normalization bias parameter)
gradBnScale_, // dBnScaleData (gradients of bnScaleData)
gradBnBias_, // dBnBiasData (gradients of bnBiasData)
epsilon_,
bnSavedMean_, bnSavedInvVariance_, // per-batch stats cached by forward()
activDesc_,
workspace_, workspaceSize_,
reserveSpace_, reserveSpaceSize_));
// NOTE(review): both conv backward calls consume the raw dy; once the BN
// backward above is wired correctly they should consume BN's dx instead.
checkCUDNN(cudnnConvolutionBackwardFilter(cudnnHandle_, &alpha, inputDesc_, input_, outputDesc_, gradInput, convDesc_, bwdFilterAlgo_, workspace_, workspaceSize_, &beta, filterDesc_, gradWeights_));
// NOTE(review): cuDNN expects the db descriptor to be 1 x C x 1 x 1 (one
// value per output channel), not the full outputDesc_.
checkCUDNN(cudnnConvolutionBackwardBias(cudnnHandle_, &alpha, outputDesc_, gradInput, &beta, outputDesc_, gradBias_));
checkCUDNN(cudnnConvolutionBackwardData(cudnnHandle_, &alpha, filterDesc_, weights_, outputDesc_, gradInput, convDesc_, bwdDataAlgo_, workspace_, workspaceSize_, &beta, inputDesc_, gradOutput_));
return gradOutput_;
}
void ConvLayer::updateParameters(float learningRate) const{
learningRate = -learningRate;
// Update weights: W = W - learningRate*gradWeights
SGDHalf(weights_, learningRate, gradWeights_, numWeights_);
// Update biases (dense layer): bias = bias - learningRate*gradBias
SGDHalf(output_, learningRate, gradBias_, outputSize_);
// Update batch normalization scale: bnScale = bnScale - learningRate*gradBnScale
SGDFloat(bnScale_, learningRate, gradBnScale_, outputSize_);
// Update batch normalization bias: bnBias = bnBias - learningRate*gradBnBias
SGDFloat(bnBias_, learningRate, gradBnBias_, outputSize_);
}
Massive thanks for any help.