cudnnBatchNormalizationForwardTrainingEx CUDNN_STATUS_NOT_SUPPORTED

cudnnBatchNormalizationForwardTrainingEx returns CUDNN_STATUS_NOT_SUPPORTED even though, as far as I can tell, every argument is valid and supported. This is with cuDNN 8.7 on a Titan V (compute capability 7.0).
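
To make this easier to poke at without the rest of my code, here is a stripped-down sketch of just the failing call. The sizes N=16, C=64, H=W=32 are arbitrary, error checking is reduced to one macro, and the data buffers are deliberately left uninitialized, since only the return status matters here:

#include <cudnn.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdio>
#include <cstdlib>
#define CHECK(call) do { cudnnStatus_t s = (call); if (s != CUDNN_STATUS_SUCCESS) { \
	std::printf("%s -> %s\n", #call, cudnnGetErrorString(s)); std::exit(1); } } while (0)
int main(){
	cudnnHandle_t handle;
	CHECK(cudnnCreate(&handle));
	const int n = 16, c = 64, h = 32, w = 32;	// arbitrary sizes
	cudnnTensorDescriptor_t xDesc, bnDesc;
	CHECK(cudnnCreateTensorDescriptor(&xDesc));
	CHECK(cudnnCreateTensorDescriptor(&bnDesc));
	CHECK(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, n, c, h, w));
	CHECK(cudnnDeriveBNTensorDescriptor(bnDesc, xDesc, CUDNN_BATCHNORM_SPATIAL));
	cudnnActivationDescriptor_t act;
	CHECK(cudnnCreateActivationDescriptor(&act));
	CHECK(cudnnSetActivationDescriptor(act, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0));
	// Query workspace and reserve-space sizes for the fused BN + activation op
	size_t wsBytes = 0, rsBytes = 0;
	CHECK(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(handle, CUDNN_BATCHNORM_SPATIAL, CUDNN_BATCHNORM_OPS_BN_ACTIVATION, xDesc, nullptr, xDesc, bnDesc, act, &wsBytes));
	CHECK(cudnnGetBatchNormalizationTrainingExReserveSpaceSize(handle, CUDNN_BATCHNORM_SPATIAL, CUDNN_BATCHNORM_OPS_BN_ACTIVATION, act, xDesc, &rsBytes));
	__half* x;
	float *scale, *bias, *rMean, *rVar, *sMean, *sVar;
	void *ws = nullptr, *rs = nullptr;
	cudaMalloc(&x, sizeof(__half) * n * c * h * w);
	cudaMalloc(&scale, sizeof(float) * c);
	cudaMalloc(&bias, sizeof(float) * c);
	cudaMalloc(&rMean, sizeof(float) * c);
	cudaMalloc(&rVar, sizeof(float) * c);
	cudaMalloc(&sMean, sizeof(float) * c);
	cudaMalloc(&sVar, sizeof(float) * c);
	if(wsBytes) cudaMalloc(&ws, wsBytes);
	if(rsBytes) cudaMalloc(&rs, rsBytes);
	const float one = 1.0f, zero = 0.0f;
	// In-place fused BN + ReLU with the same argument pattern as the layer below
	CHECK(cudnnBatchNormalizationForwardTrainingEx(handle, CUDNN_BATCHNORM_SPATIAL, CUDNN_BATCHNORM_OPS_BN_ACTIVATION, &one, &zero, xDesc, x, nullptr, nullptr, xDesc, x, bnDesc, scale, bias, 1.0, rMean, rVar, 1e-5, sMean, sVar, act, ws, wsBytes, rs, rsBytes));
	std::printf("fused BN forward succeeded\n");
	return 0;
}
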
ConvLayer.h:

#pragma once
#include <cudnn.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
class ConvLayer{
public:
	ConvLayer(cudnnHandle_t cudnnHandle, cublasHandle_t cublasHandle, int batchSize, int inputChannels, int outputChannels, int kernelSize, int stride, int padding, int* width, int* height, int* inputSize);
	~ConvLayer();
	__half* forward(const __half* input);
	__half* backward(const __half* gradInput) const;
	void updateParameters(float learningRate) const;
	cudnnHandle_t cudnnHandle_;
	cublasHandle_t cublasHandle_;
	cudnnTensorDescriptor_t inputDesc_, outputDesc_;
	cudnnFilterDescriptor_t filterDesc_;
	cudnnConvolutionDescriptor_t convDesc_;
	cudnnActivationDescriptor_t activDesc_;
	cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc_;
	cudnnConvolutionFwdAlgo_t fwdAlgo_;
	cudnnConvolutionBwdFilterAlgo_t bwdFilterAlgo_;
	cudnnConvolutionBwdDataAlgo_t bwdDataAlgo_;
	int batchSize_;
	int outputChannels_;
	size_t outputSize_;
	size_t numWeights_ = 0;
	const __half* input_;
	__half* output_;
	__half* weights_;
	__half* gradOutput_;
	__half* gradWeights_;
	__half* gradBias_;
	float* bnScale_;
	float* bnBias_;
	float* gradBnScale_;
	float* gradBnBias_;
	float* bnRunningMean_;
	float* bnRunningVar_;
	float* bnSavedMean_;
	float* bnSavedInvVariance_;
	float epsilon_ = 1e-5f;
	size_t workspaceSize_ = 0;
	void* workspace_;
	size_t reserveSpaceSize_ = 0;
	void* reserveSpace_;
};
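
Note that width, height and inputSize are in/out parameters: the constructor overwrites them with the layer's output dimensions so the next layer can be chained off them. Hypothetical usage (all sizes made up):

	int width = 32, height = 32;
	int inputSize = 16 * 3 * 32 * 32;	// N*C*H*W of this layer's input
	ConvLayer conv(cudnnHandle, cublasHandle, /*batchSize*/ 16, /*inputChannels*/ 3, /*outputChannels*/ 64, /*kernelSize*/ 3, /*stride*/ 1, /*padding*/ 1, &width, &height, &inputSize);
	// width/height/inputSize now describe conv's output tensor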

ConvLayer.cpp:

#include "ConvLayer.h"
#include "common.h"
#include <iostream>
#include <algorithm>
#include <vector>
#include <cmath>
ConvLayer::ConvLayer(cudnnHandle_t cudnnHandle, cublasHandle_t cublasHandle, int batchSize, int inputChannels, int outputChannels, int kernelSize, int stride, int padding, int* width, int* height, int* inputSize) : cudnnHandle_(
		cudnnHandle), cublasHandle_(cublasHandle), fwdAlgo_(), bwdFilterAlgo_(), bwdDataAlgo_(), batchSize_(batchSize), outputChannels_(outputChannels), input_(nullptr), output_(nullptr), weights_(nullptr), bnScale_(nullptr), bnBias_(nullptr){
	checkCUDNN(cudnnCreateTensorDescriptor(&inputDesc_));
	checkCUDNN(cudnnCreateTensorDescriptor(&outputDesc_));
	checkCUDNN(cudnnCreateFilterDescriptor(&filterDesc_));
	checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc_));
	checkCUDNN(cudnnCreateActivationDescriptor(&activDesc_));
	checkCUDNN(cudnnCreateTensorDescriptor(&bnScaleBiasMeanVarDesc_)); // Batch normalization tensor descriptor
	checkCUDNN(cudnnSetActivationDescriptor(activDesc_, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0));
	checkCUDNN(cudnnSetTensor4dDescriptor(inputDesc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, batchSize_, inputChannels, *height, *width));
	checkCUDNN(cudnnSetFilter4dDescriptor(filterDesc_, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, outputChannels, inputChannels, kernelSize, kernelSize));
	checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc_, padding, padding, stride, stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_HALF));
	int n, c;
	checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc_, inputDesc_, filterDesc_, &n, &c, height, width));
	checkCUDNN(cudnnSetTensor4dDescriptor(outputDesc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, n, c, *height, *width));
	checkCUDNN(cudnnDeriveBNTensorDescriptor(bnScaleBiasMeanVarDesc_, outputDesc_, CUDNN_BATCHNORM_SPATIAL));
	outputSize_ = static_cast<size_t>(n) * c * *height * *width;
	auto outSizeBytes = outputSize_ * sizeof(__half);
	const auto fanIn = inputChannels * kernelSize * kernelSize;
	numWeights_ = outputChannels * fanIn;
	checkCUDA(cudaMalloc(&weights_, numWeights_ * sizeof(__half)));
	checkCUDA(cudaMalloc(&gradWeights_, numWeights_ * sizeof(__half)));
	checkCUDA(cudaMalloc(&gradBias_, outSizeBytes));
	auto bnSizeBytes = outputSize_ * sizeof(float);
	checkCUDA(cudaMalloc(&bnScale_, bnSizeBytes));
	checkCUDA(cudaMalloc(&bnBias_, bnSizeBytes));
	checkCUDA(cudaMalloc(&gradBnScale_, bnSizeBytes));
	checkCUDA(cudaMalloc(&gradBnBias_, bnSizeBytes));
	checkCUDA(cudaMalloc(&bnRunningMean_, bnSizeBytes));
	checkCUDA(cudaMalloc(&bnRunningVar_, bnSizeBytes));
	checkCUDA(cudaMalloc(&bnSavedMean_, bnSizeBytes));
	checkCUDA(cudaMalloc(&bnSavedInvVariance_, bnSizeBytes));
	checkCUDA(cudaMalloc(&output_, outSizeBytes));
	checkCUDA(cudaMemset(output_, 0, outSizeBytes));	// output_ starts zeroed: the forward convolution accumulates into it with beta = 1
	checkCUDA(cudaMalloc(&gradOutput_, *inputSize * sizeof(__half)));
	HeInit(weights_, numWeights_, sqrtf(2.0f / fanIn));
	const std::vector<float> bnOnes(outputSize_, 1.0f);	// the bn buffers are float, so initialize with float ones
	checkCUDA(cudaMemcpy(bnScale_, bnOnes.data(), bnSizeBytes, cudaMemcpyHostToDevice));
	checkCUDA(cudaMemcpy(bnRunningVar_, bnOnes.data(), bnSizeBytes, cudaMemcpyHostToDevice));
	checkCUDA(cudaMemset(bnBias_, 0, bnSizeBytes));
	checkCUDA(cudaMemset(bnRunningMean_, 0, bnSizeBytes));
	checkCUDA(cudaMemset(bnSavedMean_, 0, bnSizeBytes));
	checkCUDA(cudaMemset(bnSavedInvVariance_, 0, bnSizeBytes));
	checkCUDA(cudaMemset(gradOutput_, 0, *inputSize * sizeof(__half)));
	cudnnConvolutionFwdAlgoPerf_t fwdAlgoPerf[10];
	cudnnConvolutionBwdFilterAlgoPerf_t bwdFilterAlgoPerf[10];
	cudnnConvolutionBwdDataAlgoPerf_t bwdDataAlgoPerf[10];
	// Select forward algorithm
	int returnedAlgoCount;
	checkCUDNN(cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_, inputDesc_, filterDesc_, convDesc_, outputDesc_, 10, &returnedAlgoCount, fwdAlgoPerf));
	fwdAlgo_ = fwdAlgoPerf[0].algo;
	const size_t fwdWorkspaceSize = fwdAlgoPerf[0].memory;
	// Select backward filter algorithm
	checkCUDNN(cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_, inputDesc_, outputDesc_, convDesc_, filterDesc_, 10, &returnedAlgoCount, bwdFilterAlgoPerf));
	bwdFilterAlgo_ = bwdFilterAlgoPerf[0].algo;
	const size_t bwdFilterWorkspaceSize = bwdFilterAlgoPerf[0].memory;
	// Select backward data algorithm
	checkCUDNN(cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_, filterDesc_, outputDesc_, convDesc_, inputDesc_, 10, &returnedAlgoCount, bwdDataAlgoPerf));
	bwdDataAlgo_ = bwdDataAlgoPerf[0].algo;
	const size_t bwdDataWorkspaceSize = bwdDataAlgoPerf[0].memory;
	workspaceSize_ = std::max(fwdWorkspaceSize, std::max(bwdFilterWorkspaceSize, bwdDataWorkspaceSize));
	// The fused BN + activation call has its own workspace requirement; make the shared workspace cover it too
	size_t bnWorkspaceSize = 0;
	checkCUDNN(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_, CUDNN_BATCHNORM_SPATIAL, CUDNN_BATCHNORM_OPS_BN_ACTIVATION, outputDesc_, nullptr, outputDesc_, bnScaleBiasMeanVarDesc_, activDesc_, &bnWorkspaceSize));
	workspaceSize_ = std::max(workspaceSize_, bnWorkspaceSize);
	checkCUDA(cudaMalloc(&workspace_, workspaceSize_));
	// The reserve space is queried with the descriptor the BN actually runs on (outputDesc_)
	checkCUDNN(cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_, CUDNN_BATCHNORM_SPATIAL, CUDNN_BATCHNORM_OPS_BN_ACTIVATION, activDesc_, outputDesc_, &reserveSpaceSize_));
	checkCUDA(cudaMalloc(&reserveSpace_, reserveSpaceSize_));
	*inputSize = outputSize_;
}
ConvLayer::~ConvLayer(){
	cudaFree(output_);
	cudaFree(weights_);
	cudaFree(gradOutput_);
	cudaFree(gradWeights_);
	cudaFree(gradBias_);
	cudaFree(bnScale_);
	cudaFree(bnBias_);
	cudaFree(gradBnScale_);
	cudaFree(gradBnBias_);
	cudaFree(bnRunningMean_);
	cudaFree(bnRunningVar_);
	cudaFree(bnSavedMean_);
	cudaFree(bnSavedInvVariance_);
	cudaFree(workspace_);
	cudaFree(reserveSpace_);
	checkCUDNN(cudnnDestroyTensorDescriptor(inputDesc_));
	checkCUDNN(cudnnDestroyTensorDescriptor(outputDesc_));
	checkCUDNN(cudnnDestroyFilterDescriptor(filterDesc_));
	checkCUDNN(cudnnDestroyConvolutionDescriptor(convDesc_));
	checkCUDNN(cudnnDestroyActivationDescriptor(activDesc_));
	checkCUDNN(cudnnDestroyTensorDescriptor(bnScaleBiasMeanVarDesc_));
}
__half* ConvLayer::forward(const __half* input){
	input_ = input;
	const float alpha = 1.0f;
	const float beta1 = 1.0f;	// accumulate: output_ also carries the per-element bias updated in updateParameters()
	const float beta0 = 0.0f;
	checkCUDNN(cudnnConvolutionForward(cudnnHandle_, &alpha, inputDesc_, input, filterDesc_, weights_, convDesc_, fwdAlgo_, workspace_, workspaceSize_, &beta1, outputDesc_, output_));
	// Fused BN + ReLU, applied in place on the convolution output; this is the call that fails
	checkCUDNN(cudnnBatchNormalizationForwardTrainingEx(cudnnHandle_, CUDNN_BATCHNORM_SPATIAL, CUDNN_BATCHNORM_OPS_BN_ACTIVATION, &alpha, &beta0, outputDesc_, output_, nullptr, nullptr, outputDesc_, output_, bnScaleBiasMeanVarDesc_,
		bnScale_, bnBias_, 1.0, bnRunningMean_, bnRunningVar_, epsilon_, bnSavedMean_, bnSavedInvVariance_, activDesc_, workspace_, workspaceSize_, reserveSpace_, reserveSpaceSize_));
	return output_;
}
__half* ConvLayer::backward(const __half* gradInput) const{
	const float alpha = 1.0f;
	const float beta = 0.0f;
	checkCUDNN(cudnnBatchNormalizationBackwardEx(
		cudnnHandle_,
		CUDNN_BATCHNORM_SPATIAL,	// same mode as the forward pass
		CUDNN_BATCHNORM_OPS_BN_ACTIVATION,
		&alpha, &beta, &alpha, &beta,
		inputDesc_, input_,		// x (input to the forward pass)
		outputDesc_, output_,	// y (output from forward pass)
		outputDesc_, gradInput,	// dy (backpropagated gradient input)
		nullptr, nullptr,		// dzDesc/dzData, only used with CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION
		inputDesc_, gradOutput_,// dx (gradient output)
		bnScaleBiasMeanVarDesc_,// dBnScaleBiasDesc
		bnScale_,				// bnScaleData (batch normalization scale parameter)
		bnBias_,				// bnBiasData (batch normalization bias parameter)
		gradBnScale_,			// dBnScaleData (gradients of bnScaleData)
		gradBnBias_,			// dBnBiasData (gradients of bnBiasData)
		epsilon_,
		bnSavedMean_, bnSavedInvVariance_,
		activDesc_,
		workspace_, workspaceSize_,
		reserveSpace_, reserveSpaceSize_));
	checkCUDNN(cudnnConvolutionBackwardFilter(cudnnHandle_, &alpha, inputDesc_, input_, outputDesc_, gradInput, convDesc_, bwdFilterAlgo_, workspace_, workspaceSize_, &beta, filterDesc_, gradWeights_));
	checkCUDNN(cudnnConvolutionBackwardBias(cudnnHandle_, &alpha, outputDesc_, gradInput, &beta, outputDesc_, gradBias_));
	checkCUDNN(cudnnConvolutionBackwardData(cudnnHandle_, &alpha, filterDesc_, weights_, outputDesc_, gradInput, convDesc_, bwdDataAlgo_, workspace_, workspaceSize_, &beta, inputDesc_, gradOutput_));
	return gradOutput_;
}
void ConvLayer::updateParameters(float learningRate) const{
	learningRate = -learningRate;	// negate once so the SGD kernels can simply add learningRate*grad
	// Update weights: W = W - learningRate*gradWeights
	SGDHalf(weights_, learningRate, gradWeights_, numWeights_);
	// Update the per-element bias carried in output_ (the forward convolution accumulates into it with beta = 1)
	SGDHalf(output_, learningRate, gradBias_, outputSize_);
	// Update batch normalization scale: bnScale = bnScale - learningRate*gradBnScale
	SGDFloat(bnScale_, learningRate, gradBnScale_, outputSize_);
	// Update batch normalization bias: bnBias = bnBias - learningRate*gradBnBias
	SGDFloat(bnBias_, learningRate, gradBnBias_, outputSize_);
}
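
common.h just holds the error-check macros and a few tiny custom kernels. A simplified sketch of it (kernel bodies omitted):

#pragma once
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cudnn.h>
#include <cuda_fp16.h>
#define checkCUDA(call) do { cudaError_t e = (call); if (e != cudaSuccess) { \
	std::fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(e), __FILE__, __LINE__); std::abort(); } } while (0)
#define checkCUDNN(call) do { cudnnStatus_t s = (call); if (s != CUDNN_STATUS_SUCCESS) { \
	std::fprintf(stderr, "cuDNN error %s at %s:%d\n", cudnnGetErrorString(s), __FILE__, __LINE__); std::abort(); } } while (0)
// Small custom kernels, defined in a .cu file:
void HeInit(__half* weights, size_t n, float stddev);					// Gaussian init with the given stddev
void SGDHalf(__half* param, float lr, const __half* grad, size_t n);	// param += lr * grad (lr is pre-negated)
void SGDFloat(float* param, float lr, const float* grad, size_t n);		// param += lr * grad (lr is pre-negated)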

Massive thanks for any help.