Convolution incompatible with NHWC

I’m trying to get my convolutional layer to work with NHWC but after days of trying every conceivable permutation of data types etc. I am unable to get a convolutional layer to train with NHWC data, this is my current code where cudnnConvolutionBackwardFilter returns CUDNN_STATUS_NOT_SUPPORTED:

ConvLayer::ConvLayer(cudnnHandle_t cudnnHandle, cublasHandle_t cublasHandle, int bitchSize, int inputChannels, int outputChannels, int filterSize, int stride, int padding, int* width, int* height, int* inputCHW, const char* layerName, bool train) : cudnnHandle_(cudnnHandle),
					cublasHandle_(cublasHandle), fwdAlgo_(), bwdFilterAlgo_(), bwdDataAlgo_(), batchSize_(bitchSize), inC_(inputChannels), inCHW_(*inputCHW), outC_(outputChannels), inData_(nullptr), outData_(nullptr), weights_(nullptr){
	layerName_ = layerName;
	train_ = train;
	int inWidth = *width, inHeight = *height, outWidth, outHeight;
	checkCUDNN(cudnnCreateTensorDescriptor(&inDesc_));
	checkCUDNN(cudnnCreateTensorDescriptor(&outDesc_));
	checkCUDNN(cudnnCreateFilterDescriptor(&filterDesc_));
	checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc_));
	checkCUDNN(cudnnCreateTensorDescriptor(&biasDesc_));
	checkCUDNN(cudnnSetTensor4dDescriptor(biasDesc_, CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, 1, outC_, 1, 1));
	checkCUDNN(cudnnSetTensor4dDescriptor(inDesc_, CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, batchSize_, inC_, inHeight, inWidth));
	checkCUDNN(cudnnSetFilter4dDescriptor(filterDesc_, CUDNN_DATA_HALF, CUDNN_TENSOR_NHWC, outC_, inC_, filterSize, filterSize));
	checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc_, padding, padding, stride, stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_HALF));
	checkCUDNN(cudnnSetConvolutionMathType(convDesc_, CUDNN_TENSOR_OP_MATH)); //S
	int n, c;
	checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc_, inDesc_, filterDesc_, &n, &c, &outHeight, &outWidth));
	checkCUDNN(cudnnSetTensor4dDescriptor(outDesc_, CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, batchSize_, outC_, outHeight, outWidth));
	outCHW_ = outC_*outHeight*outWidth;
	outNCHW_ = outCHW_*batchSize_;
	gradOutSize_ = inCHW_*batchSize_*sizeof(__half);
	const auto fanIn = inC_*filterSize*filterSize;
	weightCount_ = outC_*fanIn;
	checkCUDA(cudaMalloc(&inDataNCHW_, inCHW_*batchSize_*sizeof(__half)));
	checkCUDA(cudaMalloc(&outData_, outNCHW_*sizeof(__half)));
	checkCUDA(cudaMalloc(&weights_, weightCount_*sizeof(__half)));
	checkCUDA(cudaMalloc(&bias_, outC_*sizeof(__half)));
	checkCUDA(cudaMemset(outData_, 0, outNCHW_*sizeof(__half)));
	HeInit(weights_, weightCount_, fanIn);
	checkCUDA(cudaMemset(bias_, 0, outC_*sizeof(__half)));
	checkCUDA(cudaMemset(inDataNCHW_, 0, inCHW_*batchSize_*sizeof(__half)));
	if(train_){
		checkCUDNN(cudnnCreateTensorDescriptor(&outGradDesc_));
		checkCUDNN(cudnnCreateTensorDescriptor(&inGradDesc_));
		checkCUDNN(cudnnSetTensor4dDescriptor(outGradDesc_, CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, batchSize_, inC_, inHeight, inWidth));
		checkCUDNN(cudnnSetTensor4dDescriptor(inGradDesc_, CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, batchSize_, outC_, outHeight, outWidth));
		checkCUDA(cudaMalloc(&gradWeights_, weightCount_*sizeof(__half)));
		checkCUDA(cudaMalloc(&gradBias_, outC_*sizeof(__half)));
		checkCUDA(cudaMalloc(&gradOut_, gradOutSize_));
		checkCUDA(cudaMemset(gradWeights_, 0, weightCount_*sizeof(__half)));
		checkCUDA(cudaMemset(gradBias_, 0, outC_*sizeof(__half)));
		checkCUDA(cudaMemset(gradOut_, 0, gradOutSize_));
		if(useAdamW_){
			checkCUDA(cudaMalloc(&m_Weights_, weightCount_*sizeof(float)));
			checkCUDA(cudaMalloc(&v_Weights_, weightCount_*sizeof(float)));
			checkCUDA(cudaMalloc(&m_Bias_, outC_*sizeof(float)));
			checkCUDA(cudaMalloc(&v_Bias_, outC_*sizeof(float)));
			checkCUDA(cudaMemset(m_Weights_, 0, weightCount_*sizeof(float)));
			checkCUDA(cudaMemset(v_Weights_, 0, weightCount_*sizeof(float)));
			checkCUDA(cudaMemset(m_Bias_, 0, outC_*sizeof(float)));
			checkCUDA(cudaMemset(v_Bias_, 0, outC_*sizeof(float)));
		}
	}
	cudnnConvolutionFwdAlgoPerf_t fwdAlgoPerf[10];
	cudnnConvolutionBwdFilterAlgoPerf_t bwdFilterAlgoPerf[10];
	cudnnConvolutionBwdDataAlgoPerf_t bwdDataAlgoPerf[10];
	int returnedAlgoCount;
	checkCUDNN(cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_, inDesc_, filterDesc_, convDesc_, outDesc_, 10, &returnedAlgoCount, fwdAlgoPerf));
	fwdAlgo_ = fwdAlgoPerf[0].algo;
	const size_t fwdWorkspaceSize = fwdAlgoPerf[0].memory;
	checkCUDNN(cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_, inDesc_, outDesc_, convDesc_, filterDesc_, 10, &returnedAlgoCount, bwdFilterAlgoPerf));
	bwdFilterAlgo_ = bwdFilterAlgoPerf[0].algo;
	const size_t bwdFilterWorkspaceSize = bwdFilterAlgoPerf[0].memory;
	checkCUDNN(cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_, filterDesc_, outDesc_, convDesc_, inDesc_, 10, &returnedAlgoCount, bwdDataAlgoPerf));
	bwdDataAlgo_ = bwdDataAlgoPerf[0].algo;
	const size_t bwdDataWorkspaceSize = bwdDataAlgoPerf[0].memory;
	workspaceSize_ = std::max(fwdWorkspaceSize, std::max(bwdFilterWorkspaceSize, bwdDataWorkspaceSize));
	checkCUDA(cudaMalloc(&workspace_, workspaceSize_));
	*inputCHW = outCHW_;
	*width = outWidth;
	*height = outHeight;
}
ConvLayer::~ConvLayer(){
	cudaFree(outData_);
	cudaFree(weights_);
	cudaFree(bias_);
	cudaFree(workspace_);
	checkCUDNN(cudnnDestroyTensorDescriptor(inDesc_));
	checkCUDNN(cudnnDestroyTensorDescriptor(outDesc_));
	checkCUDNN(cudnnDestroyFilterDescriptor(filterDesc_));
	checkCUDNN(cudnnDestroyConvolutionDescriptor(convDesc_));
	checkCUDNN(cudnnDestroyTensorDescriptor(biasDesc_));
	if(train_){
		cudaFree(gradWeights_);
		cudaFree(gradBias_);
		cudaFree(gradOut_);
		if(useAdamW_){
			cudaFree(m_Weights_);
			cudaFree(v_Weights_);
			cudaFree(m_Bias_);
			cudaFree(v_Bias_);
		}
		checkCUDNN(cudnnDestroyTensorDescriptor(outGradDesc_));
	}
}
__half* ConvLayer::Forward(__half* data, bool train){
	inData_ = data;
	checkCUDNN(cudnnConvolutionForward(cudnnHandle_, &alpha, inDesc_, data, filterDesc_, weights_, convDesc_, fwdAlgo_, workspace_, workspaceSize_, &beta0, outDesc_, outData_));
	checkCUDNN(cudnnAddTensor(cudnnHandle_, &alpha, biasDesc_, bias_, &beta1, outDesc_, outData_));
	return outData_;
}
__half* ConvLayer::Backward(__half* grad){
	checkCUDNN(cudnnConvolutionBackwardFilter(cudnnHandle_, &alpha, inDesc_, inData_, inGradDesc_, grad, convDesc_, bwdFilterAlgo_, workspace_, workspaceSize_, &beta0, filterDesc_, gradWeights_));
	checkCUDNN(cudnnConvolutionBackwardBias(cudnnHandle_, &alpha, inGradDesc_, grad, &beta0, biasDesc_, gradBias_));
	checkCUDNN(cudnnConvolutionBackwardData(cudnnHandle_, &alpha, filterDesc_, weights_, inGradDesc_, grad, convDesc_, bwdDataAlgo_, workspace_, workspaceSize_, &beta0, outGradDesc_, gradOut_));
	return gradOut_;
}
void ConvLayer::UpdateParameters(float learningRate){
	if(useAdamW_){
		AdamWHalf(weights_, m_Weights_, v_Weights_, learningRate, gradWeights_, weightCount_, t_, 0.0001F);
		AdamWHalf(bias_, m_Bias_, v_Bias_, learningRate, gradBias_, outC_, t_, 0.0001F);
		++t_;
	} else{
		SGDHalf(weights_, learningRate, gradWeights_, weightCount_);
		SGDHalf(bias_, learningRate, gradBias_, outC_);
	}
}

I’ve also noticed some serious performance issues with cudnnConvolutionBackwardFilter when using batch sizes of 64 or greater where the GPU will be 100% but it consumes virtually no power with very little frame buffer and bus utilization like the GPU itself is spin waiting for something.

Can you please help us with repro steps and complete log trace?
Thanks

I dont know what logs you mean, just create an instance and run the backward pass to see the error.

The performance issue turned out to be the kernel size being 5 rather than the 4 required by the tensor cores which i didnt see anywhere in the documentation but the NHWC incompatibility is still a mystery because i can clearly see from the kernels executed the data gets internal converted from NCHW to NHWC for the very step that wont accept NHWC tensors.