Batch normalization implementation using cuDNN

I have implemented a batch normalization layer to be used after a 3D convolution. It only processes the first data point correctly and produces a constant result for the rest of the data points.

Dimensions:
Input shape - (16, 16, 16)
Batch Size - 32
NumInpChannels - 32
lstfWeights[0] - 32
lstfWeights[1] - 32

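The stride arrays (nlInpStrides, nlOutputStrides, nlNVBStrides) referenced below are not shown here. For a fully packed NCDHW layout they would typically be computed as in this sketch (this assumes packed data; it is not necessarily the exact code in use):

// Strides for a fully packed NCDHW tensor (W is the innermost dimension).
int nlInpStrides[5];
nlInpStrides[4] = 1;                                                // stride of W
nlInpStrides[3] = nlInpShape[2];                                    // stride of H = W
nlInpStrides[2] = nlInpShape[1] * nlInpShape[2];                    // stride of D = H*W
nlInpStrides[1] = nlInpShape[0] * nlInpShape[1] * nlInpShape[2];    // stride of C = D*H*W
nlInpStrides[0] = nNumInpChannels * nlInpStrides[1];                // stride of N = C*D*H*W

// The output tensor has the same shape, so the same strides apply.
int nlOutputStrides[5] = { nlInpStrides[0], nlInpStrides[1], nlInpStrides[2],
                           nlInpStrides[3], nlInpStrides[4] };

// The 1 x C x 1 x 1 x 1 scale/bias/mean/variance tensor, fully packed.
int nlNVBStrides[5] = { nNumInpChannels, 1, 1, 1, 1 };
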
// Create descriptors for the input, output, and per-channel
// scale/bias/mean/variance tensors.
checkCudnnErr(cudnnCreateTensorDescriptor(&m_cudnnInpTensorDesc));
checkCudnnErr(cudnnCreateTensorDescriptor(&m_cudnnOutTensorDesc));
checkCudnnErr(cudnnCreateTensorDescriptor(&m_cudnnBiasMeanVarDesc));

// Input dims in NCDHW order: N, C, D, H, W.
int nlInpDims[5];
nlInpDims[0] = nBatchSize;
nlInpDims[1] = nNumInpChannels;
nlInpDims[2] = nlInpShape[0];
nlInpDims[3] = nlInpShape[1];
nlInpDims[4] = nlInpShape[2];

m_nOutputShape = new int[5];
m_nOutputShape[0] = nBatchSize;
m_nOutputShape[1] = nNumInpChannels;
m_nOutputShape[2] = nlInpShape[0];
m_nOutputShape[3] = nlInpShape[1];
m_nOutputShape[4] = nlInpShape[2];

// Spatial mode: scale, bias, mean and variance have one value per channel.
m_cudnnBatchNormMode = CUDNN_BATCHNORM_SPATIAL;

// Per-channel tensor for scale/bias/mean/variance: 1 x C x 1 x 1 x 1.
int nlMVBDims[5] = {1, nNumInpChannels, 1, 1, 1};
checkCudnnErr(cudnnSetTensorNdDescriptor(m_cudnnInpTensorDesc,
                                            cudnnDtype,
                                            5,
                                            nlInpDims,
                                            nlInpStrides));
checkCudnnErr(cudnnSetTensorNdDescriptor(m_cudnnOutTensorDesc,
                                            cudnnDtype,
                                            5,
                                            m_nOutputShape,
                                            nlOutputStrides));
checkCudnnErr(cudnnSetTensorNdDescriptor(m_cudnnBiasMeanVarDesc,
                                            CUDNN_DATA_FLOAT,
                                            5,
                                            nlMVBDims,
                                            nlNVBStrides));

const float alpha = 1.0f, beta = 0.0f;
// Inference-mode batch norm using precomputed (running) statistics.
// Per the cudnnBatchNormalizationForwardInference signature, the final
// arguments are bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean,
// estimatedVariance and epsilon.
checkCudnnErr(cudnnBatchNormalizationForwardInference(m_cudnnHandle,
                                                m_cudnnBatchNormMode,
                                                &alpha,
                                                &beta,
                                                m_cudnnInpTensorDesc,
                                                lstfInputs[0],
                                                m_cudnnOutTensorDesc,
                                                lstfOutputs[0],
                                                m_cudnnBiasMeanVarDesc,
                                                lstfWeights[3],   // bnScale
                                                lstfWeights[2],   // bnBias
                                                lstfWeights[0],   // estimatedMean
                                                lstfWeights[1],   // estimatedVariance
                                                (double)1e-6));   // epsilon
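
For reference, cuDNN also provides cudnnDeriveBNTensorDescriptor, which derives the scale/bias/mean/variance descriptor from the input descriptor and the batch norm mode; a sketch like the following could be used to cross-check the manually configured m_cudnnBiasMeanVarDesc:

// Sketch: derive the expected scale/bias/mean/variance descriptor for
// CUDNN_BATCHNORM_SPATIAL from the input descriptor.
cudnnTensorDescriptor_t derivedBnDesc;
checkCudnnErr(cudnnCreateTensorDescriptor(&derivedBnDesc));
checkCudnnErr(cudnnDeriveBNTensorDescriptor(derivedBnDesc,
                                            m_cudnnInpTensorDesc,
                                            CUDNN_BATCHNORM_SPATIAL));
// The derived dims/strides can be read back with cudnnGetTensorNdDescriptor
// and compared against those set on m_cudnnBiasMeanVarDesc.
checkCudnnErr(cudnnDestroyTensorDescriptor(derivedBnDesc));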

Is the implementation correct?

Hi @sks3i,
Please check the document below for more details.
https://docs.nvidia.com/deeplearning/cudnn/best-practices/index.html

Thanks!