cuDNN batched input - not running correctly

I have implemented a 3D convolution network using cuDNN. I noticed that the first data point's output is correct, but the outputs for the rest of the data points in the batch are all identical (and incorrect). I have checked the inputs, and they are correct.

I have coded my conv3d as follows:

     // Fill strideA[0..nbDims-1] with element strides for a fully packed
     // tensor whose dimensions are dimA[0..nbDims-1].
     //   isNchw == true : innermost axis (dimA[nbDims-1]) is contiguous,
     //                    i.e. standard NCHW / NCDHW packing.
     //   isNchw == false: the channel axis (dimA[1]) is contiguous,
     //                    i.e. NHWC / NDHWC packing (assumes nbDims >= 3).
     inline void GenerateStrides(const int* dimA, int* strideA, int nbDims, bool isNchw)
    {
        if (isNchw) {
            // Walk from the innermost axis outward, accumulating the
            // packed stride as the running product of dimensions.
            int stride = 1;
            for (int d = nbDims - 1; d >= 0; --d) {
                strideA[d] = stride;
                stride *= dimA[d];
            }
        } else {
            // Channels are innermost, spatial axes follow, batch is outermost.
            strideA[1] = 1;
            strideA[nbDims - 1] = dimA[1];
            for (int d = nbDims - 2; d >= 2; --d) {
                strideA[d] = strideA[d + 1] * dimA[d + 1];
            }
            strideA[0] = strideA[2] * dimA[2];
        }
    }

   .........

    // --- Descriptor setup for a batched 3-D convolution forward pass. ---
    // Creates input/output tensor, filter, and convolution descriptors,
    // then queries the output shape and workspace size for the chosen algo.
    checkCudnnErr(cudnnCreateTensorDescriptor(&m_sOps.cudnnInputTensor));
    checkCudnnErr(cudnnCreateTensorDescriptor(&m_sOps.cudnnOutputTensor));
    checkCudnnErr(cudnnCreateFilterDescriptor(&m_sOps.cudnnFilter));
    checkCudnnErr(cudnnCreateConvolutionDescriptor(&m_sOps.cudnnConvDesc));

    // Input tensor dims in NCDHW order: [batch, channels, depth, height, width].
    int nlInpDims[5];
    nlInpDims[0] = nBatchSize;
    nlInpDims[1] = nInpChannels;
    nlInpDims[2] = nlInpShape[0];
    nlInpDims[3] = nlInpShape[1];
    nlInpDims[4] = nlInpShape[2];

    // Filter dims in KCDHW order: [out_channels, in_channels, kd, kh, kw].
    int nlFilterDims[5];
    int nlDilation[3] = {1, 1, 1}; 
    int nlInpStrides[5], nlOutStrides[5];
    nlFilterDims[0] = nNumFilters;
    nlFilterDims[1] = nInpChannels;
    nlFilterDims[2] = nlFilterShape[0];
    nlFilterDims[3] = nlFilterShape[1];
    nlFilterDims[4] = nlFilterShape[2];

   // Fully packed NCDHW strides; stride[0] is the per-sample element count,
   // so the descriptor does expect all nBatchSize samples contiguous in one
   // device buffer.
   // NOTE(review): given the "only the first sample's output is correct"
   // symptom, the descriptors here look consistent — verify that the
   // cudnnConvolutionForward call (not shown) passes the base pointer of the
   // WHOLE batch for x and y, and that both device buffers are allocated for
   // nBatchSize samples, not just one.
   GenerateStrides(nlInpDims, nlInpStrides, 5, 1);

   checkCudnnErr(cudnnSetTensorNdDescriptor(m_sOps.cudnnInputTensor,
                                cudnnDtype,
                                5,
                                nlInpDims,
                                nlInpStrides));

  // Filter descriptor takes a layout enum instead of explicit strides.
  checkCudnnErr(cudnnSetFilterNdDescriptor(m_sOps.cudnnFilter,
                                        cudnnDtype,
                                        CUDNN_TENSOR_NCHW,
                                        5,
                                        nlFilterDims));

// 3 spatial dims; nlPadding / nlStrides are defined outside this excerpt —
// presumably int[3] in (d, h, w) order; confirm they match the input layout.
checkCudnnErr(cudnnSetConvolutionNdDescriptor(m_sOps.cudnnConvDesc,
                                            3,
                                            nlPadding,
                                            nlStrides,
                                            nlDilation,
                                            CUDNN_CROSS_CORRELATION,
                                            CUDNN_DATA_FLOAT));

// Returns all 5 output dims [N, C_out, D_out, H_out, W_out] into m_nlOutShape;
// m_nlOutShape[0] should come back equal to nBatchSize.
checkCudnnErr(cudnnGetConvolutionNdForwardOutputDim(m_sOps.cudnnConvDesc,
                                                    m_sOps.cudnnInputTensor,
                                                    m_sOps.cudnnFilter,
                                                    5,
                                                    m_nlOutShape));

// Output descriptor: packed NCDHW strides derived from the computed shape.
GenerateStrides(m_nlOutShape, nlOutStrides, 5, 1);
checkCudnnErr(cudnnSetTensorNdDescriptor(m_sOps.cudnnOutputTensor,
                                        cudnnDtype,
                                        5,
                                        m_nlOutShape,
                                        nlOutStrides));

// NOTE(review): the algorithm is hard-coded rather than queried. Prefer
// cudnnGetConvolutionForwardAlgorithm_v7 / cudnnFindConvolutionForwardAlgorithm
// so an algo that actually supports this 3-D configuration is selected; a
// mismatch would normally surface as a status error, but only if the return
// of cudnnConvolutionForward itself is checked.
m_cudnnConvAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;

checkCudnnErr(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
                                                    m_sOps.cudnnInputTensor,
                                                    m_sOps.cudnnFilter,
                                                    m_sOps.cudnnConvDesc,
                                                    m_sOps.cudnnOutputTensor,
                                                    m_cudnnConvAlgo,
                                                    &m_nWorkspaceSize));
// Workspace may legitimately be 0 bytes for some algos; cudaMalloc(0) is fine.
checkCudaErr( cudaMalloc(&m_sOps.lWorkspace, m_nWorkspaceSize));

I have tried the code with both CUDA 10.2 and CUDA 11.0, and with both cuDNN 7.6.5 and cuDNN 8.0.

I tried batch size of 4, 32, 256. All have similar results.

Let me know if more information is required.