cudnnBatchNormalizationForwardInference results are different from Tensorflow

I have a batchnorm layer in my net with very small variance values (1.2e-37), and this causes the layer to produce slightly different results. The difference accumulates in later layers (more than 40 layers follow), so the final output of the network differs from the TensorFlow version. I’ve verified this by replacing the output of this batchnorm layer with the correct output, which results in correct final outputs.

This doesn’t make sense, considering TensorFlow supposedly uses the exact same method of doing the computation. Is there anything that needs to be taken into account with very small variance values? I set my epsilon correctly, and the layer is set up and launched as below:

//--------------------------------------------------
	//					layer setup batch_normalization_7
	//--------------------------------------------------
	// Batch-normalization layer (inference) applied to the output of
	// Concatenate_5. Steps:
	//   1. create the output tensor descriptor and the derived scale/bias/
	//      mean/variance descriptor,
	//   2. allocate host and device buffers,
	//   3. load gamma (g), beta (b), mean (m) and variance (v) from CSV,
	//   4. upload the parameters and run
	//      cudnnBatchNormalizationForwardInference.
	cudnnTensorDescriptor_t  desc_tsrBatch_normalization_7;
	cudnnTensorDescriptor_t  desc_wgtBatch_normalization_7;

	// Host-side output buffer and its device counterpart.
	float* outBatch_normalization_7;
	float* dev_outBatch_normalization_7;

	// Host-side per-channel parameters: scale, bias, mean, variance.
	float* gBatch_normalization_7;
	float* bBatch_normalization_7;
	float* mBatch_normalization_7;
	float* vBatch_normalization_7;

	// Device-side copies of the per-channel parameters.
	float* dev_gBatch_normalization_7;
	float* dev_bBatch_normalization_7;
	float* dev_mBatch_normalization_7;
	float* dev_vBatch_normalization_7;

	// The layer does not change the tensor shape: input shape is the output
	// shape of Concatenate_5, and output shape equals input shape.
	int h_inshpBatch_normalization_7 = h_outshpConcatenate_5;
	int w_inshpBatch_normalization_7 = w_outshpConcatenate_5;
	int c_inshpBatch_normalization_7 = c_outshpConcatenate_5;

	int h_outshpBatch_normalization_7 = h_inshpBatch_normalization_7;
	int w_outshpBatch_normalization_7 = w_inshpBatch_normalization_7;
	int c_outshpBatch_normalization_7 = c_inshpBatch_normalization_7;

	// Byte counts computed once in size_t so the c*h*w product cannot overflow
	// int before it is widened.
	const size_t chanBytesBatch_normalization_7 =
		(size_t)c_inshpBatch_normalization_7 * sizeof(float);
	const size_t tsrBytesBatch_normalization_7 =
		chanBytesBatch_normalization_7
		* (size_t)h_inshpBatch_normalization_7
		* (size_t)w_inshpBatch_normalization_7;

	CUDNN_ERROR(cudnnCreateTensorDescriptor(&desc_tsrBatch_normalization_7), "cudnnCreateTensorDescriptor");
	CUDNN_ERROR(cudnnCreateTensorDescriptor(&desc_wgtBatch_normalization_7), "cudnnCreateTensorDescriptor");

	// Output tensor: batch size 1, NHWC memory layout, float data.
	CUDNN_ERROR(cudnnSetTensor4dDescriptor(desc_tsrBatch_normalization_7, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, c_inshpBatch_normalization_7, h_inshpBatch_normalization_7, w_inshpBatch_normalization_7), "cudnnSetTensor4dDescriptor");
	// Parameter descriptor is derived from the INPUT descriptor
	// (desc_tsrConcatenate_5) in spatial mode.
	CUDNN_ERROR(cudnnDeriveBNTensorDescriptor(desc_wgtBatch_normalization_7, desc_tsrConcatenate_5, CUDNN_BATCHNORM_SPATIAL), "cudnnDeriveBNTensorDescriptor");

	// Host allocations. These were previously used unchecked; a failed malloc
	// would have been dereferenced by the loaders below.
	outBatch_normalization_7 = (float*)malloc(tsrBytesBatch_normalization_7);
	gBatch_normalization_7 = (float*)malloc(chanBytesBatch_normalization_7);
	bBatch_normalization_7 = (float*)malloc(chanBytesBatch_normalization_7);
	mBatch_normalization_7 = (float*)malloc(chanBytesBatch_normalization_7);
	vBatch_normalization_7 = (float*)malloc(chanBytesBatch_normalization_7);
	if (outBatch_normalization_7 == NULL || gBatch_normalization_7 == NULL ||
		bBatch_normalization_7 == NULL || mBatch_normalization_7 == NULL ||
		vBatch_normalization_7 == NULL) {
		fprintf(stderr, "malloc failed for batch_normalization_7 host buffers\n");
		exit(EXIT_FAILURE);
	}

	// Device allocations: one full-size output tensor plus four per-channel
	// parameter vectors.
	CUDA_ERROR(cudaMalloc((void**)&dev_outBatch_normalization_7, tsrBytesBatch_normalization_7), "cudaMalloc");
	CUDA_ERROR(cudaMalloc((void**)&dev_gBatch_normalization_7, chanBytesBatch_normalization_7), "cudaMalloc");
	CUDA_ERROR(cudaMalloc((void**)&dev_bBatch_normalization_7, chanBytesBatch_normalization_7), "cudaMalloc");
	CUDA_ERROR(cudaMalloc((void**)&dev_mBatch_normalization_7, chanBytesBatch_normalization_7), "cudaMalloc");
	CUDA_ERROR(cudaMalloc((void**)&dev_vBatch_normalization_7, chanBytesBatch_normalization_7), "cudaMalloc");

	// Load one value per channel for each parameter from the exported CSVs.
	uclLoadDataf("wgts1DenseNet45/gbatch_normalization_7.csv", gBatch_normalization_7, c_inshpBatch_normalization_7);
	uclLoadDataf("wgts1DenseNet45/bbatch_normalization_7.csv", bBatch_normalization_7, c_inshpBatch_normalization_7);
	uclLoadDataf("wgts1DenseNet45/mbatch_normalization_7.csv", mBatch_normalization_7, c_inshpBatch_normalization_7);
	uclLoadDataf("wgts1DenseNet45/vbatch_normalization_7.csv", vBatch_normalization_7, c_inshpBatch_normalization_7);
	// NOTE(review): preprocWgtsBatchnorm is defined elsewhere and is applied
	// only to the variance array. Presumably it rewrites the values in place
	// before upload -- confirm that what reaches cuDNN is still the plain
	// variance the API expects, and that tiny values (~1e-37) survive the CSV
	// round trip unchanged.
	preprocWgtsBatchnorm(vBatch_normalization_7, 1,1,1,c_outshpBatch_normalization_7);
	printf("outshp is: %i\n", c_outshpBatch_normalization_7);

	// Upload the per-channel parameters to the device.
	CUDA_ERROR(cudaMemcpy(dev_gBatch_normalization_7, gBatch_normalization_7, chanBytesBatch_normalization_7, cudaMemcpyHostToDevice), "cudaMemcpy");
	CUDA_ERROR(cudaMemcpy(dev_bBatch_normalization_7, bBatch_normalization_7, chanBytesBatch_normalization_7, cudaMemcpyHostToDevice), "cudaMemcpy");
	CUDA_ERROR(cudaMemcpy(dev_mBatch_normalization_7, mBatch_normalization_7, chanBytesBatch_normalization_7, cudaMemcpyHostToDevice), "cudaMemcpy");
	CUDA_ERROR(cudaMemcpy(dev_vBatch_normalization_7, vBatch_normalization_7, chanBytesBatch_normalization_7, cudaMemcpyHostToDevice), "cudaMemcpy");

	// Run inference-mode batch norm: reads dev_outConcatenate_5 (described by
	// desc_tsrConcatenate_5) and writes dev_outBatch_normalization_7.
	// NOTE(review): the epsilon passed here (1e-05) must equal the epsilon of
	// the layer that produced these weights. With variances near 1e-37 the
	// result is dominated by epsilon, so a mismatch (e.g. a framework default
	// of 1e-3) would by itself explain a per-layer difference -- confirm
	// against the original model config.
	CUDNN_ERROR(cudnnBatchNormalizationForwardInference(hnd_cuDNN, CUDNN_BATCHNORM_SPATIAL, &alpha1, &beta1, desc_tsrConcatenate_5, dev_outConcatenate_5, desc_tsrBatch_normalization_7, dev_outBatch_normalization_7, desc_wgtBatch_normalization_7, dev_gBatch_normalization_7, dev_bBatch_normalization_7, dev_mBatch_normalization_7, dev_vBatch_normalization_7, 1e-05), "cudnnBatchNormalizationForwardInference");

Hi,

Could you please provide details on the platforms you are using so we can better help:
- Linux distro and version
- GPU type
- NVIDIA driver version
- CUDA version
- cuDNN version
- Python version [if using Python]
- TensorFlow and PyTorch version

If possible, please share the script & model file to reproduce the issue.

Thanks