I have a batch-normalization layer in my network whose variance values are very small (about 1.2e-37). This layer produces slightly different results from the TensorFlow version, and the difference accumulates through the later layers (more than 40 layers follow), so the final output of the network differs from TensorFlow's. I’ve verified this by replacing the output of this batch-normalization layer with the correct output, which yields correct final outputs.
This doesn’t make sense, considering that TensorFlow supposedly uses exactly the same method for the computation. Is there anything that needs to be taken into account with very small variance values? I set my epsilon correctly, and the layer is set up and launched as shown below:
//--------------------------------------------------
// layer setup batch_normalization_7
//--------------------------------------------------
// Inference-mode spatial batch normalization applied to the output of
// Concatenate_5. Steps: create descriptors, allocate host/device buffers,
// load gamma/beta/mean/variance from CSV, upload them, run cuDNN.
cudnnTensorDescriptor_t desc_tsrBatch_normalization_7;   // output tensor descriptor
cudnnTensorDescriptor_t desc_wgtBatch_normalization_7;   // per-channel parameter descriptor
float* outBatch_normalization_7;       // host copy of the layer output
float* dev_outBatch_normalization_7;   // device layer output
float* gBatch_normalization_7;         // host gamma (scale)
float* bBatch_normalization_7;         // host beta (bias)
float* mBatch_normalization_7;         // host estimated mean
float* vBatch_normalization_7;         // host estimated variance
float* dev_gBatch_normalization_7;
float* dev_bBatch_normalization_7;
float* dev_mBatch_normalization_7;
float* dev_vBatch_normalization_7;

// Batch normalization keeps the shape of its input.
int h_inshpBatch_normalization_7 = h_outshpConcatenate_5;
int w_inshpBatch_normalization_7 = w_outshpConcatenate_5;
int c_inshpBatch_normalization_7 = c_outshpConcatenate_5;
int h_outshpBatch_normalization_7 = h_inshpBatch_normalization_7;
int w_outshpBatch_normalization_7 = w_inshpBatch_normalization_7;
int c_outshpBatch_normalization_7 = c_inshpBatch_normalization_7;

// Byte counts, computed once in size_t so the products cannot overflow int.
const size_t chBytesBatch_normalization_7 =
    (size_t)c_inshpBatch_normalization_7 * sizeof(float);
const size_t outBytesBatch_normalization_7 =
    chBytesBatch_normalization_7 * (size_t)h_inshpBatch_normalization_7 * (size_t)w_inshpBatch_normalization_7;

// Epsilon added to the variance inside the normalization.
// NOTE(review): cuDNN computes y = gamma * (x - mean) / sqrt(variance + epsilon) + beta.
// When the variance is ~0 (e.g. 1.2e-37) the scale is effectively gamma / sqrt(epsilon),
// so any mismatch with the epsilon of the exported model shows up directly in the
// output. Confirm this value against the source model's layer configuration.
const double epsBatch_normalization_7 = 1e-05;

CUDNN_ERROR(cudnnCreateTensorDescriptor(&desc_tsrBatch_normalization_7), "cudnnCreateTensorDescriptor");
CUDNN_ERROR(cudnnCreateTensorDescriptor(&desc_wgtBatch_normalization_7), "cudnnCreateTensorDescriptor");
// The dimension arguments are always given in n, c, h, w order; the format flag
// only selects the memory layout.
CUDNN_ERROR(cudnnSetTensor4dDescriptor(desc_tsrBatch_normalization_7, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, c_inshpBatch_normalization_7, h_inshpBatch_normalization_7, w_inshpBatch_normalization_7), "cudnnSetTensor4dDescriptor");
// The parameter descriptor is derived from the layer's input descriptor.
CUDNN_ERROR(cudnnDeriveBNTensorDescriptor(desc_wgtBatch_normalization_7, desc_tsrConcatenate_5, CUDNN_BATCHNORM_SPATIAL), "cudnnDeriveBNTensorDescriptor");

// Host and device buffers.
outBatch_normalization_7 = (float*)malloc(outBytesBatch_normalization_7);
CUDA_ERROR(cudaMalloc((void**)&dev_outBatch_normalization_7, outBytesBatch_normalization_7), "cudaMalloc");
CUDA_ERROR(cudaMalloc((void**)&dev_gBatch_normalization_7, chBytesBatch_normalization_7), "cudaMalloc");
CUDA_ERROR(cudaMalloc((void**)&dev_bBatch_normalization_7, chBytesBatch_normalization_7), "cudaMalloc");
CUDA_ERROR(cudaMalloc((void**)&dev_mBatch_normalization_7, chBytesBatch_normalization_7), "cudaMalloc");
CUDA_ERROR(cudaMalloc((void**)&dev_vBatch_normalization_7, chBytesBatch_normalization_7), "cudaMalloc");
gBatch_normalization_7 = (float*)malloc(chBytesBatch_normalization_7);
bBatch_normalization_7 = (float*)malloc(chBytesBatch_normalization_7);
mBatch_normalization_7 = (float*)malloc(chBytesBatch_normalization_7);
vBatch_normalization_7 = (float*)malloc(chBytesBatch_normalization_7);

// Fail loudly instead of writing through a NULL pointer in the loads below.
if (outBatch_normalization_7 == NULL || gBatch_normalization_7 == NULL ||
    bBatch_normalization_7 == NULL || mBatch_normalization_7 == NULL ||
    vBatch_normalization_7 == NULL) {
    fprintf(stderr, "malloc failed for batch_normalization_7 host buffers\n");
    exit(EXIT_FAILURE);
}

// Load the trained parameters, one value per channel.
uclLoadDataf("wgts1DenseNet45/gbatch_normalization_7.csv", gBatch_normalization_7, c_inshpBatch_normalization_7);
uclLoadDataf("wgts1DenseNet45/bbatch_normalization_7.csv", bBatch_normalization_7, c_inshpBatch_normalization_7);
uclLoadDataf("wgts1DenseNet45/mbatch_normalization_7.csv", mBatch_normalization_7, c_inshpBatch_normalization_7);
uclLoadDataf("wgts1DenseNet45/vbatch_normalization_7.csv", vBatch_normalization_7, c_inshpBatch_normalization_7);
// NOTE(review): preprocWgtsBatchnorm is applied to the variance only and its
// effect is not visible here. cuDNN expects the raw variance (it adds epsilon
// and takes the square root itself) - confirm this helper does not pre-apply either.
preprocWgtsBatchnorm(vBatch_normalization_7, 1,1,1,c_outshpBatch_normalization_7);
printf("outshp is: %i\n", c_outshpBatch_normalization_7);

// Upload the parameters to the device.
CUDA_ERROR(cudaMemcpy(dev_gBatch_normalization_7, gBatch_normalization_7, chBytesBatch_normalization_7, cudaMemcpyHostToDevice), "cudaMemcpy");
CUDA_ERROR(cudaMemcpy(dev_bBatch_normalization_7, bBatch_normalization_7, chBytesBatch_normalization_7, cudaMemcpyHostToDevice), "cudaMemcpy");
CUDA_ERROR(cudaMemcpy(dev_mBatch_normalization_7, mBatch_normalization_7, chBytesBatch_normalization_7, cudaMemcpyHostToDevice), "cudaMemcpy");
CUDA_ERROR(cudaMemcpy(dev_vBatch_normalization_7, vBatch_normalization_7, chBytesBatch_normalization_7, cudaMemcpyHostToDevice), "cudaMemcpy");

// Run the layer.
// NOTE(review): cuDNN blends the result as alpha * y + beta * (previous output).
// dev_outBatch_normalization_7 is freshly allocated and uninitialised, so beta1
// is presumably 0 - verify where alpha1/beta1 are defined.
CUDNN_ERROR(cudnnBatchNormalizationForwardInference(hnd_cuDNN, CUDNN_BATCHNORM_SPATIAL, &alpha1, &beta1,
                                                    desc_tsrConcatenate_5, dev_outConcatenate_5,
                                                    desc_tsrBatch_normalization_7, dev_outBatch_normalization_7,
                                                    desc_wgtBatch_normalization_7,
                                                    dev_gBatch_normalization_7, dev_bBatch_normalization_7,
                                                    dev_mBatch_normalization_7, dev_vBatch_normalization_7,
                                                    epsBatch_normalization_7),
            "cudnnBatchNormalizationForwardInference");