cudnnBatchNormalizationForwardTraining Results in batchNormOutputTensor with Same Large Negative Double

Using cuDNN primitives and methods, I’ve implemented activation of a twenty-seven-element array of doubles representing the output tensor of a convolution over a three-pixel-by-three-pixel image (an OpenCV Mat object).

I run into trouble when I insert object definitions and a call to cudnnBatchNormalizationForwardTraining beginning after the comment “Perform batch normalization”. The call to cudnnBatchNormalizationForwardTraining succeeds based on the output of the method. However, even though the input tensor to cudnnActivationForward is still set to d_convolutionOutputTensor, all the values in the array d_activationOutputTensor become -6.27744e+66. Similarly, when I change the input tensor to cudnnActivationForward to d_batchNormOutputTensor, all the values in the array d_activationOutputTensor become -6.27744e+66, even though I’m applying a RELU activation function. Why is cudnnBatchNormalizationForwardTraining overwriting d_convolutionOutputTensor and writing an unrealistic d_batchNormOutputTensor? How do I use this method correctly?

If you would offer advice, I would encourage you to compile my code by using a Windows-10 PC, installing an NVIDIA GPU, installing appropriate NVIDIA drivers, installing CUDA 10.2, installing cuDNN 7.6.5, inserting the below code into a cleared kernel.cu in a new Visual Studio 2019 project using the CUDA 10.2 runtime, adding “cudnn.lib;” to the Project Convolution → Linker → Input → Additional Dependencies property, building the solution in Debug x64 mode, and starting debugging.

// ---------------------------------------
// Allow use of cuDNN classes and methods.
// ---------------------------------------
// Add "cudnn.lib;" to Project Convolution -> Linker -> Input -> Additional Dependencies.
#include <cuda_runtime.h>
#include <cudnn.h>

#include <iostream>
#include <limits>


// Demonstrates cuDNN batch normalization followed by a ReLU activation on a
// 1x3x3x3 NCHW tensor of doubles.
//
// Fix versus the original code: cudnnBatchNormalizationForwardTraining
// requires bnScaleData, bnBiasData, resultRunningMeanData,
// resultRunningVarianceData, resultSaveMean, and resultSaveInvVariance to be
// DEVICE pointers.  The original passed host (malloc/calloc) pointers, so the
// kernel read and wrote unmapped GPU addresses and corrupted the output
// tensor (the uniform -6.27744e+66 garbage values).  All per-channel
// parameter buffers are now allocated with cudaMalloc and initialized on the
// device before the call.
int main()
{
	// -------------------------------------
	// Initialize handle to library context.
	// -------------------------------------
	cudnnHandle_t cudnnHandle;

	cudnnCreate(&cudnnHandle);

	// -------------------------------------------------------------------------
	// Define some properties of a convolutionOutputTensor.
	// Geometry: 1 batch, 3 channels, 3x3 spatial (NCHW).
	// -------------------------------------------------------------------------
	const int numberOfOutputTensors = 1;
	const int numberOfChannelsInOutputTensor = 3;
	const int heightOfOutputTensor = 3;
	const int widthOfOutputTensor = 3;

	const int numberOfElementsInOutputTensor =
		numberOfChannelsInOutputTensor *
		heightOfOutputTensor *
		widthOfOutputTensor;

	int outputTensorBytes = numberOfElementsInOutputTensor * sizeof(double);

	// Size of one per-channel parameter buffer (scale, bias, mean, variance).
	int perChannelBytes = numberOfChannelsInOutputTensor * sizeof(double);

	// -------------------------------------------
	// Create an outputTensorDescriptor.
	// -------------------------------------------
	// Declare a convolutionOutputTensorDescriptor.
	cudnnTensorDescriptor_t outputTensorDescriptor;

	// Initialize the convolutionOutputTensorDescriptor.
	cudnnCreateTensorDescriptor(&outputTensorDescriptor);

	// Set all the properties of the convolutionOutputTensorDescriptor.
	cudnnSetTensor4dDescriptor(
		/*tensorDesc=*/outputTensorDescriptor,
		/*format=*/CUDNN_TENSOR_NCHW,
		/*dataType=*/CUDNN_DATA_DOUBLE,
		/*n=*/numberOfOutputTensors,
		/*c=*/numberOfChannelsInOutputTensor,
		/*h=*/heightOfOutputTensor,
		/*w=*/widthOfOutputTensor);

	// Host-side convolution output (each channel holds the same 3x3 pattern).
	double h_convolutionOutputTensor[numberOfElementsInOutputTensor] = {
		-21.0118,
		-4.45882,
		4.49412,
		-4.45882,
		13.4824,
		-18.0118,
		4.49412,
		-18.0118,
		-5.95294,
		-21.0118,
		-4.45882,
		4.49412,
		-4.45882,
		13.4824,
		-18.0118,
		4.49412,
		-18.0118,
		-5.95294,
		-21.0118,
		-4.45882,
		4.49412,
		-4.45882,
		13.4824,
		-18.0118,
		4.49412,
		-18.0118,
		-5.95294
	};

	double* d_convolutionOutputTensor;

	cudaMalloc(&d_convolutionOutputTensor, outputTensorBytes);

	cudaMemcpy(
		d_convolutionOutputTensor,
		h_convolutionOutputTensor,
		outputTensorBytes,
		cudaMemcpyHostToDevice);

	// ------------------------------------------------------------------
	// Declare and allocate memory on device for a batchNormOutputTensor.
	// ------------------------------------------------------------------
	double* d_batchNormOutputTensor;

	cudaMalloc(&d_batchNormOutputTensor, outputTensorBytes);

	// ----------------------------
	// Perform batch normalization.
	// ----------------------------
	// SPATIAL mode: one scale/bias/mean/variance element per channel.
	cudnnBatchNormMode_t batchNormMode = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;

	// Derive the descriptor for the per-channel parameter tensors (1xCx1x1)
	// from the data-tensor descriptor.
	cudnnTensorDescriptor_t derivedBNTensorDescriptor;

	cudnnCreateTensorDescriptor(&derivedBNTensorDescriptor);

	cudnnDeriveBNTensorDescriptor(
		/*derivedBNDesc=*/derivedBNTensorDescriptor,
		/*xDesc=*/outputTensorDescriptor,
		/*mode=*/batchNormMode);

	// Build the scale/bias values on the host, then upload them: cuDNN reads
	// these buffers on the GPU, so they must reside in device memory.
	double h_batchNormScales[numberOfChannelsInOutputTensor];
	double h_batchNormBiases[numberOfChannelsInOutputTensor];
	for (int channel = 0; channel < numberOfChannelsInOutputTensor; ++channel) {
		h_batchNormScales[channel] = 1.0; // identity scale (gamma)
		h_batchNormBiases[channel] = 0.0; // zero shift (beta)
	}

	double* d_batchNormScales;
	double* d_batchNormBiases;
	cudaMalloc(&d_batchNormScales, perChannelBytes);
	cudaMalloc(&d_batchNormBiases, perChannelBytes);
	cudaMemcpy(d_batchNormScales, h_batchNormScales, perChannelBytes, cudaMemcpyHostToDevice);
	cudaMemcpy(d_batchNormBiases, h_batchNormBiases, perChannelBytes, cudaMemcpyHostToDevice);

	// With exponentialAverageFactor == 1.0 the running statistics are simply
	// overwritten with this batch's statistics; zero-initialize them anyway so
	// a blended update (factor < 1.0) never reads uninitialized device memory.
	double expAverageFactor = 1.0;

	double* d_resultRunningMeans;
	double* d_resultRunningVariances;
	cudaMalloc(&d_resultRunningMeans, perChannelBytes);
	cudaMalloc(&d_resultRunningVariances, perChannelBytes);
	cudaMemset(d_resultRunningMeans, 0, perChannelBytes);
	cudaMemset(d_resultRunningVariances, 0, perChannelBytes);

	// Epsilon added to the batch variance for numerical stability; must be
	// >= CUDNN_BN_MIN_EPSILON.
	double epsln = 0.00001;

	// Optional caches of the batch mean / inverse variance (write-only here);
	// also written by the GPU, so they too must be device allocations.
	double* d_resultSaveMean;
	double* d_resultSaveInvVariance;
	cudaMalloc(&d_resultSaveMean, perChannelBytes);
	cudaMalloc(&d_resultSaveInvVariance, perChannelBytes);

	// Blending factors (host pointers are correct for alpha/beta):
	// y = alpha * result + beta * priorY.
	double one = 1.0;
	double zero = 0.0;

	cudnnBatchNormalizationForwardTraining(
		/*handle=*/cudnnHandle,
		/*mode=*/batchNormMode,
		/**alpha=*/&one,
		/**beta=*/&zero,
		/*xDesc=*/outputTensorDescriptor,
		/**x=*/d_convolutionOutputTensor,
		/*yDesc=*/outputTensorDescriptor,
		/**y=*/d_batchNormOutputTensor,
		/*bnScaleBiasMeanVarDesc=*/derivedBNTensorDescriptor,
		/*bnScaleData=*/d_batchNormScales,
		/*bnBiasData=*/d_batchNormBiases,
		/*exponentialAverageFactor=*/expAverageFactor,
		/*resultRunningMeanData=*/d_resultRunningMeans,
		/*resultRunningVarianceData=*/d_resultRunningVariances,
		/*epsilon=*/epsln,
		/*resultSaveMean=*/d_resultSaveMean,
		/*resultSaveInvVariance=*/d_resultSaveInvVariance);

	// -----------------------------------------
	// Create an activation-function descriptor.
	// -----------------------------------------
	cudnnActivationDescriptor_t activationDescriptor;

	cudnnCreateActivationDescriptor(&activationDescriptor);

	// ReLU activation; coef is ignored for plain RELU (it is the clipping
	// threshold for CLIPPED_RELU), so infinity documents "no clipping".
	cudnnSetActivationDescriptor(
		/*activationDesc=*/activationDescriptor,
		/*mode=*/CUDNN_ACTIVATION_RELU,
		/*reluNanOpt=*/CUDNN_PROPAGATE_NAN,
		/*coef=*/std::numeric_limits<double>::infinity());

	// --------------------------------------------------------------------
	// Declare and allocate memory on device for an activationOutputTensor.
	// --------------------------------------------------------------------
	double* d_activationOutputTensor;

	cudaMalloc(&d_activationOutputTensor, outputTensorBytes);

	// -----------------------------------------------------------
	// Apply the activation function to the batchNormOutputTensor.
	// -----------------------------------------------------------
	cudnnActivationForward(
		/*handle=*/cudnnHandle,
		/*activationDesc=*/activationDescriptor,
		/**alpha=*/&one,
		/*xDesc=*/outputTensorDescriptor,
		/**x=*/d_batchNormOutputTensor,
		/**beta=*/&zero,
		/*yDesc=*/outputTensorDescriptor,
		/**y=*/d_activationOutputTensor);

	// --------------------------------------
	// Copy d_activationOutputTensor to host.
	// --------------------------------------
	double* h_activationOutputTensor = (double*)malloc(outputTensorBytes);

	// cudaMemcpy synchronizes with the preceding default-stream work, so the
	// results are complete when it returns.
	cudaMemcpy(
		h_activationOutputTensor,
		d_activationOutputTensor,
		outputTensorBytes,
		cudaMemcpyDeviceToHost);

	for (int i = 0; i < numberOfElementsInOutputTensor; ++i)
	{
		std::cout << h_activationOutputTensor[i] << std::endl;
	}

	// ------------
	// Free memory.
	// ------------
	free(h_activationOutputTensor);
	cudaFree(d_activationOutputTensor);
	cudaFree(d_resultSaveInvVariance);
	cudaFree(d_resultSaveMean);
	cudaFree(d_resultRunningVariances);
	cudaFree(d_resultRunningMeans);
	cudaFree(d_batchNormBiases);
	cudaFree(d_batchNormScales);
	cudaFree(d_batchNormOutputTensor);
	cudaFree(d_convolutionOutputTensor);
	cudnnDestroyActivationDescriptor(activationDescriptor);
	cudnnDestroyTensorDescriptor(derivedBNTensorDescriptor);
	cudnnDestroyTensorDescriptor(outputTensorDescriptor);
	cudnnDestroy(cudnnHandle);

	return 0;

} // main

I have resolved my own issue.

The bnScaleData, bnBiasData, resultRunningMeanData, resultRunningVarianceData, resultSaveMean, and resultSaveInvVariance arguments to cudnnBatchNormalizationForwardTraining must be device pointers. Each corresponding buffer (d_batchNormScales, d_batchNormBiases, d_resultRunningMeans, d_resultRunningVariances, d_resultSaveMean, and d_resultSaveInvVariance) needs to be allocated with cudaMalloc and, where the values are read, initialized on the GPU (e.g., via cudaMemcpy or cudaMemset). Passing host pointers from malloc/calloc makes the kernel read and write invalid addresses, which is what produced the -6.27744e+66 garbage values.

Thanks for updating the forum.