cufftExecR2C and cufftExecC2R API calls generate different results in different CUDA Toolkit versions

The output generated by cufftExecR2C and cufftExecC2R with the CUDA 8.0 and CUDA 10.2 toolkits is different.
Most of the differences are in the floating-point decimal values; however, there are a few locations where the difference is large.

Please find below the output:-
line | x y |
131580 | 252 511 |
CUDA 10.2: Real : 327664, Complex : 1.0679e+07
CUDA 8.0 : Real : 327712, Complex : 1.0679e+007

Is this expected behaviour?

Please find below the system configuration used:-
CUDA 10.2:
CUDA ToolKit : 10.2
Windows : 10
IDE : Visual Studio 2017
GPU CARD : P5000

CUDA 8.0:
CUDA ToolKit : 8.0
Windows : 7
IDE : Visual Studio 2010
GPU CARD : P5000

Note: The output data is consistent for every run.

Please find below the sample code created to dump cufftExecR2C output data.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <cufft.h>
#include <malloc.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <sstream>
#include <fstream>
#include <iostream>

using namespace std;

typedef float2 Complex;
float * dev_vol = NULL;

#define IMAGE_DIM 512

// Copies an R2C spectrum (yDim rows of xDim/2+1 complex samples) from the
// device buffer `data` to the host and writes it to `file_name` as
// "x y real imag" text lines.  Aborts on allocation or copy failure and
// exits if the output file cannot be opened.
void
dev_save_complex(std::string file_name, Complex *data, int xDim, int yDim)
{
	// R2C output is Hermitian-symmetric, so only xDim/2+1 columns are stored.
	const int halfWidth = xDim / 2 + 1;
	const int elemCount = yDim * halfWidth;
	const int byteCount = elemCount * sizeof(cufftComplex);

	// Host-side staging buffer for the device data.
	cufftComplex *hostCopy = (cufftComplex*)malloc(byteCount);
	if (hostCopy == NULL) {
		printf("[dev_save_complex] Error in memory allocation");
		abort();
	}

	cudaError_t status = cudaMemcpy(hostCopy, data, byteCount, cudaMemcpyDeviceToHost);
	if (status != cudaSuccess) {
		printf("[dev_save_complex] Error in cudaMemcpy : %s\n", cudaGetErrorString(status));
		abort();
	}

	ofstream outdata;
	outdata.open(file_name);
	if (!outdata) {
		cerr << "[dev_save_complex] Error: file could not be opened" << endl;
		exit(1);
	}

	// One text line per spectrum sample: column, row, real part, imaginary part.
	for (int row = 0; row < yDim; row++) {
		for (int col = 0; col < halfWidth; col++) {
			const cufftComplex &sample = hostCopy[row * halfWidth + col];
			outdata << col << " " << row << " " << sample.x << " " << sample.y << endl;
		}
	}
	outdata.close();

	free(hostCopy);
}

// Round-trips a deterministic real test image through cufftExecR2C and
// cufftExecC2R, then dumps the complex spectrum to a text file so results
// can be compared across CUDA Toolkit versions.
// Returns cudaSuccess on success, the failing CUDA status on a runtime-API
// error, or cudaErrorPriorLaunchFailure when a cuFFT call fails (so the
// caller's cudaError_t check still works).
cudaError_t cufftTest () {

	cudaError_t cudaStatus = cudaSuccess;
	cufftResult cufftStatus = CUFFT_SUCCESS;

	// Handles to the forward (R2C) and inverse (C2R) FFT plans.  The
	// *Created flags let the cleanup path destroy only plans that were
	// actually created (the original code leaked both plans).
	cufftHandle forwardFFTPlan = 0;
	cufftHandle inverseFFTPlan = 0;
	bool forwardPlanCreated = false;
	bool inversePlanCreated = false;

	// R2C output is Hermitian-symmetric: only IMAGE_DIM/2+1 complex columns.
	int num_real_elements = IMAGE_DIM * IMAGE_DIM;
	int num_complex_elements = IMAGE_DIM * (IMAGE_DIM / 2 + 1);

	// HOST MEMORY -- initialized to NULL so the cleanup path can free
	// unconditionally even after an early failure.
	float *h_img = NULL;
	float *h_imgF = NULL;
	cufftComplex *h_complex_imgSpec = NULL;

	// DEVICE MEMORY -- cudaFree(NULL) is a no-op, so NULL-init is safe too.
	float *d_img = NULL;
	cufftComplex *d_complex_imgSpec = NULL;
	float *d_imgF = NULL;

	// Create the plans -- forward and reverse (Real2Complex, Complex2Real).
	// BUG FIX: the original code never checked these calls.
	cufftStatus = cufftPlan2d(&forwardFFTPlan, IMAGE_DIM, IMAGE_DIM, CUFFT_R2C);
	if(cufftStatus != CUFFT_SUCCESS) {
		printf("[cufftTest] Error cufftPlan2d (R2C) : %d\n", (int)cufftStatus);
		cudaStatus = cudaErrorPriorLaunchFailure;
		goto cleanup;
	}
	forwardPlanCreated = true;

	cufftStatus = cufftPlan2d(&inverseFFTPlan, IMAGE_DIM, IMAGE_DIM, CUFFT_C2R);
	if(cufftStatus != CUFFT_SUCCESS) {
		printf("[cufftTest] Error cufftPlan2d (C2R) : %d\n", (int)cufftStatus);
		cudaStatus = cudaErrorPriorLaunchFailure;
		goto cleanup;
	}
	inversePlanCreated = true;

	// ALLOCATE HOST MEMORY
	h_img = (float*)malloc(num_real_elements * sizeof(float));
	h_complex_imgSpec = (cufftComplex*)malloc(num_complex_elements * sizeof(cufftComplex));
	h_imgF = (float*)malloc(num_real_elements * sizeof(float));
	if(h_img == NULL || h_complex_imgSpec == NULL || h_imgF == NULL) {
		printf("[cufftTest] Error in host memory allocation\n");
		cudaStatus = cudaErrorMemoryAllocation;
		goto cleanup;
	}

	// Generate Input Data -- a deterministic ramp (x*y plus a tiny offset)
	// so every run produces identical input.
	for (int x=0; x < IMAGE_DIM; x++) {
		for (int y=0; y < IMAGE_DIM; y++) {
			float value = (float)(x * y);
			h_img[y*IMAGE_DIM+x] = value + (value * 0.000001f);
		}
	}

	// ALLOCATE DEVICE MEMORY -- BUG FIX: results are now checked.
	cudaStatus = cudaMalloc( (void**) &d_img, num_real_elements * sizeof(float));
	if(cudaStatus != cudaSuccess) {
		printf("[cufftTest] Error cudaMalloc : %s\n",cudaGetErrorString(cudaStatus));
		goto cleanup;
	}
	cudaStatus = cudaMalloc( (void**) &d_imgF, num_real_elements * sizeof(float));
	if(cudaStatus != cudaSuccess) {
		printf("[cufftTest] Error cudaMalloc : %s\n",cudaGetErrorString(cudaStatus));
		goto cleanup;
	}
	cudaStatus = cudaMalloc( (void**) &d_complex_imgSpec, num_complex_elements * sizeof(cufftComplex));
	if(cudaStatus != cudaSuccess) {
		printf("[cufftTest] Error cudaMalloc : %s\n",cudaGetErrorString(cudaStatus));
		goto cleanup;
	}

	// Copy host memory to device (input image)
	cudaStatus = cudaMemcpy( d_img, h_img, num_real_elements * sizeof(float), cudaMemcpyHostToDevice);
	if(cudaStatus != cudaSuccess) {
		printf("[cufftTest] Error cudaMemcpy : %s\n",cudaGetErrorString(cudaStatus));
		goto cleanup;
	}

	// Forward FFT (real to complex)
	// BUG FIX: the original printed cudaGetErrorString(cudaStatus), which was
	// still cudaSuccess, so failures reported "no error".  Report the actual
	// cuFFT status code instead.
	cufftStatus = cufftExecR2C(forwardFFTPlan, d_img, d_complex_imgSpec);
	if(cufftStatus != CUFFT_SUCCESS) {
		printf("[cufftTest] Error cufftExecR2C : %d\n", (int)cufftStatus);
		cudaStatus = cudaErrorPriorLaunchFailure;
		goto cleanup;
	}

	// Copy the DEVICE complex data to the HOST
	cudaStatus = cudaMemcpy( h_complex_imgSpec, d_complex_imgSpec, num_complex_elements * sizeof(cufftComplex), cudaMemcpyDeviceToHost);
	if(cudaStatus != cudaSuccess) {
		printf("[cufftTest] Error cudaMemcpy : %s\n",cudaGetErrorString(cudaStatus));
		goto cleanup;
	}

	// IFFT (complex to real) -- note: cufftExecC2R may overwrite its input.
	cufftStatus = cufftExecC2R(inverseFFTPlan, d_complex_imgSpec, d_imgF);
	if(cufftStatus != CUFFT_SUCCESS) {
		printf("[cufftTest] Error cufftExecC2R : %d\n", (int)cufftStatus);
		cudaStatus = cudaErrorPriorLaunchFailure;
		goto cleanup;
	}

	// Copy the DEVICE memory to the HOST memory
	cudaStatus = cudaMemcpy( h_imgF, d_imgF, num_real_elements * sizeof(float), cudaMemcpyDeviceToHost);
	if(cudaStatus != cudaSuccess) {
		printf("[cufftTest] Error cudaMemcpy : %s\n",cudaGetErrorString(cudaStatus));
		goto cleanup;
	}

#if 1
	// Generate output data into a file for cross-version comparison.
	dev_save_complex("SA_CUFFT_Cmplx_CUDA10_3.txt", d_complex_imgSpec, IMAGE_DIM, IMAGE_DIM);
#endif

cleanup:
	// BUG FIX: the original leaked the plans and, on any early return, all
	// host and device allocations.  Every exit now funnels through here.
	if (inversePlanCreated) cufftDestroy(inverseFFTPlan);
	if (forwardPlanCreated) cufftDestroy(forwardFFTPlan);

	// CLEANUP HOST MEMORY (free(NULL) is a no-op)
	free(h_img);
	free(h_imgF);
	free(h_complex_imgSpec);

	// CLEANUP DEVICE MEMORY (cudaFree(NULL) is a no-op)
	cudaFree(d_img);
	cudaFree(d_imgF);
	cudaFree(d_complex_imgSpec);

	return cudaStatus;
}

int main()
{
	// Run the cuFFT round-trip test and report the outcome on stderr.
	// Exit code 0 on success, 1 on failure.
	const cudaError_t status = cufftTest();
	if (status == cudaSuccess) {
		fprintf(stderr, "cufftTest Successful!\n");
		return 0;
	}
	fprintf(stderr, "cufftTest Failed!\n");
	return 1;
}

It is certainly possible behavior. An FFT consists of a sum of multiplications. Depending on how that sum is constructed (i.e., the order of operations), the results can change. “Large” deviations can come about in several ways, and changes in the order of processing/summing intermediate results could lead to changes in the numerical result produced. You might wish to read this.

Regarding what is correct, I don’t know. The difference you have shown is on the order of 1e-4 when looking at the magnitude of the real component. Since you are doing float arithmetic, you probably should never expect better accuracy than 1e-6. An interesting test would be to look at the results using double arithmetic. It’s possible that the CUDA 8.0 result was “farther” from the correctly rounded result at arbitrary precision.