The output generated for cufftExecR2C and cufftExecC2R in CUDA 8.0 and CUDA 10.2 tool kit is different.
Most of the difference is in the floating point decimal values, however there are few locations in which there is huge difference.
Please find below the output:-
line | x y |
131580 | 252 511 |
CUDA 10.2: Real : 327664, Complex : 1.0679e+07
CUDA 8.0 : Real : 327712, Complex : 1.0679e+007
Is this expected behaviour?
Please find below the system configuration used:-
CUDA 10.2:
CUDA ToolKit : 10.2
Windows : 10
IDE : Visual Studio 2017
GPU CARD : P5000
CUDA 8.0:
CUDA ToolKit : 8.0
Windows : 7
IDE : Visual Studio 2010
GPU CARD : P5000
Note:-The output data is consistent for every run.
Please find below the sample code created to dump cufftExecR2C output data.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cufft.h>
#include <malloc.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <sstream>
#include <fstream>
#include <iostream>
using namespace std;
typedef float2 Complex;
float * dev_vol = NULL;
#define IMAGE_DIM 512
void
dev_save_complex(std::string file_name,Complex *data,int xDim,int yDim)
{
cudaError_t cuda_err;
cufftComplex *host_buff;
int noOfComplexElmtsInPix = yDim * (xDim / 2 + 1);
int noOfComplexElmtsInBytes = noOfComplexElmtsInPix * sizeof(cufftComplex);
//alloc the host buffer
host_buff = (cufftComplex*)malloc(noOfComplexElmtsInBytes);
if(host_buff == NULL){
printf("[dev_save_complex] Error in memory allocation");
abort();
}
cuda_err= cudaMemcpy(host_buff, data, noOfComplexElmtsInBytes,cudaMemcpyDeviceToHost);
if(cuda_err != cudaSuccess) {
printf("[dev_save_complex] Error in cudaMemcpy : %s\n",cudaGetErrorString(cuda_err));
abort();
}
ofstream outdata;
outdata.open(file_name);
if(!outdata) {
cerr << "[dev_save_complex] Error: file could not be opened" << endl;
exit(1);
}
// print the complex data so you can see what it looks like
for (int y=0; y < yDim; y++) {
for (int x=0; x < (xDim/2+1); x++) {
// initialize the input image memory somehow
// (this probably comes from a file or image buffer or something)
outdata << x << " " << y << " " << host_buff[y*(xDim/2+1)+x].x << " " << host_buff[y*(xDim/2+1)+x].y << endl;
}
}
outdata.close();
free(host_buff);
}
cudaError_t cufftTest () {
cudaError_t cudaStatus = cudaSuccess;
cufftResult cudaCUFFTStatus = CUFFT_SUCCESS;
// Declare handles to the FFT plans
cufftHandle forwardFFTPlan;
cufftHandle inverseFFTPlan;
// Create the plans -- forward and reverse (Real2Complex, Complex2Real)
cufftPlan2d(&forwardFFTPlan, IMAGE_DIM, IMAGE_DIM, CUFFT_R2C);
cufftPlan2d(&inverseFFTPlan, IMAGE_DIM, IMAGE_DIM, CUFFT_C2R);
int num_real_elements = IMAGE_DIM * IMAGE_DIM;
int num_complex_elements = IMAGE_DIM * (IMAGE_DIM / 2 + 1);
// HOST MEMORY
float *h_img;
float *h_imgF;
cufftComplex* h_complex_imgSpec;
// ALLOCATE HOST MEMORY
h_img = (float*)malloc(num_real_elements * sizeof(float));
h_complex_imgSpec = (cufftComplex*)malloc(num_complex_elements * sizeof(cufftComplex));
h_imgF = (float*)malloc(num_real_elements * sizeof(float));
// Generate Input Data
for (int x=0; x < IMAGE_DIM; x++) {
for (int y=0; y < IMAGE_DIM; y++) {
float value = (float)(x * y);
h_img[y*IMAGE_DIM+x] = value + (value * 0.000001f);
}
}
// DEVICE MEMORY
float *d_img;
cufftComplex *d_complex_imgSpec;
float *d_imgF;
// ALLOCATE DEVICE MEMORY
cudaMalloc( (void**) &d_img, num_real_elements * sizeof(float));
cudaMalloc( (void**) &d_imgF, num_real_elements * sizeof(float));
cudaMalloc( (void**) &d_complex_imgSpec, num_complex_elements * sizeof(cufftComplex));
// Copy host memory to device (input image)
cudaStatus = cudaMemcpy( d_img, h_img, num_real_elements * sizeof(float), cudaMemcpyHostToDevice);
if(cudaStatus != cudaSuccess) {
printf("[cufftTest] Error cudaMemcpy : %s\n",cudaGetErrorString(cudaStatus));
return cudaStatus;
}
// Forward FFT (real to complex)
cudaCUFFTStatus = cufftExecR2C(forwardFFTPlan, d_img, d_complex_imgSpec);
if(cudaCUFFTStatus != CUFFT_SUCCESS) {
printf("[cufftTest] Error cufftExecR2C : %s\n",cudaGetErrorString(cudaStatus));
return cudaErrorPriorLaunchFailure;
}
// Copy the DEVICE complex data to the HOST
cudaStatus = cudaMemcpy( h_complex_imgSpec, d_complex_imgSpec, num_complex_elements * sizeof(cufftComplex), cudaMemcpyDeviceToHost);
if(cudaStatus != cudaSuccess) {
printf("[cufftTest] Error cudaMemcpy : %s\n",cudaGetErrorString(cudaStatus));
return cudaStatus;
}
// IFFT (complex to real)
cudaCUFFTStatus = cufftExecC2R(inverseFFTPlan, d_complex_imgSpec, d_imgF);
if(cudaCUFFTStatus != CUFFT_SUCCESS) {
printf("[cufftTest] Error cufftExecC2R : %s\n",cudaGetErrorString(cudaStatus));
return cudaErrorPriorLaunchFailure;
}
// Copy the DEVICE memory to the HOST memory
cudaStatus = cudaMemcpy( h_imgF, d_imgF, num_real_elements * sizeof(float), cudaMemcpyDeviceToHost);
if(cudaStatus != cudaSuccess) {
printf("[cufftTest] Error cudaMemcpy : %s\n",cudaGetErrorString(cudaStatus));
return cudaStatus;
}
#if 1
// Generate Ouput data into file
std::ostringstream fileNo;
std::string fileName = "SA_CUFFT_Cmplx_CUDA10_3.txt";
dev_save_complex(fileName, d_complex_imgSpec, IMAGE_DIM, IMAGE_DIM);
#endif
// CLEANUP HOST MEMORY
free(h_img);
free(h_imgF);
free(h_complex_imgSpec);
// CLEANUP DEVICE MEMORY
cudaFree(d_img);
cudaFree(d_imgF);
cudaFree(d_complex_imgSpec);
return cudaStatus;
}
int main()
{
// Call Helper Function for cufftTest
cudaError_t cudaStatus = cufftTest();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cufftTest Failed!\n");
return 1;
} else {
fprintf(stderr, "cufftTest Successful!\n");
}
return 0;
}