Hello,
apparently, FFT of CUDA 7.5 produces sporadically wrong results in the following mode:
- 1-D FFT R2C (at least)
- in-place
- NATIVE Compatibility (deprecated)
GPU Test Systems:
- GTX 760
- Titan X
Switching to FFTW compatibility, the FFT produces correct results, so I suspect that this is due to the NATIVE Compatibilite (possibly in connection with in-place). The FFT of CUDA 5.0 and CUDA 6.5 provide correct results with NATIVE Compatibility. (NATIVE Compatibility is also declared deprecated in version CUDA 6.5).
Originally, I detected artifacts in results of a large code that uses in-place 1-D FFT R2C and in-place 1-D FFT C2R. To verify I wrote the test code below which checks the amplitudes from 1D-FFT R2C. I did neither test 1D-FFT C2R nor other transformations, seperately.
The test code runs 1D-FFT R2C on 200 time series (traces) with
- NATIVE Compatibility (in-place)
- FFTW Compatibility (in-place)
and compares the amplitudes from both results.
The test-code requires to include the samples/common/inc directory from the CUDA samples. A quick way to compile it is to use the directory structure and Makefile of the CUDA samples, i.e. compile it likewise to e.g. samples/7_CUDALibraries/simpleCUFFT.
By default (without options) the code prints a summary of the traces that differ (if any). “testCUFFT -h” shows a short description and some options. Running with “-o” produces different binary output files.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <helper_cuda.h>
typedef float2 Complex;
const char *_sdoc[] = {
" ",
" testCUFFT: compute R2C-FFT with Native and FFTW compatibilty ",
" and compare amplitudes of both FFTs ",
" ",
" Usage: ",
" testCUFFT [-h] [-o] [-v] ",
" ",
" Optional parameters: ",
" -h : print this doc ",
" -v : print values for all samples that differ in amplitudes ",
" (may provide large output) ",
" -o : write binary files ",
" ",
" Output binary files: ",
" fft_data.bin : input date for FFT ",
" fft_natv.bin : FFT amplitudes with Native compatibility ",
" fft_fftw.bin : FFT amplitudes with FFTW compatibility ",
" fft_diff.bin : FFT amplitudes of traces that differ ",
" (1. trace Native, 2nd trace FFTW, ...) ",
" Binary file format: ",
" 1-D: values for each trace ",
" Length and sampling of input data traces (in time) and output ",
" FFT-amplitudes (in frequency) are written to terminal ",
" ",
NULL };
// number of traces
#ifndef DATA_NX
#define DATA_NX 200
#endif
// number of time samples for each trace
#ifndef DATA_NT
#define DATA_NT 3072
#endif
// time sampling rate
#ifndef DATA_DT
#define DATA_DT 0.002f
#endif
// time origin
#ifndef DATA_OT
#define DATA_OT 0.f
#endif
// center of ricker wavelet in time
#ifndef DATA_TC
#define DATA_TC ((DATA_NT/4)*DATA_DT + DATA_OT);
#endif
// max. allowed difference between amplitudes of FFT(Native) and FFT(FFTW)
#ifndef DIFF_EPS
#define DIFF_EPS 0.00001f
#endif
#define PI 3.14159265358979323844
#define PI2 9.86960440108935861869
// forward declarations -------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
void runTest( const int argc, const char **argv );
void ricker( float tc, float beta, float dt_ext, float ot, float dt, int nt, float *data );
void print_sdoc( int ierr );
// ----------------------------------------------------------------------------------------
int main(const int argc, const char **argv)
{
runTest(argc, argv);
}
void runTest( const int argc, const char **argv )
{
// -------------------------------------------------------
int nx = DATA_NX; // number of traces
int nt = DATA_NT; // number of samples
// ricker wavelet definition -----------------------------
float ot = DATA_OT;
float dt = DATA_DT;
float tc = DATA_TC;
float beta = 30;
float dt_ext = (float) (3.0*sqrt(6)/(PI*beta));
// -------------------------------------------------------
// frequencies
int nfrq = nt/2+1;
float dfrq = (float) ((1.0/(2.0*dt))/((double) (nfrq-1)));
// --------------------------------------------------------
// padded number of samples for FFTW compatibility
int nt_fftw = 2*nfrq;
// --------------------------------------------------------
size_t size_nx;
size_t size_nt;
size_t size_natv;
size_t size_fftw;
size_t size_nfrq;
int ix, it;
float re, im;
FILE *fp;
// parameters
int verbose = 0;
int write_bin = 0;
// check differences
int diff_num;
float diff;
float diff_eps = DIFF_EPS;
float *diff_max;
int *diff_nix;
int *diff_itx;
// data/results on CPU
float *dat_natv; // for FFT with native compatibility mode
float *dat_fftw; // for FFT with FFTW compatibility mode
// FFT on GPU
Complex *dat_gpu;
// FFT on CPU
Complex *dat_cpu;
cufftHandle cufft_plan;
for ( ix=1; ix<argc; ix++ ){
if ( !strcmp(argv[ix], "-h") ){
print_sdoc( 0 );
}
if ( !strcmp(argv[ix], "-v") ){
verbose = 1;
}
else if ( !strcmp(argv[ix], "-o") ){
ix++;
write_bin = true;
}
else{
printf( "error: undefined argument \"%s\"\n", argv[ix] );
print_sdoc( 1 );
}
}
findCudaDevice( argc, argv );
printf( "processing %d traces\n", nx );
printf( "time : samples and sampling rate: n1=%-5d d1=%-6.5f\n", nt, dt );
printf( "frequ: samples and sampling rate: n1=%-5d d1=%-6.5f\n", nfrq, dfrq );
size_nx = nx * sizeof(float);
size_nt = nt * sizeof(float);
size_natv = nt * nx * sizeof(float);
size_fftw = nt_fftw * nx * sizeof(float);
size_nfrq = nfrq * nx * sizeof(Complex);
// allocate data on CPU and GPU -----------------------------
dat_natv = (float *) malloc(size_natv);
dat_fftw = (float *) malloc(size_fftw);
checkCudaErrors( cudaMalloc((void **)&dat_gpu, size_nfrq) );
dat_cpu = (Complex *) malloc(size_nfrq);
// ----------------------------------------------------------
// allocate max difference for each ix
diff_max = (float *) malloc(size_nx);
diff_nix = (int *) malloc(nx*sizeof(int));
diff_itx = (int *) malloc(nx*sizeof(int));
// ----------------------------------------------------------
// fill CPU data --------------------------------------------------------
// ----------------------------------------------------------------------
for ( ix=0; ix<nx; ix++ ){
for (it=0; it<nt; it++ )
dat_natv[ix*nt+it] = 0.f;
for (it=0; it<nt_fftw; it++ )
dat_fftw[ix*nt_fftw+it] = 0.f;
ricker( tc, beta, dt_ext, ot, dt, nt, &dat_natv[ix*nt] );
memcpy( &dat_fftw[ix*nt_fftw], &dat_natv[ix*nt], size_nt );
}
// ----------------------------------------------------------------------
// ----------------------------------------------------------------------
// optinal write of input data ------------------------------------------
// ----------------------------------------------------------------------
if ( write_bin ){
if ( !(fp = fopen("fft_inp_data.bin","w")) ){
printf("error: could not open file fft_inp_data.bin" );
exit(1);
}
fwrite( &dat_natv[0], nx, nt*sizeof(float), fp );
fclose(fp);
}
// ----------------------------------------------------------------------
// ----------------------------------------------------------------------
// FFT with NATIVE compatibility --------------------------------------------------------------
// --------------------------------------------------------------------------------------------
// initialize
checkCudaErrors( cudaMemset( dat_gpu, 0, size_nfrq ) );
// copy to GPU
checkCudaErrors( cudaMemcpy( dat_gpu, dat_natv, size_natv,
cudaMemcpyHostToDevice) );
// foward FFT real -> complex
checkCudaErrors( cufftPlan1d( &cufft_plan, nt, CUFFT_R2C, nx ) );
checkCudaErrors( cufftSetCompatibilityMode( cufft_plan, CUFFT_COMPATIBILITY_NATIVE ) );
checkCudaErrors( cufftExecR2C( cufft_plan, (cufftReal*) dat_gpu, (cufftComplex*) dat_gpu ) );
cufftDestroy( cufft_plan );
// copy back to CPU
checkCudaErrors( cudaMemcpy( dat_cpu, dat_gpu, size_nfrq,
cudaMemcpyDeviceToHost) );
// store amplitudes in input array
for ( ix=0; ix<nx; ix++ ){
for ( it=0; it<nfrq; it++ ){
re = dat_cpu[ix*nfrq+it].x;
im = dat_cpu[ix*nfrq+it].y;
dat_natv[ix*nt+it] = sqrtf(re*re+im*im);
}
for (; it<nt; it++ )
dat_natv[ix*nt+it] = 0.f;
}
// --------------------------------------------------------------------------------------------
// --------------------------------------------------------------------------------------------
// FFT with FFTW compatibility ---------------------------------------------------------------
// --------------------------------------------------------------------------------------------
// initialize
checkCudaErrors( cudaMemset( dat_gpu, 0, size_nfrq ) );
// copy to GPU
checkCudaErrors( cudaMemcpy( dat_gpu, dat_fftw, size_fftw,
cudaMemcpyHostToDevice) );
// foward FFT real -> complex
checkCudaErrors( cufftPlan1d( &cufft_plan, nt, CUFFT_R2C, nx ) );
checkCudaErrors( cufftSetCompatibilityMode( cufft_plan, CUFFT_COMPATIBILITY_FFTW_PADDING ) );
checkCudaErrors( cufftExecR2C( cufft_plan, (cufftReal*) dat_gpu, (cufftComplex*) dat_gpu ) );
cufftDestroy( cufft_plan );
// copy back to CPU
checkCudaErrors( cudaMemcpy( dat_cpu, dat_gpu, size_nfrq,
cudaMemcpyDeviceToHost) );
for ( ix=0; ix<nx; ix++ ){
for ( it=0; it<nfrq; it++ ){
re = dat_cpu[ix*nfrq+it].x;
im = dat_cpu[ix*nfrq+it].y;
dat_fftw[ix*nt_fftw+it] = sqrtf(re*re+im*im);
}
for (; it<nt_fftw; it++ )
dat_fftw[ix*nt_fftw+it] = 0.f;
}
// --------------------------------------------------------------------------------------------
// --------------------------------------------------------------------------------------------
// optinal write of FFTs (amplitudes) ---------------------------------------------------------
// --------------------------------------------------------------------------------------------
if ( write_bin ){
if ( !(fp = fopen("fft_amp_natv.bin","w")) ){
printf("error: could not open file fft_amp_natv.bin" );
exit(1);
}
for ( ix=0; ix<nx; ix++ ){
fwrite( &dat_natv[ix*nt], nfrq, sizeof(float), fp );
}
fclose(fp);
}
if ( write_bin ){
if ( !(fp = fopen("fft_amp_fftw.bin","w")) ){
printf("error: could not open file fft_amp_fftw.bin" );
exit(1);
}
for ( ix=0; ix<nx; ix++ ){
fwrite( &dat_fftw[ix*nt_fftw], nfrq, sizeof(float), fp );
}
fclose(fp);
}
// --------------------------------------------------------------------------------------------
// --------------------------------------------------------------------------------------------
printf( "\ndifferences of 1D R2C-FFT amplitudes with Native and FFTW compatibility:\n" );
if ( verbose ){
printf( "(ids of trace (ix) and sample (it) start at 0)\n" );
}
diff_num = 0;
for ( ix=0; ix<nx; ix++ ){
diff_max[ix] = 0.f;
diff_nix[ix] = 0;
for ( it=0; it<nfrq; it++ ){
diff = dat_natv[ix*nt+it] - dat_fftw[ix*nt_fftw+it];
if ( fabsf(diff) > fabsf(diff_max[ix]) ){
diff_max[ix] = diff;
diff_itx[ix] = it;
}
if ( fabsf(diff) > diff_eps ){
diff_num++;
diff_nix[ix]++;
if ( verbose ){
printf( "ix=%-4d it=%-4d : %10.5e - %10.5e = %10.5e\n", ix, it,
dat_natv[ix*nt+it], dat_fftw[ix*nt_fftw+it], diff );
}
}// diff
}// it
}// ix
if ( diff_num ){
if ( write_bin ){
if ( !(fp = fopen("fft_amp_diff.bin","w")) ){
printf( "error: could not open file fft_amp_diff.bin" );
exit(1);
}
}
else{
fp = 0;
}
printf( "differences exceeding %8.1e (ids of trace and sample (it) start at 0):\n", diff_eps );
for ( ix=0; ix<nx; ix++ ){
if ( diff_max[ix] > diff_eps ){
if ( fp ){
fwrite( &dat_natv[ix*nt], nfrq, sizeof(float), fp );
fwrite( &dat_fftw[ix*nt_fftw], nfrq, sizeof(float), fp );
}
printf( "trace %5d: %5d samples, max diff at it=%5d, natv=%8.3e fftw=%8.3e diff=%8.3e\n", ix, diff_nix[ix],
diff_itx[ix], dat_natv[ix*nt+diff_itx[ix]], dat_fftw[ix*nt_fftw+diff_itx[ix]], diff_max[ix] );
}
}
printf("total number of different samples: %d\n", diff_num, diff_eps);
if ( fp )
fclose(fp);
}
else{
printf ("no differences exceeding %8.1e\n", diff_eps );
}
free( dat_natv );
free( dat_fftw );
checkCudaErrors( cudaFree(dat_gpu ) );
free( dat_cpu );
free( diff_max);
free( diff_nix);
free( diff_itx);
cudaDeviceReset();
}
void print_sdoc( int ierr )
{
const char **pdoc = _sdoc;
while ( *pdoc )
printf( "%s\n", *pdoc++);
exit( ierr );
}
void ricker( float tc, float beta, float dt_ext, float ot, float dt, int nt, float *data )
{
double beta2, dtc, dtc2;
int i, it1, it2;
beta2 = ((double) beta) * ((double) beta);
dtc = (double) (tc - dt_ext);
it1 = (int) ((dtc-ot)/dt);
it1 = it1 > 0 ? it1 : 0;
dtc = (double) (tc + dt_ext);
it2 = (int) ((dtc-ot)/dt) + 1;
it2 = it2 < nt ? it2+1 : nt;
for ( i=it1; i<it2; i++ ){
dtc = ot + i*dt - tc;
dtc2 = dtc * dtc;
data[i] = (float) ((1.0-2.0*(PI2*beta2*dtc2))*exp(-(PI2*beta2*dtc2)));
}
}
If you have gnuplot available the binary file containing different results (traces) from NATIVE and FFTW can be visualized with the following gnuplot commands:
nf=1537
df=0.08138
set vi 70,50
set xlabel "Frequency [Hz]"
set ylabel "Diff. Trace #"
set ytics 1
fft_file = 'fft_amp_diff.bin'
splot \
fft_file binary array=(nf,max_ntr) origin=(0,0,0) dx=df dy=1 format='%float' \
title "FFT amplitudes: traces NATIVE: 0,2,..; traces FFTW: 1,3,.." with impulses lt 1
You can save these lines in a file (e.g. fft.gp) and run “gnuplot -persist fft.gp” to visualize the amplitudes of the FFT for traces that differ.
On my test systems the erroneous amplitudes and occurence from seperate runs may differ. It also depends on the number of samples, type of GPU, and possibly more.
Best regards.