What is wrong? Am I missing something?
I am comparing the results of three real-to-complex (R2C) FFT calculations: cuFFT, fftw3, and Octave. The input is identical in all three cases. The cuFFT run is single precision; the fftw3 and Octave runs are double precision.
[codebox]
cuFFT
real vector input (same for all cases)
i = 0 h_r = -8.56089e-03
i = 1 h_r = 1.08910e-02
i = 2 h_r = 4.03077e-04
i = 3 h_r = 1.08910e-02
i = 4 h_r = 1.08910e-02
i = 5 h_r = 2.14187e-03
i = 6 h_r = 1.43777e-04
i = 7 h_r = 2.14187e-03
i = 8 h_r = 4.03077e-04
i = 9 h_r = 1.43777e-04
i = 10 h_r = 1.64003e-05
i = 11 h_r = 1.43777e-04
i = 12 h_r = 1.08910e-02
i = 13 h_r = 2.14187e-03
i = 14 h_r = 1.43777e-04
i = 15 h_r = 2.14187e-03
i = 16 h_r = -3.35135e-02
i = 17 h_r = -3.75033e-03
i = 18 h_r = -1.46343e-04
i = 19 h_r = -3.75033e-03
i = 20 h_r = -3.75033e-03
i = 21 h_r = -8.15940e-04
i = 22 h_r = -5.06648e-05
i = 23 h_r = -8.15940e-04
i = 24 h_r = -1.46343e-04
i = 25 h_r = -5.06648e-05
i = 26 h_r = -5.40964e-06
i = 27 h_r = -5.06648e-05
i = 28 h_r = -3.75033e-03
i = 29 h_r = -8.15940e-04
i = 30 h_r = -5.06648e-05
i = 31 h_r = -8.15940e-04
i = 32 h_r = -3.02584e-03
i = 33 h_r = -1.01135e-03
i = 34 h_r = -1.04121e-04
i = 35 h_r = -1.01135e-03
i = 36 h_r = -1.01135e-03
i = 37 h_r = -3.61816e-04
i = 38 h_r = -4.11104e-05
i = 39 h_r = -3.61816e-04
i = 40 h_r = -1.04121e-04
i = 41 h_r = -4.11104e-05
i = 42 h_r = -5.52676e-06
i = 43 h_r = -4.11104e-05
i = 44 h_r = -1.01135e-03
i = 45 h_r = -3.61816e-04
i = 46 h_r = -4.11104e-05
i = 47 h_r = -3.61816e-04
i = 48 h_r = -3.35135e-02
i = 49 h_r = -3.75033e-03
i = 50 h_r = -1.46343e-04
i = 51 h_r = -3.75033e-03
i = 52 h_r = -3.75033e-03
i = 53 h_r = -8.15940e-04
i = 54 h_r = -5.06648e-05
i = 55 h_r = -8.15940e-04
i = 56 h_r = -1.46343e-04
i = 57 h_r = -5.06648e-05
i = 58 h_r = -5.40964e-06
i = 59 h_r = -5.06648e-05
i = 60 h_r = -3.75033e-03
i = 61 h_r = -8.15940e-04
i = 62 h_r = -5.06648e-05
i = 63 h_r = -8.15940e-04
complex vector output
i = 0 c.x = -6.84871e-02 c.y = 0.00000e+00
i = 1 c.x = 3.96291e-02 c.y = -2.88948e-02
i = 2 c.x = 8.35904e-02 c.y = -4.78450e-02
i = 3 c.x = 2.92280e-03 c.y = -2.05540e-02
i = 4 c.x = -7.55090e-02 c.y = -3.11094e-03
i = 5 c.x = 1.18741e-02 c.y = -2.70728e-02
i = 6 c.x = 5.93808e-02 c.y = -5.42796e-02
i = 7 c.x = -2.30137e-02 c.y = -2.97453e-02
i = 8 c.x = -8.33654e-02 c.y = -2.95085e-03
i = 9 c.x = -1.88215e-02 c.y = 3.19200e-03
i = 10 c.x = 5.28139e-02 c.y = 3.53218e-03
i = 11 c.x = -5.62651e-03 c.y = -5.38738e-03
i = 12 c.x = -7.99111e-02 c.y = -1.29117e-03
i = 13 c.x = -1.71458e-02 c.y = 2.40943e-03
i = 14 c.x = 4.94980e-02 c.y = 2.02500e-02
i = 15 c.x = 6.86291e-03 c.y = 1.42692e-02
i = 16 c.x = -7.38585e-02 c.y = 0.00000e+00
i = 17 c.x = 1.24877e-02 c.y = -1.67480e-02
i = 18 c.x = 6.05266e-02 c.y = -2.90662e-02
i = 19 c.x = -8.04747e-03 c.y = -1.05635e-02
i = 20 c.x = -7.73376e-02 c.y = -1.28238e-03
i = 21 c.x = 6.20521e-03 c.y = -1.64751e-02
i = 22 c.x = 5.81067e-02 c.y = -4.31460e-02
i = 23 c.x = -2.22999e-02 c.y = -2.65534e-02
i = 24 c.x = -8.33654e-02 c.y = -2.94355e-03
i = 25 c.x = -1.87752e-02 c.y = 1.27940e-03
i = 26 c.x = 5.14595e-02 c.y = -5.00663e-03
i = 27 c.x = -1.02615e-02 c.y = -1.38290e-02
i = 28 c.x = -8.17221e-02 c.y = -3.10215e-03
i = 29 c.x = -2.70703e-02 c.y = -6.32641e-03
i = 30 c.x = 2.81458e-02 c.y = 3.52320e-03
i = 31 c.x = -1.74807e-02 c.y = 3.72732e-03
fftw3
complex vector output
0.68487E-01 0.00000E+00
-0.73858E-01 -0.13010E-17
-0.79194E-01 0.00000E+00
-0.73858E-01 0.13010E-17
-0.73858E-01 0.58818E-17
-0.78626E-01 0.69118E-17
-0.83369E-01 0.80502E-17
-0.78626E-01 0.69118E-17
-0.79194E-01 0.00000E+00
-0.83369E-01 0.13010E-17
-0.87530E-01 0.00000E+00
-0.83369E-01 -0.13010E-17
-0.73858E-01 -0.58818E-17
-0.78626E-01 -0.69118E-17
-0.83369E-01 -0.80502E-17
-0.78626E-01 -0.69118E-17
0.53865E-01 -0.21220E-16
0.17878E-01 -0.13935E-16
-0.14513E-01 -0.20081E-16
0.17878E-01 -0.27813E-16
0.17878E-01 -0.97070E-17
-0.65275E-02 -0.69626E-17
-0.28992E-01 -0.11279E-16
-0.65275E-02 -0.13902E-16
-0.14513E-01 -0.20949E-16
-0.28992E-01 -0.18977E-16
-0.42833E-01 -0.20678E-16
-0.28992E-01 -0.22446E-16
0.17878E-01 -0.33017E-16
-0.65275E-02 -0.27888E-16
-0.28992E-01 -0.29385E-16
-0.65275E-02 -0.34827E-16
0.14063E+00 0.00000E+00 line 31
0.89771E-01 0.69389E-17
0.42997E-01 0.00000E+00
0.89771E-01 -0.69389E-17
0.89771E-01 0.15558E-16
0.54279E-01 0.20871E-16
0.21066E-01 0.12089E-16
0.54279E-01 0.69931E-17
0.42997E-01 0.00000E+00
0.21066E-01 0.52042E-17
-0.44214E-04 0.00000E+00
0.21066E-01 -0.52042E-17
0.89771E-01 -0.15558E-16
0.54279E-01 -0.69931E-17
0.21066E-01 -0.12089E-16
0.54279E-01 -0.20871E-16
Octave
b = fft(a)
Complex vector output
-6.84871391589000e-02 + 0.00000000000000e+00i
3.96291077041512e-02 - 2.88947668673021e-02i
8.35904349173265e-02 - 4.78449580681125e-02i
2.92279943131487e-03 - 2.05540386989352e-02i
-7.55090222331402e-02 - 3.11093954685982e-03i
1.18740556658649e-02 - 2.70727689088251e-02i
5.93807985865659e-02 - 5.42796522627613e-02i
-2.30137097036718e-02 - 2.97453135658939e-02i
-8.33653884200000e-02 - 2.95084582214299e-03i
-1.88214735731874e-02 + 3.19200166325915e-03i
5.28138930260020e-02 + 3.53217713963965e-03i
-5.62650972052747e-03 - 5.38737858168509e-03i
-7.99111297521887e-02 - 1.29116797218873e-03i
-1.71457894871483e-02 + 2.40943106264560e-03i
4.94980204951858e-02 + 2.02500254068004e-02i
6.86290655312948e-03 + 1.42691858240473e-02i
-7.38584538091000e-02 + 0.00000000000000e+00i
1.24877352230575e-02 - 1.67479631573974e-02i
6.05266368092228e-02 - 2.90661730003730e-02i
-8.04746678939571e-03 - 1.05635255068958e-02i
-7.73375841108678e-02 - 1.28237766913215e-03i
6.20520975502444e-03 - 1.64750962242344e-02i
5.81067321808034e-02 - 4.31460086050482e-02i
-2.22999378339696e-02 - 2.65533702977674e-02i
-8.33653884200000e-02 - 2.94354589194300e-03i
-1.87751887382770e-02 + 1.27940352218526e-03i
5.14595050066286e-02 - 5.00663618962864e-03i
-1.02615345580985e-02 - 1.38290446091110e-02i
-8.17221110238032e-02 - 3.10214924380323e-03i
-2.70703509455312e-02 - 6.32640827771623e-03i
2.81458365782649e-02 + 3.52319110253461e-03i
-1.74807361827355e-02 + 3.72731824885600e-03i -line 31
-7.91937741029000e-02 + 0.00000000000000e+00i
-1.74807361827355e-02 - 3.72731824885597e-03i
2.81458365782649e-02 - 3.52319110253456e-03i
-2.70703509455312e-02 + 6.32640827771627e-03i
-8.17221110238032e-02 + 3.10214924380323e-03i
-1.02615345580985e-02 + 1.38290446091110e-02i
5.14595050066287e-02 + 5.00663618962861e-03i
-1.87751887382770e-02 - 1.27940352218527e-03i
-8.33653884200000e-02 + 2.94354589194299e-03i
-2.22999378339695e-02 + 2.65533702977674e-02i
5.81067321808034e-02 + 4.31460086050481e-02i
6.20520975502445e-03 + 1.64750962242343e-02i
-7.73375841108678e-02 + 1.28237766913214e-03i
-8.04746678939572e-03 + 1.05635255068958e-02i
6.05266368092228e-02 + 2.90661730003730e-02i
1.24877352230575e-02 + 1.67479631573973e-02i
-7.38584538091000e-02 + 0.00000000000000e+00i
6.86290655312943e-03 - 1.42691858240473e-02i
4.94980204951858e-02 - 2.02500254068003e-02i
-1.71457894871483e-02 - 2.40943106264555e-03i
-7.99111297521887e-02 + 1.29116797218874e-03i
-5.62650972052746e-03 + 5.38737858168510e-03i
5.28138930260020e-02 - 3.53217713963964e-03i
-1.88214735731875e-02 - 3.19200166325913e-03i
-8.33653884200000e-02 + 2.95084582214300e-03i
-2.30137097036717e-02 + 2.97453135658940e-02i
5.93807985865660e-02 + 5.42796522627613e-02i
1.18740556658649e-02 + 2.70727689088251e-02i
-7.55090222331402e-02 + 3.11093954685982e-03i
2.92279943131489e-03 + 2.05540386989352e-02i
8.35904349173266e-02 + 4.78449580681125e-02i
3.96291077041513e-02 + 2.88947668673020e-02i
[/codebox]
As you can see, the cuFFT output agrees with Octave but not with fftw3. For my calculations, however, the fftw3 output is the correct one.