CUFFT run wrong

i compute a FFT use cufft,when i compute a big size FFT (eg.65536),the result is wrong,but when the number of FFT is small,the result is correct,why?

#define NX 32768
#define BATCH 1

using namespace std;

//Complex data type
typedef float2 Complex;
int main()
{
cufftHandle plan;
cufftComplex *idata,*odata;
clock_t start,end;

//Allocate host memory for the signal
Complex  *h_signal=(Complex*)malloc(sizeof(Complex)*NX);
Complex  *h_result=(Complex*)malloc(sizeof(Complex)*NX);

//Initalize host memory for the signal
for(unsigned int i=0;i<NX;i++)
{
	h_signal[i].x=(float)i;
	h_signal[i].y=0;

}

/* Allocate device memory */
cudaMalloc((void**)&idata,sizeof(cufftComplex)*NX*BATCH);
cudaMalloc((void**)&odata,sizeof(cufftComplex)*NX*BATCH);

/* 主机设备数据传输*/	
cudaMemcpy(idata,h_signal,sizeof(Complex)*NX,cudaMemcpyHostToDevice);

	
if(cufftPlan1d(&plan,NX,CUFFT_C2C,BATCH)!=CUFFT_SUCCESS) {
	fprintf(stderr,"CUFFT error: Plan creation failed");
	return;	
}
/* Use the CUFFT plan to transform the signal in place .*/
if(cufftExecC2C(plan,idata,odata,CUFFT_FORWARD)!=CUFFT_SUCCESS) {
    fprintf(stderr,"CUFFT error:ExecC2C Forward failed");
	return;
}



if (cudaThreadSynchronize()!=cudaSuccess)
{
	fprintf(stderr,"Cuda error: Failed to synchronize\n");
	return;
}

cudaMemcpy(h_result,odata,sizeof(Complex)*NX,cudaMemcpyDeviceToHost);

/* Destroy the CUFFT plan. */
cufftDestroy(plan);
cudaFree(idata);
cudaFree(odata);

What are you comparing against to determine it is “wrong”?
And if it still doesn’t work, post your code including the comparison.

I was able to run your code but am too lazy to modify to compare against other libraries for accuracy.

/* 主机设备数据传输*/ LOL

my correct code which is serial processing


//a.cpp : Defines the entry point for the console application.
//

/时间抽选基2FFT及IFFT算法C语言实现/
/Author :Junyi Sun/
/Copyright 2004-2005/
/Mail:ccnusjy@yahoo.com.cn/

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include

using namespace std;
#define N 65536
/定义复数类型/
typedef struct
{
double real;
double img;
}complex;
void fft(); /快速傅里叶变换/
void ifft();
void initW(); /初始化变换核/
void change(); /变址/
void add(complex ,complex ,complex *); /复数加法/
void mul(complex ,complex ,complex *); /复数乘法/
void sub(complex ,complex ,complex *); /复数减法/
void divi(complex ,complex ,complex *); /复数除法/
void output(); /输出结果/

complex x[N], *W; /输入序列,变换核/
int size_x=0; /输入序列的大小,在本程序中仅限2的次幂/
double PI; /圆周率/

int main()
{
int i,method;
PI=atan(1.0)*4;
printf(“Please input the size of x:\n”);
scanf("%d",&size_x);
printf(“Please input the data in x[N]:\n”);
/*for(i=0;i<size_x;i++)
scanf("%lf%lf",&x[i].real,&x[i].img); */
for(i=0;i<size_x;i++) {
x[i].real=i;
x[i].img=0;
}
initW();
printf(“Use FFT(0) or IFFT(1)?\n”);
clock_t start,end;
scanf("%d",&method);
if(method==0) {
start=clock();
fft();
end=clock();
}

else   
	ifft();
output();   
cout<<"GPU use "<<(end-start)<<" ms"<<endl;
return   0;   

}

/快速傅里叶变换/
void fft()
{
int i=0,j=0,k=0,l=0;
complex up,down,product;
change();
for(i=0;i< (int) (log((float)size_x)/log((float)2)) ;i++)/一级蝶形运算/
{
l=1<<i; //l=2^i i从0开始,l为一个蝶型元算输入数据直接的距离,2^l为蝶距
for(j=0;j<size_x;j+= 2*l )/一组蝶形运算/
{
for(k=0;k<l;k++)/一个蝶形运算/
{
mul(x[j+k+l],W[(size_x/2/l)k],&product); //计算一次复数乘product=WNkX2(k)

			add(x[j+k],product,&up);                  //计算一次复数加up=X1(k)+ WNk*X2(k)

			sub(x[j+k],product,&down);                //计算一次复数减down=X1(k)-WNk*X2(k)
			x[j+k]=up;                                //up=X[k]
			x[j+k+l]=down;                          //down=X[k+N/2]
		}   
	}   
}   

}

/快速傅里叶逆变换/
void ifft()
{
int i=0,j=0,k=0,l=size_x;
complex up,down;
for(i=0;i< (int)( log((float)size_x)/log((float)2) );i++) /一级蝶形运算/
{
l/=2;
for(j=0;j<size_x;j+= 2*l ) /一组蝶形运算/
{
for(k=0;k<l;k++) /一个蝶形运算/
{
add(x[j+k],x[j+k+l],&up);
up.real/=2;up.img/=2;
sub(x[j+k],x[j+k+l],&down);
down.real/=2;down.img/=2;
divi(down,W,&down);
x[j+k]=up;
x[j+k+l]=down;
}
}
}
change();
}

/初始化变换核/
void initW()
{
int i;
W=(complex )malloc(sizeof(complex) * size_x);
for(i=0;i<size_x;i++)
{
W[i].real=cos(2
PI/size_xi);
W[i].img=-1
sin(2PI/size_xi);
}
}

/变址计算,将x(n)码位倒置/
void change()
{
complex temp;
unsigned long i=0,j=0,k=0;
double t;
for(i=0;i<size_x;i++)
{
k=i;j=0;
t=(log((float)size_x)/log((float)2));
while( (t–)>0 )
{
j=j<<1;
j|=(k & 1);
k=k>>1;
}
if(j>i)
{
temp=x[i];
x[i]=x[j];
x[j]=temp;
}
}
}

/输出傅里叶变换的结果/
void output()
{
int i;
printf(“The result are as follows\n”);
for(i=0;i<size_x;i++)
{
printf("%.4f",x[i].real);
if(x[i].img>=0.0001)
printf("+%.4fj\n",x[i].img);
else if(fabs(x[i].img)<0.0001)
printf("\n");
else
printf("%.4fj\n",x[i].img);
}
}
void add(complex a,complex b,complex *c)
{
c->real=a.real+b.real;
c->img=a.img+b.img;
}

void mul(complex a,complex b,complex c)
{
c->real=a.real
b.real - a.imgb.img;
c->img=a.real
b.img + a.imgb.real;
}
void sub(complex a,complex b,complex c)
{
c->real=a.real-b.real;
c->img=a.img-b.img;
}
void divi(complex a,complex b,complex c)
{
c->real=( a.real
b.real+a.img
b.img )/( b.real
b.real+b.imgb.img);
c->img=( a.img
b.real-a.realb.img)/(b.realb.real+b.img*b.img);
}

my cufft is below

//#include “cuda_runtime.h”
//#include “device_launch_parameters.h”
#include <cufft.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include

#define NX 512
#define BATCH 1

using namespace std;

//Complex data type
typedef float2 Complex;
int main()
{
cufftHandle plan;
cufftComplex *idata,*odata;
clock_t start,end;

//Allocate host memory for the signal
Complex  *h_signal=(Complex*)malloc(sizeof(Complex)*NX);
Complex  *h_result=(Complex*)malloc(sizeof(Complex)*NX);

//Initalize host memory for the signal
for(unsigned int i=0;i<NX;i++)
{
	h_signal[i].x=(float)i;
	h_signal[i].y=0;

}

/* Allocate device memory */
cudaMalloc((void**)&idata,sizeof(cufftComplex)*NX*BATCH);
cudaMalloc((void**)&odata,sizeof(cufftComplex)*NX*BATCH);

/* 主机设备数据传输*/	
cudaMemcpy(idata,h_signal,sizeof(Complex)*NX,cudaMemcpyHostToDevice);
if(cudaGetLastError()!=cudaSuccess) {
  fprintf(stderr,"Cuda error: Failed to allocate\n");
  return;
}

start=clock();
/* Create a 1D FFT plan. */

if(cufftPlan1d(&plan,NX,CUFFT_C2C,BATCH)!=CUFFT_SUCCESS) {
	fprintf(stderr,"CUFFT error: Plan creation failed");
	return;	
}
/* Use the CUFFT plan to transform the signal in place .*/
if(cufftExecC2C(plan,idata,odata,CUFFT_FORWARD)!=CUFFT_SUCCESS) {
    fprintf(stderr,"CUFFT error:ExecC2C Forward failed");
	return;
}
end=clock();

/* Inverse transform the signal in place.
if (cufftExecC2C(plan,idata,odata,CUFFT_INVERSE)!=CUFFT_SUCCESS)
{ 
	fprintf(stderr,"CUFFT error:ExecC2C Inverse failed");
	return;
} */

if (cudaThreadSynchronize()!=cudaSuccess)
{
	fprintf(stderr,"Cuda error: Failed to synchronize\n");
	return;
}

cudaMemcpy(h_result,odata,sizeof(Complex)*NX,cudaMemcpyDeviceToHost);

/* Show the result.*/
printf("The   result   are   as   follows\n");   
for(int i=0;i<NX;i++)   
{   
	printf("%.4f",h_result[i].x);   
	if(h_result[i].y>=0.0001)   
		printf("+%.4fj\n",h_result[i].y);   
	else   if(fabs(h_result[i].y)<0.0001)   
		printf("\n");   
	else     
		printf("%.4fj\n",h_result[i].y);   
}  

cout<<"GPU use "<<(end-start)<<" ms"<<endl;
/* Destroy the CUFFT plan. */
cufftDestroy(plan);
cudaFree(idata);
cudaFree(odata);

return 0;

}

when my NX which is the numbers of FFT points is bigger(eg, NX=65536),the result is wrong,i try my best to find where is wrong,but i cannnot ,thanks for your reply.

I believe your comparison is wrong. Most likely your code downloaded from /Author :Junyi Sun/ is not working.

Try to compare using standard library. FFTW is my recommendation.

I used 3.2.16 and it runs correctly.

I used #define NX 65536 btw

i have take your advice,and i run my cufft,then compare with FFTW,the result is the same,that is cufft’s result is wrong,here is my code,please tell me why this could happen.
my cufft code is below

#include  <cufft.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <iostream>

#define NX 16384
#define  BATCH 1

using namespace std;

//Complex data type
typedef float2 Complex;
int main()
{
    cufftHandle plan;
	cufftComplex *idata,*odata;
	clock_t start,end;

	//Allocate host memory for the signal
	Complex  *h_signal=(Complex*)malloc(sizeof(Complex)*NX);
	Complex  *h_result=(Complex*)malloc(sizeof(Complex)*NX);
	
	//Initalize host memory for the signal
	for(unsigned int i=0;i<NX;i++)
	{
		h_signal[i].x=(float)i;
		h_signal[i].y=0;
	}
	
	/* Allocate device memory */
	cudaMalloc((void**)&idata,sizeof(cufftComplex)*NX*BATCH);
	cudaMalloc((void**)&odata,sizeof(cufftComplex)*NX*BATCH);

    /* 主机设备数据传输*/	
	cudaMemcpy(idata,h_signal,sizeof(Complex)*NX,cudaMemcpyHostToDevice);
	if(cudaGetLastError()!=cudaSuccess) {
	  fprintf(stderr,"Cuda error: Failed to allocate\n");
	  return;
	}

	start=clock();
	/* Create a 1D FFT plan. */	
	if(cufftPlan1d(&plan,NX,CUFFT_C2C,BATCH)!=CUFFT_SUCCESS) {
		fprintf(stderr,"CUFFT error: Plan creation failed");
		return;	
	}
	/* Use the CUFFT plan to transform the signal in place .*/
	if(cufftExecC2C(plan,idata,odata,CUFFT_FORWARD)!=CUFFT_SUCCESS) {
	    fprintf(stderr,"CUFFT error:ExecC2C Forward failed");
		return;
	}
	end=clock();

	/* Inverse transform the signal in place.
	if (cufftExecC2C(plan,idata,odata,CUFFT_INVERSE)!=CUFFT_SUCCESS)
	{ 
		fprintf(stderr,"CUFFT error:ExecC2C Inverse failed");
		return;
	} */

	if (cudaThreadSynchronize()!=cudaSuccess)
	{
		fprintf(stderr,"Cuda error: Failed to synchronize\n");
		return;
	}

	cudaMemcpy(h_result,odata,sizeof(Complex)*NX,cudaMemcpyDeviceToHost);

	/* Show the result.*/
	printf("The   result   are   as   follows\n");   
	for(int i=0;i<NX;i++)   
	{   
		printf("%.4f",h_result[i].x);   
		if(h_result[i].y>=0.0001)   
			printf("+%.4fj\n",h_result[i].y);   
		else   if(fabs(h_result[i].y)<0.0001)   
			printf("\n");   
		else     
			printf("%.4fj\n",h_result[i].y);   
	}  

	cout<<"GPU use "<<(end-start)<<" ms"<<endl;
	/* Destroy the CUFFT plan. */[url]
	cufftDestroy(plan);
	cudaFree(idata);
	cudaFree(odata);

    return 0;
}

the result is wrong,then,i use FFTW to cumpute the same data,but the result is not same
,my fftw code is below

#include <complex>
#include <fftw3.h>
#include <math.h>
#include <iostream>
#include <sys/time.h>

double get_mseconds() {
   struct timeval tp;
   gettimeofday(&tp,NULL);
   return (double)1000*(tp.tv_sec+((1e-6)*tp.tv_usec));
}

#define N 16384
using namespace std;
int main(int argc, char * argv[]){
int i;
double start,end;
fftw_complex *in,*out;
fftw_plan p;
in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);

if((in==NULL)||(out==NULL))
{
    printf("Error:insufficient available memory\n");
}
else
{

for(i=0; i<N; i++)/*测试数据 */
{
   in[i][0] = i;
   in[i][1] = 0;
}
}


start=get_mseconds();
p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD,FFTW_ESTIMATE);
fftw_execute(p); /* repeat as needed */
end=get_mseconds();


fftw_destroy_plan(p);
fftw_cleanup();

/*for(i=0;i<N;i++)
{
printf("%f,%fi\n",in[i][0],in[i][1]);
}*/
printf("\n");
for(i=0;i<N;i++)
{
printf("%f,%fi\n",out[i][0],out[i][1]);
}

cout<<"it use "<<(end-start)<<"ms"<<endl;
if(in!=NULL) fftw_free(in);
if(out!=NULL) fftw_free(out);
return 0;


  }

i cumpute the same data,but the result is different,i have reserch a long time,but i cannot find why ,pelase help me.thanks.

I don’t have time to run and verify this today but what’s your error function to determine it’s wrong?

use this function instead:

//
// The relative forward error is bound by ~logN, see Th. 24.2 in:
//
// Higham, N. J. 2002. Accuracy and Stability of Numerical Algorithms, SIAM.
// (Available online at http://books.google.com/books?id=epilvM5MMxwC)
//

const float ulp = 1.192092896e-07f;

float relative_error( float2 *reference, float2 *result, int n, int batch )
{
float error = 0;
for( int i = 0; i < batch; i++ )
{
float diff = 0, norm = 0;
for( int j = 0; j < n; j++ )
{
diff += norm2( reference[j] - result[j] );
norm += norm2( reference[j] );
}
if( _isnan( diff ) )
return -1;

    error = max( error, diff / norm );
    
    reference += n;
    result += n;
}
return sqrt( error ) / ulp;

}

This comparison function is what I borrowed from Vasily Volkov’s website.

From the way you generate the input data, the larger the size, the max value is larger, so you can’t compare the difference against a fixed “smaller value”.

I don’t see CUFFT went wrong in my test.

sorry,i generate a solid input data,and do FFT in cufft,in the same way,i generate the same data,and do fft in fftw,the result is the same,donnot??i compare the result,they are different,there are something wrong in my code.where?you say you do not see CUFFT went wrong in your test,could you give me your test code,and i test your code in my platform,thank you very much.

how can i know my cumpute results are correct?can you give me some ways,thanks.

------------using FFTW-> GET–>fftw_output
----------//----------------------------\
Input data -----------------------------==> compare the difference of the two outputs using the function I posted.
----------\----------------------------//
-------------using cufft-> GET–>cufft_output

You should need do a type casting for the data type.
FFTW uses fftw_complex for double precision, and fftwf_complex for single precision numbers.
CUFFT has their type names.

The error function I posted above is really straightforward.

thanks for your reply,i saw CUFFT result error in this forum many times,so many people meet this qustion.for example Erroneous output data with CUFFT Is it a bug ? Can somebody explain this? (cufft strange result!)etc.Now i have my questions,one,in your way it is float type,but the FFT result is complex.second ,what is norm2,i do not know the function of norm2,please tell me?thank you very much.

I apologize for the norm2 definition.

float norm2( float2 a ) { return a.xa.x+a.ya.y ; }

i use your way to compute the error,when NX=1024,relative_error=0.4,when NX=4096,relative_error=0.68;when NX=32768,relative_error=0.77;when NX=1048576,relative_error=1.02828.can this prove CUFFT error?

what is the value of relative_error,can we concider the CUFFT is correct?

the relative error is it’s absolute error comparing to ULP (unit in the least place)

sqrt( error ) / ulp;

why not take a look at the output data and tell me if you think CUFFT is correct?

thank you very much,i will take a deep look it the cuFFT.