CUFFT run wrong

bjtuffh · May 15, 2013, 9:34am

I compute an FFT using cuFFT. When the FFT size is large (e.g. 65536) the result is wrong, but when the FFT size is small the result is correct. Why?

#define NX 32768
#define BATCH 1

using namespace std;

//Complex data type
typedef float2 Complex;
/*
 * Runs one forward complex-to-complex FFT of NX points (BATCH transforms)
 * with cuFFT on a real ramp signal 0,1,...,NX-1 and copies the spectrum
 * back into h_result.
 *
 * NOTE(review): this snippet is truncated in the post - the result is never
 * printed and main() has no final return or closing brace.
 * NOTE(review): main() is declared int, so the bare `return;` statements
 * below are ill-formed in C++; they should return a non-zero status.
 */
int main()
{
cufftHandle plan;
cufftComplex *idata,*odata;
clock_t start,end; // NOTE(review): not used anywhere in the visible part of this snippet

//Allocate host memory for the signal
Complex  *h_signal=(Complex*)malloc(sizeof(Complex)*NX);
Complex  *h_result=(Complex*)malloc(sizeof(Complex)*NX);

//Initialize host memory for the signal: real part is the sample index, imaginary part is zero
for(unsigned int i=0;i<NX;i++)
{
	h_signal[i].x=(float)i;
	h_signal[i].y=0;

}

/* Allocate device memory (return codes are not checked here) */
cudaMalloc((void**)&idata,sizeof(cufftComplex)*NX*BATCH);
cudaMalloc((void**)&odata,sizeof(cufftComplex)*NX*BATCH);

/* Host-to-device data transfer */	
cudaMemcpy(idata,h_signal,sizeof(Complex)*NX,cudaMemcpyHostToDevice);

	
/* Create a 1D complex-to-complex FFT plan. */
if(cufftPlan1d(&plan,NX,CUFFT_C2C,BATCH)!=CUFFT_SUCCESS) {
	fprintf(stderr,"CUFFT error: Plan creation failed");
	return;	
}
/* Use the CUFFT plan to transform the signal out of place (idata -> odata). */
if(cufftExecC2C(plan,idata,odata,CUFFT_FORWARD)!=CUFFT_SUCCESS) {
    fprintf(stderr,"CUFFT error:ExecC2C Forward failed");
	return;
}



/* Wait for the device to finish before reading the result back. */
if (cudaThreadSynchronize()!=cudaSuccess)
{
	fprintf(stderr,"Cuda error: Failed to synchronize\n");
	return;
}

/* Device-to-host transfer of the spectrum. */
cudaMemcpy(h_result,odata,sizeof(Complex)*NX,cudaMemcpyDeviceToHost);

/* Destroy the CUFFT plan and release the device buffers.
   NOTE(review): h_signal and h_result are not freed in the visible code. */
cufftDestroy(plan);
cudaFree(idata);
cudaFree(odata);

aimjwizards · May 15, 2013, 8:10pm

What are you comparing against to determine it is “wrong”?
And if it still doesn’t work, post your code including the comparison.

I was able to run your code but am too lazy to modify to compare against other libraries for accuracy.

/* 主机设备数据传输*/ LOL

bjtuffh · May 16, 2013, 2:07am

Here is my reference code, a serial (CPU) implementation that I believe is correct:

//a.cpp : Defines the entry point for the console application.
//

/* Radix-2 decimation-in-time FFT and IFFT implemented in C */
/* Author: Junyi Sun */
/* Copyright 2004-2005 */
/* Mail: ccnusjy@yahoo.com.cn */

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>

using namespace std;
#define N 65536 /* capacity of the static input buffer x[] */

/* Complex number stored as separate real and imaginary parts. */
typedef struct
{
    double real;
    double img;
} complex;

void fft();    /* forward fast Fourier transform, in place on x[] */
void ifft();   /* inverse fast Fourier transform, in place on x[] */
void initW();  /* build the twiddle-factor table W[] */
void change(); /* bit-reversal reordering of x[] */
void add(complex ,complex ,complex *);  /* complex addition */
void mul(complex ,complex ,complex *);  /* complex multiplication */
void sub(complex ,complex ,complex *);  /* complex subtraction */
void divi(complex ,complex ,complex *); /* complex division */
void output(); /* print the contents of x[] */

complex x[N], *W; /* input sequence (transformed in place) and twiddle-factor table */
int size_x=0;     /* number of input samples; this program supports powers of two only */
double PI;        /* pi, computed at start-up in main() */

int main()
{
int i,method;
PI=atan(1.0)*4;
printf(“Please input the size of x:\n”);
scanf(“%d”,&size_x);
printf(“Please input the data in x[N]:\n”);
/*for(i=0;i<size_x;i++)
scanf(“%lf%lf”,&x[i].real,&x[i].img); */
for(i=0;i<size_x;i++) {
x[i].real=i;
x[i].img=0;
}
initW();
printf(“Use FFT(0) or IFFT(1)?\n”);
clock_t start,end;
scanf(“%d”,&method);
if(method==0) {
start=clock();
fft();
end=clock();
}

else   
	ifft();
output();   
cout<<"GPU use "<<(end-start)<<" ms"<<endl;
return   0;

}

/快速傅里叶变换/
void fft()
{
int i=0,j=0,k=0,l=0;
complex up,down,product;
change();
for(i=0;i< (int) (log((float)size_x)/log((float)2)) ;i++)/一级蝶形运算/
{
l=1<<i; //l=2^i i从0开始，l为一个蝶型元算输入数据直接的距离，2^l为蝶距
for(j=0;j<size_x;j+= 2*l )/一组蝶形运算/
{
for(k=0;k<l;k++)/一个蝶形运算/
{
mul(x[j+k+l],W[(size_x/2/l)k],&product); //计算一次复数乘product=WNkX2(k)

			add(x[j+k],product,&up);                  //计算一次复数加up=X1(k)+ WNk*X2(k)

			sub(x[j+k],product,&down);                //计算一次复数减down=X1(k)-WNk*X2(k)
			x[j+k]=up;                                //up=X[k]
			x[j+k+l]=down;                          //down=X[k+N/2]
		}   
	}   
}

}

/快速傅里叶逆变换/
void ifft()
{
int i=0,j=0,k=0,l=size_x;
complex up,down;
for(i=0;i< (int)( log((float)size_x)/log((float)2) );i++) /一级蝶形运算/
{
l/=2;
for(j=0;j<size_x;j+= 2*l ) /一组蝶形运算/
{
for(k=0;k<l;k++) /一个蝶形运算/
{
add(x[j+k],x[j+k+l],&up);
up.real/=2;up.img/=2;
sub(x[j+k],x[j+k+l],&down);
down.real/=2;down.img/=2;
divi(down,W,&down);
x[j+k]=up;
x[j+k+l]=down;
}
}
}
change();
}

/初始化变换核/
void initW()
{
int i;
W=(complex )malloc(sizeof(complex) * size_x);
for(i=0;i<size_x;i++)
{
W[i].real=cos(2PI/size_xi);
W[i].img=-1sin(2PI/size_xi);
}
}

/变址计算，将x(n)码位倒置/
void change()
{
complex temp;
unsigned long i=0,j=0,k=0;
double t;
for(i=0;i<size_x;i++)
{
k=i;j=0;
t=(log((float)size_x)/log((float)2));
while( (t–)>0 )
{
j=j<<1;
j|=(k & 1);
k=k>>1;
}
if(j>i)
{
temp=x[i];
x[i]=x[j];
x[j]=temp;
}
}
}

/输出傅里叶变换的结果/
void output()
{
int i;
printf(“The result are as follows\n”);
for(i=0;i<size_x;i++)
{
printf(“%.4f”,x[i].real);
if(x[i].img>=0.0001)
printf(“+%.4fj\n”,x[i].img);
else if(fabs(x[i].img)<0.0001)
printf(“\n”);
else
printf(“%.4fj\n”,x[i].img);
}
}
/* Complex addition: stores a + b in *c. */
void add(complex a,complex b,complex *c)
{
    const double re=a.real+b.real;
    const double im=a.img+b.img;
    c->real=re;
    c->img=im;
}

void mul(complex a,complex b,complex c)
{
c->real=a.realb.real - a.imgb.img;
c->img=a.realb.img + a.imgb.real;
}
void sub(complex a,complex b,complex c)
{
c->real=a.real-b.real;
c->img=a.img-b.img;
}
void divi(complex a,complex b,complex c)
{
c->real=( a.realb.real+a.imgb.img )/( b.realb.real+b.imgb.img);
c->img=( a.imgb.real-a.realb.img)/(b.realb.real+b.img*b.img);
}

My cuFFT code is below:

//#include "cuda_runtime.h"
//#include "device_launch_parameters.h"
#include <cufft.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <iostream>

#define NX 512
#define BATCH 1

using namespace std;

//Complex data type
typedef float2 Complex;

// Reports a failed CUDA runtime call on stderr; returns true when err is an error.
static bool cudaFailed(cudaError_t err,const char *what)
{
    if(err==cudaSuccess)
        return false;
    fprintf(stderr,"Cuda error: %s (%s)\n",what,cudaGetErrorString(err));
    return true;
}

/*
 * Runs BATCH forward complex-to-complex FFTs of NX points with cuFFT on a
 * real ramp signal 0,1,...,NX-1, prints the spectrum and the time spent in
 * plan creation plus execution.
 *
 * The transform is single precision (Complex is float2). For this ramp the
 * output magnitudes grow with NX, so when comparing against a
 * double-precision reference use an error relative to the magnitude of the
 * result rather than the printed digits or a fixed absolute threshold.
 *
 * Returns 0 on success and 1 on any allocation, CUDA or cuFFT failure.
 * All host and device resources are released on every path.
 */
int main()
{
    const size_t count=(size_t)NX*BATCH;
    const size_t bytes=sizeof(cufftComplex)*count;

    cufftHandle plan;
    bool havePlan=false;
    cufftComplex *idata=NULL,*odata=NULL;
    clock_t start=0,end=0;
    int status=1; // only set to 0 once everything has succeeded

    //Allocate host memory for the signal
    Complex *h_signal=(Complex*)malloc(bytes);
    Complex *h_result=(Complex*)malloc(bytes);

    do {
        if(h_signal==NULL || h_result==NULL) {
            fprintf(stderr,"Error: Failed to allocate host memory\n");
            break;
        }

        //Initialize host memory for the signal (the ramp restarts for every batch)
        for(size_t i=0;i<count;i++)
        {
            h_signal[i].x=(float)(i%NX);
            h_signal[i].y=0;
        }

        /* Allocate device memory */
        if(cudaFailed(cudaMalloc((void**)&idata,bytes),"Failed to allocate device input"))
            break;
        if(cudaFailed(cudaMalloc((void**)&odata,bytes),"Failed to allocate device output"))
            break;

        /* Host-to-device data transfer */
        if(cudaFailed(cudaMemcpy(idata,h_signal,bytes,cudaMemcpyHostToDevice),"Failed to copy input to device"))
            break;

        start=clock();
        /* Create a 1D FFT plan. */
        if(cufftPlan1d(&plan,NX,CUFFT_C2C,BATCH)!=CUFFT_SUCCESS) {
            fprintf(stderr,"CUFFT error: Plan creation failed\n");
            break;
        }
        havePlan=true;

        /* Use the CUFFT plan to transform the signal out of place (idata -> odata). */
        if(cufftExecC2C(plan,idata,odata,CUFFT_FORWARD)!=CUFFT_SUCCESS) {
            fprintf(stderr,"CUFFT error:ExecC2C Forward failed\n");
            break;
        }

        /* Inverse transform, disabled:
        if (cufftExecC2C(plan,idata,odata,CUFFT_INVERSE)!=CUFFT_SUCCESS)
        {
            fprintf(stderr,"CUFFT error:ExecC2C Inverse failed");
            break;
        } */

        /* The exec call only enqueues the GPU work, so wait for it to finish
           before stopping the timer and before reading the result. */
        if(cudaFailed(cudaDeviceSynchronize(),"Failed to synchronize"))
            break;
        end=clock();

        if(cudaFailed(cudaMemcpy(h_result,odata,bytes,cudaMemcpyDeviceToHost),"Failed to copy result to host"))
            break;

        /* Show the result.*/
        printf("The   result   are   as   follows\n");
        for(size_t i=0;i<count;i++)
        {
            printf("%.4f",h_result[i].x);
            if(h_result[i].y>=0.0001)
                printf("+%.4fj\n",h_result[i].y);
            else if(fabs(h_result[i].y)<0.0001)
                printf("\n");
            else
                printf("%.4fj\n",h_result[i].y);
        }

        /* clock() counts ticks; convert to milliseconds. */
        cout<<"GPU use "<<(end-start)*1000.0/CLOCKS_PER_SEC<<" ms"<<endl;
        status=0;
    } while(0);

    /* Destroy the CUFFT plan and release every buffer (NULL pointers are fine here). */
    if(havePlan)
        cufftDestroy(plan);
    cudaFree(idata);
    cudaFree(odata);
    free(h_signal);
    free(h_result);

    return status;
}

When NX, the number of FFT points, is larger (e.g. NX=65536), the result is wrong. I tried my best to find what is wrong, but I cannot. Thanks for your reply.

aimjwizards · May 16, 2013, 3:44pm

I believe your comparison is wrong. Most likely your code downloaded from /Author :Junyi Sun/ is not working.

Try to compare using standard library. FFTW is my recommendation.

I used 3.2.16 and it runs correctly.

aimjwizards · May 16, 2013, 3:45pm

I used #define NX 65536 btw

bjtuffh · May 17, 2013, 9:31am

I have taken your advice: I ran my cuFFT code and compared it with FFTW, and the outcome is the same as before, that is, cuFFT's result is wrong. Here is my code; please tell me why this could happen.
My cuFFT code is below:

#include  <cufft.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <iostream>

#define NX 16384
#define  BATCH 1

using namespace std;

//Complex data type
typedef float2 Complex;
int main()
{
    cufftHandle plan;
	cufftComplex *idata,*odata;
	clock_t start,end;

	//Allocate host memory for the signal
	Complex  *h_signal=(Complex*)malloc(sizeof(Complex)*NX);
	Complex  *h_result=(Complex*)malloc(sizeof(Complex)*NX);
	
	//Initalize host memory for the signal
	for(unsigned int i=0;i<NX;i++)
	{
		h_signal[i].x=(float)i;
		h_signal[i].y=0;
	}
	
	/* Allocate device memory */
	cudaMalloc((void**)&idata,sizeof(cufftComplex)*NX*BATCH);
	cudaMalloc((void**)&odata,sizeof(cufftComplex)*NX*BATCH);

    /* 主机设备数据传输*/	
	cudaMemcpy(idata,h_signal,sizeof(Complex)*NX,cudaMemcpyHostToDevice);
	if(cudaGetLastError()!=cudaSuccess) {
	  fprintf(stderr,"Cuda error: Failed to allocate\n");
	  return;
	}

	start=clock();
	/* Create a 1D FFT plan. */	
	if(cufftPlan1d(&plan,NX,CUFFT_C2C,BATCH)!=CUFFT_SUCCESS) {
		fprintf(stderr,"CUFFT error: Plan creation failed");
		return;	
	}
	/* Use the CUFFT plan to transform the signal in place .*/
	if(cufftExecC2C(plan,idata,odata,CUFFT_FORWARD)!=CUFFT_SUCCESS) {
	    fprintf(stderr,"CUFFT error:ExecC2C Forward failed");
		return;
	}
	end=clock();

	/* Inverse transform the signal in place.
	if (cufftExecC2C(plan,idata,odata,CUFFT_INVERSE)!=CUFFT_SUCCESS)
	{ 
		fprintf(stderr,"CUFFT error:ExecC2C Inverse failed");
		return;
	} */

	if (cudaThreadSynchronize()!=cudaSuccess)
	{
		fprintf(stderr,"Cuda error: Failed to synchronize\n");
		return;
	}

	cudaMemcpy(h_result,odata,sizeof(Complex)*NX,cudaMemcpyDeviceToHost);

	/* Show the result.*/
	printf("The   result   are   as   follows\n");   
	for(int i=0;i<NX;i++)   
	{   
		printf("%.4f",h_result[i].x);   
		if(h_result[i].y>=0.0001)   
			printf("+%.4fj\n",h_result[i].y);   
		else   if(fabs(h_result[i].y)<0.0001)   
			printf("\n");   
		else     
			printf("%.4fj\n",h_result[i].y);   
	}  

	cout<<"GPU use "<<(end-start)<<" ms"<<endl;
	/* Destroy the CUFFT plan. */[url]
	cufftDestroy(plan);
	cudaFree(idata);
	cudaFree(odata);

    return 0;
}

The result is wrong. I then used FFTW to compute the same data, but the result is not the same.
My FFTW code is below:

#include <complex>
#include <fftw3.h>
#include <math.h>
#include <iostream>
#include <sys/time.h>

// Returns the current wall-clock time from gettimeofday() in milliseconds.
double get_mseconds() {
   struct timeval now;
   gettimeofday(&now,NULL);
   const double seconds = now.tv_sec+((1e-6)*now.tv_usec);
   return (double)1000*seconds;
}

#define N 16384
using namespace std;
/*
 * Double-precision reference: forward FFT of the real ramp 0,1,...,N-1 with
 * FFTW. Prints every output bin as "re,imi" and the time spent in planning
 * plus execution.
 *
 * Returns 0 on success, 1 when the buffers or the plan cannot be created.
 */
int main(int argc, char * argv[]){
    int i;
    double start,end;
    fftw_complex *in,*out;
    fftw_plan p;

    (void)argc;
    (void)argv;

    in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
    out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);

    if((in==NULL)||(out==NULL))
    {
        /* Stop here: continuing would hand null buffers to the planner. */
        printf("Error:insufficient available memory\n");
        if(in!=NULL) fftw_free(in);
        if(out!=NULL) fftw_free(out);
        return 1;
    }

    for(i=0; i<N; i++) /* test data */
    {
        in[i][0] = i;
        in[i][1] = 0;
    }

    start=get_mseconds();
    /* The input is filled before planning; this relies on FFTW_ESTIMATE
       leaving the arrays untouched (see the FFTW planner-flags docs). */
    p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD,FFTW_ESTIMATE);
    if(p==NULL)
    {
        printf("Error:plan creation failed\n");
        fftw_free(in);
        fftw_free(out);
        return 1;
    }
    fftw_execute(p); /* repeat as needed */
    end=get_mseconds();

    fftw_destroy_plan(p);

    /*for(i=0;i<N;i++)
    {
    printf("%f,%fi\n",in[i][0],in[i][1]);
    }*/
    printf("\n");
    for(i=0;i<N;i++)
    {
        printf("%f,%fi\n",out[i][0],out[i][1]);
    }

    cout<<"it use "<<(end-start)<<"ms"<<endl;

    fftw_free(in);
    fftw_free(out);
    fftw_cleanup();
    return 0;
}

I compute the same data, but the results are different. I have researched this for a long time, but I cannot find out why. Please help me. Thanks.

aimjwizards · May 17, 2013, 11:15am

I don’t have time to run and verify this today but what’s your error function to determine it’s wrong?

use this function instead:

//
// The relative forward error is bound by ~logN, see Th. 24.2 in:
//
// Higham, N. J. 2002. Accuracy and Stability of Numerical Algorithms, SIAM.
// (Available online at Accuracy and Stability of Numerical Algorithms: Second Edition - Nicholas J. Higham - Google Books)
//

// Single-precision machine epsilon (2^-23): the spacing of floats just above 1.0.
const float ulp = 1.192092896e-07f;

//
// Compares `batch` consecutive transforms of `n` points each.
//
// For every transform it forms the relative 2-norm error
//     sqrt( sum |reference[j] - result[j]|^2 / sum |reference[j]|^2 )
// and returns the largest one, divided by `ulp` (i.e. in units of float
// epsilon). Returns -1 when the difference is NaN. A transform whose
// reference and result are both all zero counts as an exact match.
//
// The sums are accumulated in double: adding up to millions of float terms
// in a float accumulator would itself introduce error of the size being
// measured. Components are subtracted explicitly, so no operator- or norm2()
// for float2 needs to be defined.
//
float relative_error( float2 *reference, float2 *result, int n, int batch )
{
    double worst = 0;
    for( int i = 0; i < batch; i++ )
    {
        double diff = 0, norm = 0;
        for( int j = 0; j < n; j++ )
        {
            const double dx = (double)reference[j].x - (double)result[j].x;
            const double dy = (double)reference[j].y - (double)result[j].y;
            const double rx = reference[j].x;
            const double ry = reference[j].y;
            diff += dx * dx + dy * dy;
            norm += rx * rx + ry * ry;
        }
        if( diff != diff ) // true only for NaN; portable replacement for _isnan
            return -1;

        if( diff > 0 || norm > 0 ) // skip 0/0, which would poison the maximum
        {
            const double ratio = diff / norm; // +inf when only the reference is all zero
            if( ratio > worst )
                worst = ratio;
        }

        // advance to the next transform in the batch
        reference += n;
        result += n;
    }
    return (float)( sqrt( worst ) / ulp );
}

This comparison function is what I borrowed from Vasily Volkov’s website.

aimjwizards · May 17, 2013, 11:21am

Because of the way you generate the input data (a ramp from 0 to N−1), the larger the size, the larger the output values become, so you can’t compare the absolute difference against a fixed small threshold.

I don’t see CUFFT went wrong in my test.

bjtuffh · May 20, 2013, 7:54am

Sorry, I generate a fixed input data set and run the FFT with cuFFT; in the same way, I generate the same data and run the FFT with FFTW. The results should be the same, shouldn't they? When I compare the results, they are different, so there must be something wrong in my code. Where? You say you do not see cuFFT going wrong in your test; could you give me your test code so that I can test it on my platform? Thank you very much.

bjtuffh · May 20, 2013, 12:15pm

How can I know whether my computed results are correct? Can you suggest some ways? Thanks.

aimjwizards · May 22, 2013, 12:35am

------------using FFTW-> GET–>fftw_output
----------//----------------------------\
Input data -----------------------------==> compare the difference of the two outputs using the function I posted.
----------\----------------------------//
-------------using cufft-> GET–>cufft_output

You should need do a type casting for the data type.
FFTW uses fftw_complex for double precision, and fftwf_complex for single precision numbers.
CUFFT has their type names.

The error function I posted above is really straightforward.

bjtuffh · May 22, 2013, 1:41am

Thanks for your reply. I have seen reports of cuFFT result errors in this forum many times, so many people have run into this question; for example “Erroneous output data with CUFFT: Is it a bug? Can somebody explain this?”, “cufft strange result!”, etc. Now I have two questions. First, your function works with float values, but the FFT result is complex. Second, what is norm2? I do not know this function; please tell me. Thank you very much.

aimjwizards · May 22, 2013, 2:39am

I apologize for the norm2 definition.

// Squared magnitude |a|^2 of a complex value stored as (x = real, y = imaginary).
float norm2( float2 a ) { return a.x*a.x + a.y*a.y; }

bjtuffh · May 22, 2013, 7:55am

I used your method to compute the error: when NX=1024, relative_error=0.4; when NX=4096, relative_error=0.68; when NX=32768, relative_error=0.77; when NX=1048576, relative_error=1.02828. Does this prove that cuFFT is in error?

bjtuffh · May 22, 2013, 8:25am

What does the value of relative_error mean? Can we consider cuFFT to be correct?

aimjwizards · May 22, 2013, 11:41am

The value returned is the relative error measured in ULPs (units in the last place):

sqrt( error ) / ulp;

why not take a look at the output data and tell me if you think CUFFT is correct?

bjtuffh · May 23, 2013, 7:27am

Thank you very much. I will take a deeper look at cuFFT.

Topic		Replies	Views
cuFFT and fftw CUDA Programming and Performance	10	4134	August 25, 2010
[SOLVED] cuFFT data storage, maybe I'm operating on the wrong elements CUDA Programming and Performance	16	2095	June 11, 2022
Questions about cuFFT for 3D matrix, arrayFire GPU-Accelerated Libraries	5	1644	October 12, 2021
cufft question CUDA Programming and Performance	6	8652	March 9, 2009
2D CUFFT wrong result GPU-Accelerated Libraries cufft	8	3060	November 7, 2023
FFTW output Vs CUDAFFT output Different outputs CUDA Programming and Performance	2	11325	May 6, 2008
CUDA FFT different from Matlab FFT CUDA Programming and Performance	32	9301	March 29, 2011
Apparently bug in CUFFT of CUDA 7.5 with (deprecated) NATIVE Compatibility CUDA Programming and Performance	3	1752	December 21, 2015
CUFFT appears to give errors for vectors > 1024 CUDA Programming and Performance	6	8764	April 12, 2007
Inconsistent Cuda Errors CUDA Programming and Performance	10	1052	July 7, 2017

CUFFT run wrong

Related topics