my correct code which is serial processing
//a.cpp : Defines the entry point for the console application.
//
/时间抽选基2FFT及IFFT算法C语言实现/
/Author :Junyi Sun/
/Copyright 2004-2005/
/Mail:ccnusjy@yahoo.com.cn/
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include
using namespace std;
#define N 65536
/定义复数类型/
typedef struct
{
double real;
double img;
}complex;
void fft(); /快速傅里叶变换/
void ifft();
void initW(); /初始化变换核/
void change(); /变址/
void add(complex ,complex ,complex *); /复数加法/
void mul(complex ,complex ,complex *); /复数乘法/
void sub(complex ,complex ,complex *); /复数减法/
void divi(complex ,complex ,complex *); /复数除法/
void output(); /输出结果/
complex x[N], *W; /输入序列,变换核/
int size_x=0; /输入序列的大小,在本程序中仅限2的次幂/
double PI; /圆周率/
int main()
{
int i,method;
PI=atan(1.0)*4;
printf(“Please input the size of x:\n”);
scanf(“%d”,&size_x);
printf(“Please input the data in x[N]:\n”);
/*for(i=0;i<size_x;i++)
scanf(“%lf%lf”,&x[i].real,&x[i].img); */
for(i=0;i<size_x;i++) {
x[i].real=i;
x[i].img=0;
}
initW();
printf(“Use FFT(0) or IFFT(1)?\n”);
clock_t start,end;
scanf(“%d”,&method);
if(method==0) {
start=clock();
fft();
end=clock();
}
else
ifft();
output();
cout<<"GPU use "<<(end-start)<<" ms"<<endl;
return 0;
}
/快速傅里叶变换/
void fft()
{
int i=0,j=0,k=0,l=0;
complex up,down,product;
change();
for(i=0;i< (int) (log((float)size_x)/log((float)2)) ;i++)/一级蝶形运算/
{
l=1<<i; //l=2^i i从0开始,l为一个蝶型元算输入数据直接的距离,2^l为蝶距
for(j=0;j<size_x;j+= 2*l )/一组蝶形运算/
{
for(k=0;k<l;k++)/一个蝶形运算/
{
mul(x[j+k+l],W[(size_x/2/l)k],&product); //计算一次复数乘product=WNkX2(k)
add(x[j+k],product,&up); //计算一次复数加up=X1(k)+ WNk*X2(k)
sub(x[j+k],product,&down); //计算一次复数减down=X1(k)-WNk*X2(k)
x[j+k]=up; //up=X[k]
x[j+k+l]=down; //down=X[k+N/2]
}
}
}
}
/快速傅里叶逆变换/
void ifft()
{
int i=0,j=0,k=0,l=size_x;
complex up,down;
for(i=0;i< (int)( log((float)size_x)/log((float)2) );i++) /一级蝶形运算/
{
l/=2;
for(j=0;j<size_x;j+= 2*l ) /一组蝶形运算/
{
for(k=0;k<l;k++) /一个蝶形运算/
{
add(x[j+k],x[j+k+l],&up);
up.real/=2;up.img/=2;
sub(x[j+k],x[j+k+l],&down);
down.real/=2;down.img/=2;
divi(down,W,&down);
x[j+k]=up;
x[j+k+l]=down;
}
}
}
change();
}
/初始化变换核/
void initW()
{
int i;
W=(complex )malloc(sizeof(complex) * size_x);
for(i=0;i<size_x;i++)
{
W[i].real=cos(2PI/size_xi);
W[i].img=-1sin(2PI/size_xi);
}
}
/变址计算,将x(n)码位倒置/
void change()
{
complex temp;
unsigned long i=0,j=0,k=0;
double t;
for(i=0;i<size_x;i++)
{
k=i;j=0;
t=(log((float)size_x)/log((float)2));
while( (t–)>0 )
{
j=j<<1;
j|=(k & 1);
k=k>>1;
}
if(j>i)
{
temp=x[i];
x[i]=x[j];
x[j]=temp;
}
}
}
/输出傅里叶变换的结果/
void output()
{
int i;
printf(“The result are as follows\n”);
for(i=0;i<size_x;i++)
{
printf(“%.4f”,x[i].real);
if(x[i].img>=0.0001)
printf(“+%.4fj\n”,x[i].img);
else if(fabs(x[i].img)<0.0001)
printf(“\n”);
else
printf(“%.4fj\n”,x[i].img);
}
}
void add(complex a,complex b,complex *c)
{
c->real=a.real+b.real;
c->img=a.img+b.img;
}
void mul(complex a,complex b,complex c)
{
c->real=a.realb.real - a.imgb.img;
c->img=a.realb.img + a.imgb.real;
}
void sub(complex a,complex b,complex c)
{
c->real=a.real-b.real;
c->img=a.img-b.img;
}
void divi(complex a,complex b,complex c)
{
c->real=( a.realb.real+a.imgb.img )/( b.realb.real+b.imgb.img);
c->img=( a.imgb.real-a.realb.img)/(b.realb.real+b.img*b.img);
}
my cufft is below
//#include “cuda_runtime.h”
//#include “device_launch_parameters.h”
#include <cufft.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include
#define NX 512
#define BATCH 1
using namespace std;
//Complex data type
typedef float2 Complex;
int main()
{
cufftHandle plan;
cufftComplex *idata,*odata;
clock_t start,end;
//Allocate host memory for the signal
Complex *h_signal=(Complex*)malloc(sizeof(Complex)*NX);
Complex *h_result=(Complex*)malloc(sizeof(Complex)*NX);
//Initalize host memory for the signal
for(unsigned int i=0;i<NX;i++)
{
h_signal[i].x=(float)i;
h_signal[i].y=0;
}
/* Allocate device memory */
cudaMalloc((void**)&idata,sizeof(cufftComplex)*NX*BATCH);
cudaMalloc((void**)&odata,sizeof(cufftComplex)*NX*BATCH);
/* 主机设备数据传输*/
cudaMemcpy(idata,h_signal,sizeof(Complex)*NX,cudaMemcpyHostToDevice);
if(cudaGetLastError()!=cudaSuccess) {
fprintf(stderr,"Cuda error: Failed to allocate\n");
return;
}
start=clock();
/* Create a 1D FFT plan. */
if(cufftPlan1d(&plan,NX,CUFFT_C2C,BATCH)!=CUFFT_SUCCESS) {
fprintf(stderr,"CUFFT error: Plan creation failed");
return;
}
/* Use the CUFFT plan to transform the signal in place .*/
if(cufftExecC2C(plan,idata,odata,CUFFT_FORWARD)!=CUFFT_SUCCESS) {
fprintf(stderr,"CUFFT error:ExecC2C Forward failed");
return;
}
end=clock();
/* Inverse transform the signal in place.
if (cufftExecC2C(plan,idata,odata,CUFFT_INVERSE)!=CUFFT_SUCCESS)
{
fprintf(stderr,"CUFFT error:ExecC2C Inverse failed");
return;
} */
if (cudaThreadSynchronize()!=cudaSuccess)
{
fprintf(stderr,"Cuda error: Failed to synchronize\n");
return;
}
cudaMemcpy(h_result,odata,sizeof(Complex)*NX,cudaMemcpyDeviceToHost);
/* Show the result.*/
printf("The result are as follows\n");
for(int i=0;i<NX;i++)
{
printf("%.4f",h_result[i].x);
if(h_result[i].y>=0.0001)
printf("+%.4fj\n",h_result[i].y);
else if(fabs(h_result[i].y)<0.0001)
printf("\n");
else
printf("%.4fj\n",h_result[i].y);
}
cout<<"GPU use "<<(end-start)<<" ms"<<endl;
/* Destroy the CUFFT plan. */
cufftDestroy(plan);
cudaFree(idata);
cudaFree(odata);
return 0;
}
when my NX which is the numbers of FFT points is bigger(eg, NX=65536),the result is wrong,i try my best to find where is wrong,but i cannnot ,thanks for your reply.