So the MATLAB code here

```
t=[0:1/1000:1-1/1000];
j=sqrt(-1);
s=[10*exp(j*2*pi*100*t)];
A=exp(-j*2*pi*7.5*[0:63].'*sin(77*pi/180)/15);
data=A*s;
data=data';
d0=data(:,1)';
d1=fft(d0);
```

and here is the first 20 of d1 on my pc

```
1.02318153949454e-11 - 3.89189389312207e-12i
3.96304793962431e-12 - 9.59627675712573e-12i
-1.22490611118041e-12 - 1.87442732754772e-12i
-2.62883065705508e-12 - 4.86408274878475e-12i
6.31721550091214e-12 + 3.25249180137728e-13i
-3.37521420326130e-12 - 7.87751355432440e-12i
4.41500716384229e-12 - 3.59597047697461e-12i
-7.07180311825956e-12 - 4.74217362079010e-13i
-5.62876972134328e-12 - 6.39511318817060e-12i
-5.38097000286798e-13 - 1.98707845052568e-12i
6.10931383113531e-13 + 2.44965382298964e-12i
-6.22210171250057e-12 + 5.28767466113619e-13i
-3.84916261802694e-12 + 3.11419120171075e-12i
-2.99812100496280e-12 + 1.23654974114445e-12i
5.19959329736483e-12 + 4.62825855394361e-12i
1.24055492666447e-12 + 6.60383790841987e-13i
3.42124542782838e-12 + 6.59616998606136e-13i
1.87649818762980e-12 - 2.23924607892215e-12i
2.60970084311677e-12 - 2.68843698366259e-13i
-5.41275677319708e-14 - 4.06094516348544e-12i
```

and here is my cuda code

```
#include <stdio.h>
#include <math.h>
#include <cufft.h>
#include <cuda_runtime.h>
#define PI 3.1415926
__managed__ cuDoubleComplex j;
__global__ void sGen(cuDoubleComplex* s, cuDoubleComplex* A, int M, int N);
__global__ void matMul(cuDoubleComplex* s, cuDoubleComplex* A, cuDoubleComplex* data, int N);
int main()
{
j.x=0.0;
j.y=1.0;
cuDoubleComplex* s;
cudaMallocManaged((void **)&s, 1000 * sizeof(cuDoubleComplex));
cuDoubleComplex* A;
cudaMallocManaged((void **)&A, 64 * sizeof(cuDoubleComplex));
cuDoubleComplex* data;
cudaMallocManaged((void **)&data, 64000 * sizeof(cuDoubleComplex));
sGen<<<1, 1000>>>(s, A, 64, 1000);
cudaDeviceSynchronize();
matMul<<<64, 1000>>>(s, A, data, 1000);
cudaDeviceSynchronize();
for(int n = 0 ; n < 20; n++)
{
printf("data[%d]: %2.7e\n", n, data[n].x);
printf("data[%d]: %2.7e\n", n, data[n].y);
}
cufftHandle plan;
int rank = 1;
int n[1] ={1000};
int istride = 1;
int idist = 1000;
int ostride =1;
int odist = 1000;
int inembed[2] = {1000, 64};
int onembed[2] = {1000, 64};
cufftPlanMany(&plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_Z2Z, 64);
cufftExecZ2Z(plan, data, data, CUFFT_FORWARD);
cudaDeviceSynchronize();
for(int n = 0 ; n < 20; n++)
{
printf("data[%d]: %2.7e\n", n, data[n].x);
printf("data[%d]: %2.7e\n", n, data[n].y);
}
return 0;
}
__global__ void sGen(cuDoubleComplex* s, cuDoubleComplex* A, int M, int N)
{
int tid = threadIdx.x + blockDim.x * blockIdx.x;
if(tid < N)
{
double t = 1.0 / 1000 * tid;
double a1 = j.x * 2 * PI * 100 * t;
double b1 = j.y * 2 * PI * 100 * t;
double sin1, cos1;
double e1 = exp(a1);
sincos(b1, &sin1, &cos1);
s[tid].x = 10 * cos1 * e1;
s[tid].y = 10 * sin1 * e1;
}
if(tid < M)
{
double a1 = -j.x * 2 * PI * 7.5 * tid * sin(77.0 * PI / 180) / 15.0;
double b1 = -j.y * 2 * PI * 7.5 * tid * sin(77.0 * PI / 180) / 15.0;
double sin1, cos1;
double e1 = exp(a1);
sincos(b1, &sin1, &cos1);
A[tid].x = cos1 * e1;
A[tid].y = sin1 * e1;
}
}
__global__ void matMul(cuDoubleComplex* s, cuDoubleComplex* A, cuDoubleComplex* data, int N)
{
int tid = threadIdx.x + blockDim.x * blockIdx.x;
int row = tid / N;
int col = tid % N;
data[tid].x = A[row].x * s[col].x - A[row].y * s[col].y;
data[tid].y = A[row].x * s[col].y + A[row].y * s[col].x;
}
```

and the part of the output of data is following,

```
data[0]: -1.6493214e-04
data[0]: 5.3590679e-05
data[1]: -1.6671242e-04
data[1]: 5.3590680e-05
data[2]: -1.6852784e-04
data[2]: 5.3590693e-05
data[3]: -1.7037949e-04
data[3]: 5.3590703e-05
data[4]: -1.7226850e-04
data[4]: 5.3590716e-05
data[5]: -1.7419606e-04
data[5]: 5.3590725e-05
```

or you can compile this code with

```
nvcc -o obj obj.cu -lcufft
```

I have verified the input data of fft already. Itâ€™s as same as MATLAB

Itâ€™s so strange that the result of cuda has so huge deviation against MATLAB code.

Is my cuda code wrong or the MATLAB bug?

Somebody help me please.