So the MATLAB code here
t=[0:1/1000:1-1/1000];
j=sqrt(-1);
s=[10*exp(j*2*pi*100*t)];
A=exp(-j*2*pi*7.5*[0:63].'*sin(77*pi/180)/15);
data=A*s;
data=data';
d0=data(:,1)';
d1=fft(d0);
and here is the first 20 of d1 on my pc
1.02318153949454e-11 - 3.89189389312207e-12i
3.96304793962431e-12 - 9.59627675712573e-12i
-1.22490611118041e-12 - 1.87442732754772e-12i
-2.62883065705508e-12 - 4.86408274878475e-12i
6.31721550091214e-12 + 3.25249180137728e-13i
-3.37521420326130e-12 - 7.87751355432440e-12i
4.41500716384229e-12 - 3.59597047697461e-12i
-7.07180311825956e-12 - 4.74217362079010e-13i
-5.62876972134328e-12 - 6.39511318817060e-12i
-5.38097000286798e-13 - 1.98707845052568e-12i
6.10931383113531e-13 + 2.44965382298964e-12i
-6.22210171250057e-12 + 5.28767466113619e-13i
-3.84916261802694e-12 + 3.11419120171075e-12i
-2.99812100496280e-12 + 1.23654974114445e-12i
5.19959329736483e-12 + 4.62825855394361e-12i
1.24055492666447e-12 + 6.60383790841987e-13i
3.42124542782838e-12 + 6.59616998606136e-13i
1.87649818762980e-12 - 2.23924607892215e-12i
2.60970084311677e-12 - 2.68843698366259e-13i
-5.41275677319708e-14 - 4.06094516348544e-12i
and here is my cuda code
#include <stdio.h>
#include <math.h>
#include <cufft.h>
#include <cuda_runtime.h>
#define PI 3.1415926
__managed__ cuDoubleComplex j;
__global__ void sGen(cuDoubleComplex* s, cuDoubleComplex* A, int M, int N);
__global__ void matMul(cuDoubleComplex* s, cuDoubleComplex* A, cuDoubleComplex* data, int N);
int main()
{
j.x=0.0;
j.y=1.0;
cuDoubleComplex* s;
cudaMallocManaged((void **)&s, 1000 * sizeof(cuDoubleComplex));
cuDoubleComplex* A;
cudaMallocManaged((void **)&A, 64 * sizeof(cuDoubleComplex));
cuDoubleComplex* data;
cudaMallocManaged((void **)&data, 64000 * sizeof(cuDoubleComplex));
sGen<<<1, 1000>>>(s, A, 64, 1000);
cudaDeviceSynchronize();
matMul<<<64, 1000>>>(s, A, data, 1000);
cudaDeviceSynchronize();
for(int n = 0 ; n < 20; n++)
{
printf("data[%d]: %2.7e\n", n, data[n].x);
printf("data[%d]: %2.7e\n", n, data[n].y);
}
cufftHandle plan;
int rank = 1;
int n[1] ={1000};
int istride = 1;
int idist = 1000;
int ostride =1;
int odist = 1000;
int inembed[2] = {1000, 64};
int onembed[2] = {1000, 64};
cufftPlanMany(&plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_Z2Z, 64);
cufftExecZ2Z(plan, data, data, CUFFT_FORWARD);
cudaDeviceSynchronize();
for(int n = 0 ; n < 20; n++)
{
printf("data[%d]: %2.7e\n", n, data[n].x);
printf("data[%d]: %2.7e\n", n, data[n].y);
}
return 0;
}
__global__ void sGen(cuDoubleComplex* s, cuDoubleComplex* A, int M, int N)
{
int tid = threadIdx.x + blockDim.x * blockIdx.x;
if(tid < N)
{
double t = 1.0 / 1000 * tid;
double a1 = j.x * 2 * PI * 100 * t;
double b1 = j.y * 2 * PI * 100 * t;
double sin1, cos1;
double e1 = exp(a1);
sincos(b1, &sin1, &cos1);
s[tid].x = 10 * cos1 * e1;
s[tid].y = 10 * sin1 * e1;
}
if(tid < M)
{
double a1 = -j.x * 2 * PI * 7.5 * tid * sin(77.0 * PI / 180) / 15.0;
double b1 = -j.y * 2 * PI * 7.5 * tid * sin(77.0 * PI / 180) / 15.0;
double sin1, cos1;
double e1 = exp(a1);
sincos(b1, &sin1, &cos1);
A[tid].x = cos1 * e1;
A[tid].y = sin1 * e1;
}
}
__global__ void matMul(cuDoubleComplex* s, cuDoubleComplex* A, cuDoubleComplex* data, int N)
{
int tid = threadIdx.x + blockDim.x * blockIdx.x;
int row = tid / N;
int col = tid % N;
data[tid].x = A[row].x * s[col].x - A[row].y * s[col].y;
data[tid].y = A[row].x * s[col].y + A[row].y * s[col].x;
}
and the part of the output of data is following,
data[0]: -1.6493214e-04
data[0]: 5.3590679e-05
data[1]: -1.6671242e-04
data[1]: 5.3590680e-05
data[2]: -1.6852784e-04
data[2]: 5.3590693e-05
data[3]: -1.7037949e-04
data[3]: 5.3590703e-05
data[4]: -1.7226850e-04
data[4]: 5.3590716e-05
data[5]: -1.7419606e-04
data[5]: 5.3590725e-05
or you can compile this code with
nvcc -o obj obj.cu -lcufft
I have verified the input data of fft already. It’s as same as MATLAB
It’s so strange that the result of cuda has so huge deviation against MATLAB code.
Is my cuda code wrong or the MATLAB bug?
Somebody help me please.