My CUDA code so far:
#define _USE_MATH_DEFINES
#include <cmath>
#include <iostream>
#include <fstream>
#include <math.h>
#include <complex>
#include <cufft.h>
#pragma comment ( lib, "cufft.lib" )
#define BSZ 16
__global__ void derivative(cufftComplex *ft, cufftComplex *ft_k, float *k, int N)
{
int i = threadIdx.x + blockIdx.x*BSZ;
int j = threadIdx.y + blockIdx.y*BSZ;
int index = j*N+i;
if (i<N && j<N)
{
/* Not exactly sure how to multiply the values with i
float k2 = k[i]+k[j];
ft_k[index].x = ft[index].x*k2*imag_number;
ft_k[index].y = ft[index].y*k2*imag_number;
*/
}
}
__global__ void real2complex(float *f, cufftComplex *fc, int N)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
int j = threadIdx.y + blockIdx.y*blockDim.y;
int index = j*N+i;
if (i<N && j<N)
{
fc[index].x = f[index];
fc[index].y = 0.0f;
}
}
__global__ void complex2real(cufftComplex *fc, float *f, int N)
{
int i = threadIdx.x + blockIdx.x*BSZ;
int j = threadIdx.y + blockIdx.y*BSZ;
int index = j*N+i;
if (i<N && j<N)
{
f[index] = fc[index].x/((float)N*(float)N);
// divide by number of elements to recover value
}
}
int main()
{
// init variables
int N = 128;
float xmax = (float) 2*M_PI, xmin = 0.0f, ymin = 0.0f,
h = (xmax-xmin)/((float)N);
float *x,*y, *u, *f, *u_a, *k;
// allocate arrays on the host
x = (float*) malloc(sizeof(float)*N*N);
y = (float*) malloc(sizeof(float)*N*N);
u = (float*) malloc(sizeof(float)*N*N);
f = (float*) malloc(sizeof(float)*N*N);
u_a = (float*) malloc(sizeof(float)*N*N);
k = (float*) malloc(sizeof(float)*N);
// init arrays
for (int j=0; j<N; j++)
{
for (int i=0; i<N; i++)
{
x[N*j+i] = xmin + i*h;
y[N*j+i] = ymin + j*h;
f[N*j+i] = sin(x[N*j+1]) + sin(y[N*j+i]); // rhs
u_a[N*j+i] = cos(x[N*j+i]) + cos(y[N*j+1]); // analytical solution
}
}
for (int i=0; i<=N/2; i++)
{
k[i] = i * 2*M_PI; // wavenumbers
}
for (int i=N/2+1; i<N; i++)
{
k[i] = (i - N) * 2*M_PI;
}
// Allocate arrays on the device
float *k_d, *f_d, *u_d;
cudaMalloc ((void**)&k_d, sizeof(float)*N);
cudaMalloc ((void**)&f_d, sizeof(float)*N*N);
cudaMalloc ((void**)&u_d, sizeof(float)*N*N);
cufftComplex *ft_d, *f_d_complex, *ft_d_result, *u_d_complex;
cudaMalloc ((void**)&ft_d, sizeof(cufftComplex)*N*N);
cudaMalloc ((void**)&ft_d_result, sizeof(cufftComplex)*N*N);
cudaMalloc ((void**)&f_d_complex, sizeof(cufftComplex)*N*N);
cudaMalloc ((void**)&u_d_complex, sizeof(cufftComplex)*N*N);
// transfer date from host to device
cudaMemcpy(k_d, k, sizeof(float)*N, cudaMemcpyHostToDevice);
cudaMemcpy(f_d, f, sizeof(float)*N*N, cudaMemcpyHostToDevice);
// define grid and blocksize
dim3 dimBlock (BSZ, BSZ);
dim3 dimGrid (int((N-0.5)/BSZ) + 1, int((N-0.5)/BSZ) + 1);
// create plan
cufftHandle plan;
cufftPlan2d(&plan, N, N, CUFFT_C2C);
// transform real to complex
real2complex<<<dimGrid, dimBlock>>>(f_d, f_d_complex, N);
// perform forward FFT
cufftExecC2C(plan, f_d_complex, ft_d, CUFFT_FORWARD);
// derive in Fourier space
derivative<<<dimGrid, dimBlock>>>(ft_d, ft_d_result, k_d, N);
// perform inverse FFT
cufftExecC2C(plan, ft_d_result, u_d_complex, CUFFT_INVERSE);
// transform complex to real
complex2real<<<dimGrid, dimBlock>>>(u_d_complex, u_d, N);
// copy result back to host
cudaMemcpy(u, u_d, sizeof(float)*N*N, cudaMemcpyDeviceToHost);
float constant = u[0];
for (int i=0; i<N*N; i++)
{
u[i] -= constant;
//substract u[0] to force the arbitrary constant to be 0
}
// Look at errors
float L2_error, L2_1 = 0.0f, L2_2 = 0.0f;
for (int j=1; j<N-1; j++)
{
for (int i=1; i<N-1; i++)
{
L2_1 += (u[(j-1)*(N-2)+(i-1)] - u_a[(j-1)*(N-2)+(i-1)])*(u[(j-1)*(N-2)+(i-1)] - u_a[(j-1)*(N-2)+(i-1)]);
L2_2 += u_a[(j-1)*(N-2)+(i-1)]*u_a[(j-1)*(N-2)+(i-1)];
}
}
L2_error = sqrt(L2_1/L2_2);
std::cout<<L2_error<<std::endl;
// cleanup
cufftDestroy(plan);
cudaFree(k_d);
cudaFree(f_d);
cudaFree(u_d);
free(x);
free(y);
free(f);
free(u);
free(u_a);
free(k);
return 0;
}