# Derivatives with CUFFT

Hello y’all,

I tried to find a solution to my problem, but wasn’t able to find one. I want to create a CUDA-kernel to compute the first derivative of a 2D function with the help of FFTs and especially CUFFT. The idea is to translate the following Matlab Code to CUDA.

% f is N x N
% 2D FFT
f_t = fft2(f);

% array of wave numbers
k = [0:N/2-1,-N/2:-1]';
[kx,ky] = meshgrid(k);

% define imaginary number
i_num = sqrt(-1);

% multiply coefficients with i*k to get the derivative
f_t_x = i_num.*kx.*f_t;
f_t_y = i_num.*ky.*f_t;

% reconstruct from Fourier to real space
df_x = real(ifft2(f_t_x));
df_y = real(ifft2(f_t_y));

I understand how to perform the Fourier transformation and how to convert the values from complex to real data. My problem right now is, that I don’t know how to perform the operation:

i*k;

Or how to define the imaginary number, so I can multiply the wave numbers.
I only found examples for the second derivative, but this doesn’t help me, because in that case one multiplies by -k^2 and not i*k.
Hopefully somebody encountered this problem before and is able to help me.

Best regards,

Florian

My CUDA code so far:

#define _USE_MATH_DEFINES

#include <cmath>

#include <iostream>

#include <fstream>

#include <math.h>

#include <complex>

#include <cufft.h>

#pragma comment ( lib, "cufft.lib" )

#define BSZ 16

__global__ void derivative(cufftComplex *ft, cufftComplex *ft_k, float *k, int N)

{

int i = threadIdx.x + blockIdx.x*BSZ;

int j = threadIdx.y + blockIdx.y*BSZ;

int index = j*N+i;

if (i<N && j<N)

{

/* Not exactly sure how to multiply the values with i

float k2 = k[i]+k[j];

ft_k[index].x = ft[index].x*k2*imag_number;

ft_k[index].y = ft[index].y*k2*imag_number;

*/

}

}

__global__ void real2complex(float *f, cufftComplex *fc, int N)

{

int i = threadIdx.x + blockIdx.x*blockDim.x;

int j = threadIdx.y + blockIdx.y*blockDim.y;

int index = j*N+i;

if (i<N && j<N)

{

fc[index].x = f[index];

fc[index].y = 0.0f;

}

}

__global__ void complex2real(cufftComplex *fc, float *f, int N)
{
int i = threadIdx.x + blockIdx.x*BSZ;

int j = threadIdx.y + blockIdx.y*BSZ;

int index = j*N+i;

if (i<N && j<N)

{

f[index] = fc[index].x/((float)N*(float)N);

// divide by number of elements to recover value

}

}

int main()

{

// init variables

int N = 128;

float xmax = (float) 2*M_PI, xmin = 0.0f, ymin = 0.0f,
h = (xmax-xmin)/((float)N);

float *x,*y, *u, *f, *u_a, *k;

// allocate arrays on the host

x = (float*) malloc(sizeof(float)*N*N);

y = (float*) malloc(sizeof(float)*N*N);

u = (float*) malloc(sizeof(float)*N*N);

f = (float*) malloc(sizeof(float)*N*N);

u_a = (float*) malloc(sizeof(float)*N*N);

k = (float*) malloc(sizeof(float)*N);

// init arrays

for (int j=0; j<N; j++)

{

for (int i=0; i<N; i++)

{

x[N*j+i] = xmin + i*h;

y[N*j+i] = ymin + j*h;

f[N*j+i] = sin(x[N*j+1]) + sin(y[N*j+i]);	     // rhs

u_a[N*j+i] = cos(x[N*j+i]) + cos(y[N*j+1]);     // analytical solution

}

}

for (int i=0; i<=N/2; i++)

{

k[i] = i * 2*M_PI;	                                     // wavenumbers

}

for (int i=N/2+1; i<N; i++)

{

k[i] = (i - N) * 2*M_PI;

}

// Allocate arrays on the device

float *k_d, *f_d, *u_d;

cudaMalloc ((void**)&k_d, sizeof(float)*N);

cudaMalloc ((void**)&f_d, sizeof(float)*N*N);

cudaMalloc ((void**)&u_d, sizeof(float)*N*N);

cufftComplex *ft_d, *f_d_complex, *ft_d_result, *u_d_complex;

cudaMalloc ((void**)&ft_d, sizeof(cufftComplex)*N*N);

cudaMalloc ((void**)&ft_d_result, sizeof(cufftComplex)*N*N);

cudaMalloc ((void**)&f_d_complex, sizeof(cufftComplex)*N*N);

cudaMalloc ((void**)&u_d_complex, sizeof(cufftComplex)*N*N);

// transfer date from host to device

cudaMemcpy(k_d, k, sizeof(float)*N, cudaMemcpyHostToDevice);

cudaMemcpy(f_d, f, sizeof(float)*N*N, cudaMemcpyHostToDevice);

// define grid and blocksize

dim3 dimBlock (BSZ, BSZ);

dim3 dimGrid (int((N-0.5)/BSZ) + 1, int((N-0.5)/BSZ) + 1);

// create plan

cufftHandle plan;

cufftPlan2d(&plan, N, N, CUFFT_C2C);

// transform real to complex

real2complex<<<dimGrid, dimBlock>>>(f_d, f_d_complex, N);

// perform forward FFT

cufftExecC2C(plan, f_d_complex, ft_d, CUFFT_FORWARD);

// derive in Fourier space

derivative<<<dimGrid, dimBlock>>>(ft_d, ft_d_result, k_d, N);

// perform inverse FFT

cufftExecC2C(plan, ft_d_result, u_d_complex, CUFFT_INVERSE);

// transform complex to real
complex2real<<<dimGrid, dimBlock>>>(u_d_complex, u_d, N);

// copy result back to host

cudaMemcpy(u, u_d, sizeof(float)*N*N, cudaMemcpyDeviceToHost);

float constant = u[0];

for (int i=0; i<N*N; i++)

{

u[i] -= constant;
//substract u[0] to force the arbitrary constant to be 0

}

// Look at errors

float L2_error, L2_1 = 0.0f, L2_2 = 0.0f;

for (int j=1; j<N-1; j++)

{
for (int i=1; i<N-1; i++)

{
L2_1 += (u[(j-1)*(N-2)+(i-1)] - u_a[(j-1)*(N-2)+(i-1)])*(u[(j-1)*(N-2)+(i-1)] - u_a[(j-1)*(N-2)+(i-1)]);

L2_2 += u_a[(j-1)*(N-2)+(i-1)]*u_a[(j-1)*(N-2)+(i-1)];

}

}

L2_error = sqrt(L2_1/L2_2);

std::cout<<L2_error<<std::endl;

// cleanup

cufftDestroy(plan);

cudaFree(k_d);

cudaFree(f_d);

cudaFree(u_d);

free(x);
free(y);
free(f);
free(u);
free(u_a);
free(k);

return 0;

}

In header file cufft.h we have:

typedef cuComplex cufftComplex;
typedef cuDoubleComplex cufftDoubleComplex;

cuComplex and cyDoubleComplex are types exported by header file cuComplex.h. The header file contains access functions and basic arithmetic, which is all that you would need. It also has a function for creating a complex number from its real and imaginary components:

typedef cuFloatComplex cuComplex;
__host__ __device__ static __inline__ cuComplex make_cuComplex (float x,
float y)
{
return make_cuFloatComplex (x, y);
}
__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex
(float r, float i)
{
cuFloatComplex res;
res.x = r;
res.y = i;
return res;
}

So the imaginary constant ‘i’ [sqrt(-1)] itself would be the result of

make_cuComplex (0.0f, 1.0f);

Hello,

What I do is something like this:

cufft real to complex;
kernel in which I apply the derivatives in kspace;
invers cufft complex to real;

In this kernel you can do the multiplication directly by acccesing each component. For example i*C can be done in 2 operations:
newC.x=-C.y
newC.y=C.x,
where .x is the real part and .y the imaginary part.

Hello,