Derivatives with CUFFT

Hello y’all,

I tried to find a solution to my problem, but wasn’t able to find one. I want to create a CUDA-kernel to compute the first derivative of a 2D function with the help of FFTs and especially CUFFT. The idea is to translate the following Matlab Code to CUDA.

% f is N x N
% 2D FFT
f_t = fft2(f);

% array of wave numbers
k = [0:N/2-1,-N/2:-1]';
[kx,ky] = meshgrid(k);

% define imaginary number
i_num = sqrt(-1);

% multiply coefficients with i*k to get the derivative
f_t_x = i_num.*kx.*f_t;
f_t_y = i_num.*ky.*f_t;

% reconstruct from Fourier to real space
df_x = real(ifft2(f_t_x));
df_y = real(ifft2(f_t_y));

I understand how to perform the Fourier transformation and how to convert the values from complex to real data. My problem right now is, that I don’t know how to perform the operation:


Or how to define the imaginary number, so I can multiply the wave numbers.
I only found examples for the second derivative, but this doesn’t help me, because in that case one multiplies by -k^2 and not i*k.
Hopefully somebody encountered this problem before and is able to help me.
Thanks in advance.

Best regards,


My CUDA code so far:


#include <cmath>

#include <iostream>

#include <fstream>

#include <math.h>

#include <complex>

#include <cufft.h>

#pragma comment ( lib, "cufft.lib" )

#define BSZ 16

__global__ void derivative(cufftComplex *ft, cufftComplex *ft_k, float *k, int N)

    int i = threadIdx.x + blockIdx.x*BSZ;
    int j = threadIdx.y + blockIdx.y*BSZ;
    int index = j*N+i;
    if (i<N && j<N)
         /* Not exactly sure how to multiply the values with i
         float k2 = k[i]+k[j];
         ft_k[index].x = ft[index].x*k2*imag_number;
         ft_k[index].y = ft[index].y*k2*imag_number;


__global__ void real2complex(float *f, cufftComplex *fc, int N)
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    int j = threadIdx.y + blockIdx.y*blockDim.y;
    int index = j*N+i;
    if (i<N && j<N)
        fc[index].x = f[index];
        fc[index].y = 0.0f;


__global__ void complex2real(cufftComplex *fc, float *f, int N)
    int i = threadIdx.x + blockIdx.x*BSZ;
    int j = threadIdx.y + blockIdx.y*BSZ;
    int index = j*N+i;
    if (i<N && j<N)
        f[index] = fc[index].x/((float)N*(float)N);
        // divide by number of elements to recover value


int main()

    // init variables
    int N = 128;
    float xmax = (float) 2*M_PI, xmin = 0.0f, ymin = 0.0f,
 h = (xmax-xmin)/((float)N);
    float *x,*y, *u, *f, *u_a, *k;

    // allocate arrays on the host
    x = (float*) malloc(sizeof(float)*N*N);
    y = (float*) malloc(sizeof(float)*N*N);
    u = (float*) malloc(sizeof(float)*N*N);
    f = (float*) malloc(sizeof(float)*N*N);
    u_a = (float*) malloc(sizeof(float)*N*N);

    k = (float*) malloc(sizeof(float)*N);

    // init arrays
    for (int j=0; j<N; j++)
        for (int i=0; i<N; i++)
            x[N*j+i] = xmin + i*h;
            y[N*j+i] = ymin + j*h;
            f[N*j+i] = sin(x[N*j+1]) + sin(y[N*j+i]);	     // rhs
            u_a[N*j+i] = cos(x[N*j+i]) + cos(y[N*j+1]);     // analytical solution
    for (int i=0; i<=N/2; i++)
        k[i] = i * 2*M_PI;	                                     // wavenumbers
    for (int i=N/2+1; i<N; i++)
        k[i] = (i - N) * 2*M_PI;

    // Allocate arrays on the device
    float *k_d, *f_d, *u_d;
    cudaMalloc ((void**)&k_d, sizeof(float)*N);
    cudaMalloc ((void**)&f_d, sizeof(float)*N*N);
    cudaMalloc ((void**)&u_d, sizeof(float)*N*N);

    cufftComplex *ft_d, *f_d_complex, *ft_d_result, *u_d_complex;	
    cudaMalloc ((void**)&ft_d, sizeof(cufftComplex)*N*N);
    cudaMalloc ((void**)&ft_d_result, sizeof(cufftComplex)*N*N);
    cudaMalloc ((void**)&f_d_complex, sizeof(cufftComplex)*N*N);
    cudaMalloc ((void**)&u_d_complex, sizeof(cufftComplex)*N*N);

    // transfer date from host to device
    cudaMemcpy(k_d, k, sizeof(float)*N, cudaMemcpyHostToDevice);
    cudaMemcpy(f_d, f, sizeof(float)*N*N, cudaMemcpyHostToDevice);

    // define grid and blocksize
    dim3 dimBlock (BSZ, BSZ);
    dim3 dimGrid (int((N-0.5)/BSZ) + 1, int((N-0.5)/BSZ) + 1);

    // create plan
    cufftHandle plan;
    cufftPlan2d(&plan, N, N, CUFFT_C2C);

    // transform real to complex
    real2complex<<<dimGrid, dimBlock>>>(f_d, f_d_complex, N);

    // perform forward FFT
    cufftExecC2C(plan, f_d_complex, ft_d, CUFFT_FORWARD);

    // derive in Fourier space
    derivative<<<dimGrid, dimBlock>>>(ft_d, ft_d_result, k_d, N);

    // perform inverse FFT
    cufftExecC2C(plan, ft_d_result, u_d_complex, CUFFT_INVERSE);

    // transform complex to real
    complex2real<<<dimGrid, dimBlock>>>(u_d_complex, u_d, N);

    // copy result back to host
    cudaMemcpy(u, u_d, sizeof(float)*N*N, cudaMemcpyDeviceToHost);
    float constant = u[0];
    for (int i=0; i<N*N; i++)
        u[i] -= constant; 
        //substract u[0] to force the arbitrary constant to be 0

    // Look at errors
    float L2_error, L2_1 = 0.0f, L2_2 = 0.0f;
    for (int j=1; j<N-1; j++)
        for (int i=1; i<N-1; i++)
            L2_1 += (u[(j-1)*(N-2)+(i-1)] - u_a[(j-1)*(N-2)+(i-1)])*(u[(j-1)*(N-2)+(i-1)] - u_a[(j-1)*(N-2)+(i-1)]);
            L2_2 += u_a[(j-1)*(N-2)+(i-1)]*u_a[(j-1)*(N-2)+(i-1)];	
    L2_error = sqrt(L2_1/L2_2);

    // cleanup



    return 0;


In header file cufft.h we have:

typedef cuComplex cufftComplex;
typedef cuDoubleComplex cufftDoubleComplex;

cuComplex and cyDoubleComplex are types exported by header file cuComplex.h. The header file contains access functions and basic arithmetic, which is all that you would need. It also has a function for creating a complex number from its real and imaginary components:

typedef cuFloatComplex cuComplex;
__host__ __device__ static __inline__ cuComplex make_cuComplex (float x, 
                                                                float y) 
    return make_cuFloatComplex (x, y); 
__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex 
                                                             (float r, float i)
    cuFloatComplex res;
    res.x = r;
    res.y = i;
    return res;

So the imaginary constant ‘i’ [sqrt(-1)] itself would be the result of

make_cuComplex (0.0f, 1.0f);


What I do is something like this:

cufft real to complex;
kernel in which I apply the derivatives in kspace;
invers cufft complex to real;

In this kernel you can do the multiplication directly by acccesing each component. For example i*C can be done in 2 operations:
where .x is the real part and .y the imaginary part.


thank you for your answers. Pasoleatis solution works fine.

Best regards,


You should not calculate the derivative that way, it will be very sensitive to noise. Use sobel-filters instead, there is a sobelfilter example in the CUDA SDK.

I know, but I’ll use periodic and smooth data or apply a Gaussian to ensure the smoothness.