Derivatives with CUFFT

TrammyKelevra · December 12, 2012, 9:41pm

Hello y’all,

I tried to find a solution to my problem, but wasn’t able to find one. I want to create a CUDA-kernel to compute the first derivative of a 2D function with the help of FFTs and especially CUFFT. The idea is to translate the following Matlab Code to CUDA.

% f is N x N
% 2D FFT
f_t = fft2(f);

% array of wave numbers
k = [0:N/2-1,-N/2:-1]';
[kx,ky] = meshgrid(k);

% define imaginary number
i_num = sqrt(-1);

% multiply coefficients with i*k to get the derivative
f_t_x = i_num.*kx.*f_t;
f_t_y = i_num.*ky.*f_t;

% reconstruct from Fourier to real space
df_x = real(ifft2(f_t_x));
df_y = real(ifft2(f_t_y));

I understand how to perform the Fourier transformation and how to convert the values from complex to real data. My problem right now is, that I don’t know how to perform the operation:

i*k;

Or how to define the imaginary number, so I can multiply the wave numbers.
I only found examples for the second derivative, but this doesn’t help me, because in that case one multiplies by -k^2 and not i*k.
Hopefully somebody encountered this problem before and is able to help me.
Thanks in advance.

Best regards,

Florian

TrammyKelevra · December 12, 2012, 10:04pm

My CUDA code so far:

#define _USE_MATH_DEFINES

#include <cmath>

#include <iostream>

#include <fstream>

#include <math.h>

#include <complex>

#include <cufft.h>

#pragma comment ( lib, "cufft.lib" )

#define BSZ 16

__global__ void derivative(cufftComplex *ft, cufftComplex *ft_k, float *k, int N)

{
	 
    int i = threadIdx.x + blockIdx.x*BSZ;
	
    int j = threadIdx.y + blockIdx.y*BSZ;
	
    int index = j*N+i;
	
    if (i<N && j<N)
	
    {
		
         /* Not exactly sure how to multiply the values with i
		
         float k2 = k[i]+k[j];
		
         ft_k[index].x = ft[index].x*k2*imag_number;
		
         ft_k[index].y = ft[index].y*k2*imag_number;
		
         */
	
     }

}

__global__ void real2complex(float *f, cufftComplex *fc, int N)
	
{
	
    int i = threadIdx.x + blockIdx.x*blockDim.x;
	
    int j = threadIdx.y + blockIdx.y*blockDim.y;
	
    int index = j*N+i;
	
    if (i<N && j<N)
		
    { 
		
        fc[index].x = f[index];
		
        fc[index].y = 0.0f;
	
    }

} 

__global__ void complex2real(cufftComplex *fc, float *f, int N)
{
    int i = threadIdx.x + blockIdx.x*BSZ;
	
    int j = threadIdx.y + blockIdx.y*BSZ;
	
    int index = j*N+i;
	
    if (i<N && j<N)
	
    {
		
        f[index] = fc[index].x/((float)N*(float)N);
		
        // divide by number of elements to recover value
	
    }

}

int main()

{
	
    // init variables
	
    int N = 128;
	
    float xmax = (float) 2*M_PI, xmin = 0.0f, ymin = 0.0f,
 h = (xmax-xmin)/((float)N);
	
    float *x,*y, *u, *f, *u_a, *k;

	
	
    // allocate arrays on the host
	
    x = (float*) malloc(sizeof(float)*N*N);
	
    y = (float*) malloc(sizeof(float)*N*N);
	
    u = (float*) malloc(sizeof(float)*N*N);
	
    f = (float*) malloc(sizeof(float)*N*N);
	
    u_a = (float*) malloc(sizeof(float)*N*N);

    k = (float*) malloc(sizeof(float)*N);
	
	

    // init arrays
	
    for (int j=0; j<N; j++)
	
    {
		
        for (int i=0; i<N; i++)
		
        { 
			
            x[N*j+i] = xmin + i*h;
			
            y[N*j+i] = ymin + j*h;
			
            f[N*j+i] = sin(x[N*j+1]) + sin(y[N*j+i]);	     // rhs
			
            u_a[N*j+i] = cos(x[N*j+i]) + cos(y[N*j+1]);     // analytical solution
		
        }
	
    }
	
	
    for (int i=0; i<=N/2; i++)
	
    { 
		
        k[i] = i * 2*M_PI;	                                     // wavenumbers
	
    }
	
	
    for (int i=N/2+1; i<N; i++)
	
    { 
		
        k[i] = (i - N) * 2*M_PI;
	
    }
	
	

    // Allocate arrays on the device
	
    float *k_d, *f_d, *u_d;
	
    cudaMalloc ((void**)&k_d, sizeof(float)*N);
	
    cudaMalloc ((void**)&f_d, sizeof(float)*N*N);
	
    cudaMalloc ((void**)&u_d, sizeof(float)*N*N);

	
	
    cufftComplex *ft_d, *f_d_complex, *ft_d_result, *u_d_complex;	
	
    cudaMalloc ((void**)&ft_d, sizeof(cufftComplex)*N*N);
	
    cudaMalloc ((void**)&ft_d_result, sizeof(cufftComplex)*N*N);
	
    cudaMalloc ((void**)&f_d_complex, sizeof(cufftComplex)*N*N);
	
    cudaMalloc ((void**)&u_d_complex, sizeof(cufftComplex)*N*N);

	
	
    // transfer date from host to device
	
    cudaMemcpy(k_d, k, sizeof(float)*N, cudaMemcpyHostToDevice);
	
    cudaMemcpy(f_d, f, sizeof(float)*N*N, cudaMemcpyHostToDevice);
	

	
    // define grid and blocksize
	
    dim3 dimBlock (BSZ, BSZ);
	
    dim3 dimGrid (int((N-0.5)/BSZ) + 1, int((N-0.5)/BSZ) + 1);
	
	

    // create plan
	
    cufftHandle plan;
	
    cufftPlan2d(&plan, N, N, CUFFT_C2C);
	
	

    // transform real to complex
	
    real2complex<<<dimGrid, dimBlock>>>(f_d, f_d_complex, N);
	
	

    // perform forward FFT
	
    cufftExecC2C(plan, f_d_complex, ft_d, CUFFT_FORWARD);
	
	

    // derive in Fourier space
	
    derivative<<<dimGrid, dimBlock>>>(ft_d, ft_d_result, k_d, N);
	
	

    // perform inverse FFT
	
    cufftExecC2C(plan, ft_d_result, u_d_complex, CUFFT_INVERSE);
	
	

    // transform complex to real
    complex2real<<<dimGrid, dimBlock>>>(u_d_complex, u_d, N);
	
	

    // copy result back to host
	
    cudaMemcpy(u, u_d, sizeof(float)*N*N, cudaMemcpyDeviceToHost);
	
	
	
    float constant = u[0];
	
    for (int i=0; i<N*N; i++)
	
    { 
		
        u[i] -= constant; 
        //substract u[0] to force the arbitrary constant to be 0
	
    }
	
	

    // Look at errors
	
    float L2_error, L2_1 = 0.0f, L2_2 = 0.0f;
	
    for (int j=1; j<N-1; j++)
	
    {	
        for (int i=1; i<N-1; i++)
		
        {	
            L2_1 += (u[(j-1)*(N-2)+(i-1)] - u_a[(j-1)*(N-2)+(i-1)])*(u[(j-1)*(N-2)+(i-1)] - u_a[(j-1)*(N-2)+(i-1)]);
		
            L2_2 += u_a[(j-1)*(N-2)+(i-1)]*u_a[(j-1)*(N-2)+(i-1)];	
		
         }
	
    }
	
    L2_error = sqrt(L2_1/L2_2);
	
    std::cout<<L2_error<<std::endl;
	
	

    // cleanup

    cufftDestroy(plan);
	
    cudaFree(k_d);
	
    cudaFree(f_d);
	
    cudaFree(u_d);

    free(x);
    free(y);
    free(f);
    free(u);
    free(u_a);
    free(k);	
	

    return 0;

}

njuffa · December 13, 2012, 1:05am

In header file cufft.h we have:

typedef cuComplex cufftComplex;
typedef cuDoubleComplex cufftDoubleComplex;

cuComplex and cyDoubleComplex are types exported by header file cuComplex.h. The header file contains access functions and basic arithmetic, which is all that you would need. It also has a function for creating a complex number from its real and imaginary components:

typedef cuFloatComplex cuComplex;
__host__ __device__ static __inline__ cuComplex make_cuComplex (float x, 
                                                                float y) 
{ 
    return make_cuFloatComplex (x, y); 
}
__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex 
                                                             (float r, float i)
{
    cuFloatComplex res;
    res.x = r;
    res.y = i;
    return res;
}

So the imaginary constant ‘i’ [sqrt(-1)] itself would be the result of

make_cuComplex (0.0f, 1.0f);

pasoleatis · December 13, 2012, 1:28pm

Hello,

What I do is something like this:

cufft real to complex;
kernel in which I apply the derivatives in kspace;
invers cufft complex to real;

In this kernel you can do the multiplication directly by acccesing each component. For example i*C can be done in 2 operations:
newC.x=-C.y
newC.y=C.x,
where .x is the real part and .y the imaginary part.

TrammyKelevra · December 13, 2012, 5:24pm

Hello,

thank you for your answers. Pasoleatis solution works fine.
Thanks.

Best regards,

Florian

wanderine · December 13, 2012, 6:18pm

You should not calculate the derivative that way, it will be very sensitive to noise. Use sobel-filters instead, there is a sobelfilter example in the CUDA SDK.

TrammyKelevra · December 13, 2012, 6:51pm

I know, but I’ll use periodic and smooth data or apply a Gaussian to ensure the smoothness.