# CUFFT + FORTRAN, time evolution

Hello,

I’m a begginer in CUDA and I have some problems with this. I’m writting a source code for evolution in time some initial wave packet. I use operator-splitting methods. My source code:

I. FORTRAN part

``````PROGRAM time_evolution

IMPLICIT NONE

integer*4 :: i

integer, parameter :: N=2**11

integer, parameter :: BATCH = 2

integer index_max,  N_x

double precision, Dimension(N*BATCH) ::V

double complex, Dimension(N*BATCH) :: h_psi_x,h_psi_k,h_V

double precision w_x_1,dt,dx,L,k_x,x,pi,M,t

double precision x_CM, k_CM, delta_k, omega

L = 10

M = 1

t = 1.0

x_CM = 0

delta_k = 1

k_CM = 0

N_x = N*BATCH

print *, N_x

omega = 100.0

dx = L/dble(N_x-1)

index_max = 1

open(unit=10,file="in_x.txt")

open(unit=11,file="V.txt")

!Initial wave packet: initial momentum: k_CM, cetered at x_CM, with delta_k width in momentum distribution

DO i=1,N_x

x = -L/2.d0 + (i-1)*dx

V(i) =0

h_psi_x(i) = dcmplx(exp(-delta_k**2.d0*abs(x-x_CM)**2/2.0)*abs(delta_k**2.d0)**0.25,0.d0)*dcmplx(cos(k_CM*x),sin(k_CM*x))

write(10,*) x,abs(h_psi_x(i))**2.d0

write(11,*) x,V(i)

END DO

close(10)

close(11)

write(*,*) "time evolution ..."

CALL kernel_wrapper(N, BATCH, L, t, index_max, M, V, h_psi_x, h_psi_k)

open(unit=10,file="out_x.txt")

do i = 1, N_x

x = -L/2.d0 + (i-1)*dx

write(10,*) x, abs(h_psi_x(i))**2.d0

end do

close(10)

END PROGRAM
``````

II. C part (wrapper)

``````#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include "cuComplex.h"

#include "cufft.h"

#include <math.h>

__global__ void space_evolution(int N, double dt, double *d_V, cuDoubleComplex *d_psi_x){

int n = threadIdx.x + blockDim.x*blockIdx.x;

cuDoubleComplex c_d_U; //exp(-i V(x)*dt //

cuDoubleComplex cN;

cN.x = N;

cN.y = 0;

if(n<N){

c_d_U.x = cos(dt*d_V[n]);

c_d_U.y = -sin(dt*d_V[n]);

d_psi_x[n] = cuCdiv(cuCmul(d_psi_x[n],c_d_U),cN);

}

}

__global__ void momentum_evolution(int N,  double w_x, cuDoubleComplex *d_psi_k){

int n = threadIdx.x + blockDim.x*blockIdx.x;

cuDoubleComplex c_w_x;

if(n<N/2){

c_w_x.x = cos(w_x*n*n);

c_w_x.y = -sin(w_x*n*n);

d_psi_k[n] = cuCmul(d_psi_k[n],c_w_x);

}

if(n>=N/2 && n<N){

c_w_x.x = cos(w_x*(N-n)*(N-n));

c_w_x.y = -sin(w_x*(N-n)*(N-n));

d_psi_k[n] = cuCmul(d_psi_k[n],c_w_x);

}

}

extern "C"{

void kernel_wrapper_(int *Np, int *BATCH_1, double *Lp,  double *t_1, int *index_max_1, double *M_1, double *h_V, cuDoubleComplex *h_psi_x, cuDoubleComplex *h_psi_k){

int N = *Np;

int BATCH = *BATCH_1;

double L = *Lp;

double dx = L/(N*BATCH-1);

double t = *t_1;

double dt = pow(dx,2)*10.0;

int N_t = (t/dt + 1);

int index_max = *index_max_1;

int flag = (N_t/(double)index_max);

double M = *M_1;

double x,k;

double pi = acos(-1.0);

double w_x = 2*pow(pi,2)/(M*pow(L,2))*dt;

int time;

printf("L_x : %f, N : %d \n", L, N);

printf("t   : %f, t : %d \n", t, N_t);

FILE  *f_1,*f_2;

char plik_1[16],plik_2[16];

//initial data on device

cufftDoubleComplex *d_psi_x;

cudaMalloc( (void **)&d_psi_x, sizeof(cufftDoubleComplex) * N * BATCH);

cudaMemcpy( d_psi_x, h_psi_x, sizeof(cufftDoubleComplex) * N * BATCH, cudaMemcpyHostToDevice );

//

cufftDoubleComplex *d_psi_k;

cudaMalloc( (void **)&d_psi_k, sizeof(cufftDoubleComplex) * N * BATCH);

// potential on device

double *d_V;

cudaMalloc( (void **)&d_V, sizeof(double ) * N * BATCH);

cudaMemcpy( d_V, h_V, sizeof(double) * N * BATCH, cudaMemcpyHostToDevice);

cufftHandle plan_f,plan_b ;

cufftPlan1d( &plan_f, N, CUFFT_Z2Z, BATCH);

cufftPlan1d( &plan_b, N, CUFFT_Z2Z, BATCH);

for(int i = 0; i < N_t; i++){

cufftExecZ2Z(plan_f, d_psi_x, d_psi_k, CUFFT_FORWARD);

cufftExecZ2Z(plan_b, d_psi_k, d_psi_x, CUFFT_INVERSE);

//writing out indirect results

if(i % flag == 0){

time = ceil(i*dt);

printf("t: %d \n",time);

printf("bbb \n");

cudaMemcpy( h_psi_x, d_psi_x, sizeof(cuDoubleComplex) * N * BATCH, cudaMemcpyDeviceToHost);

cudaMemcpy( h_psi_k, d_psi_k, sizeof(cuDoubleComplex) * N * BATCH, cudaMemcpyDeviceToHost);

sprintf(plik_1,"psi_%d.txt",time);

sprintf(plik_2,"spt_%d.txt",time);

f_1 = fopen(plik_1,"w");

f_2 = fopen(plik_2,"w");

for(int j = N/2*BATCH; j<N*BATCH;j++){

x = -L/2 + j*dx;

k = -2.0*pi/L*(N*BATCH-j);

fprintf(f_1,"%g %lf \n",x,pow(cuCabs(h_psi_x[j]),2));

fprintf(f_2,"%g %lf \n",k,pow(cuCabs(h_psi_k[j]),2)*pow(dx,2)/(2.0*pi));

}

for(int j = 0; j<N/2*BATCH;j++){

x = -L/2 + j*dx;

k = 2.0*pi/L*j;

fprintf(f_1,"%g %lf \n",x,pow(cuCabs(h_psi_x[j]),2));

fprintf(f_2,"%g %lf \n",k,pow(cuCabs(h_psi_k[j]),2)*pow(dx,2)/(2.0*pi));

}

fclose(f_1);

fclose(f_2);

}

}

cudaMemcpy( h_psi_x, d_psi_x, sizeof(cuDoubleComplex) * N * BATCH, cudaMemcpyDeviceToHost );

cudaMemcpy( h_psi_k, d_psi_k, sizeof(cuDoubleComplex) * N * BATCH, cudaMemcpyDeviceToHost );

cufftDestroy(plan_f);

cufftDestroy(plan_b);

cudaFree(d_psi_x);

cudaFree(d_psi_k);

cudaFree(d_V);

return;

}

}
``````

My question is:

1. I don’t really understand what for is “BATCH” variable. The problem is with allocation of “N*BATCH” elements (when BATCH > 1). Does BATCH > 1 speed up calculation of FFT ? Why we should use it ?

2. Can I use cuDoubleComplex instead of cufftDoubleComplex in C wrapper ? In fact my initial data are type of “double complex” (in fortran part) ?