CUFFT + FORTRAN, time evolution

Hello,

I’m a begginer in CUDA and I have some problems with this. I’m writting a source code for evolution in time some initial wave packet. I use operator-splitting methods. My source code:

I. FORTRAN part

PROGRAM time_evolution

IMPLICIT NONE

integer*4 :: i

integer, parameter :: N=2**11

integer, parameter :: BATCH = 2

integer index_max,  N_x 

double precision, Dimension(N*BATCH) ::V

double complex, Dimension(N*BATCH) :: h_psi_x,h_psi_k,h_V

double precision w_x_1,dt,dx,L,k_x,x,pi,M,t 

double precision x_CM, k_CM, delta_k, omega

L = 10

M = 1

t = 1.0

x_CM = 0

delta_k = 1

k_CM = 0

N_x = N*BATCH

print *, N_x

omega = 100.0

dx = L/dble(N_x-1)

index_max = 1

open(unit=10,file="in_x.txt")

open(unit=11,file="V.txt")

!Initial wave packet: initial momentum: k_CM, cetered at x_CM, with delta_k width in momentum distribution

DO i=1,N_x

  x = -L/2.d0 + (i-1)*dx

  V(i) =0 

  h_psi_x(i) = dcmplx(exp(-delta_k**2.d0*abs(x-x_CM)**2/2.0)*abs(delta_k**2.d0)**0.25,0.d0)*dcmplx(cos(k_CM*x),sin(k_CM*x))

  write(10,*) x,abs(h_psi_x(i))**2.d0

  write(11,*) x,V(i)

END DO

close(10)

close(11)

write(*,*) "time evolution ..."

CALL kernel_wrapper(N, BATCH, L, t, index_max, M, V, h_psi_x, h_psi_k)

open(unit=10,file="out_x.txt")

do i = 1, N_x

 x = -L/2.d0 + (i-1)*dx

 write(10,*) x, abs(h_psi_x(i))**2.d0

end do

close(10)

END PROGRAM

II. C part (wrapper)

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include "cuComplex.h"

#include "cufft.h"

#include <math.h>

__global__ void space_evolution(int N, double dt, double *d_V, cuDoubleComplex *d_psi_x){

 int n = threadIdx.x + blockDim.x*blockIdx.x;  

 cuDoubleComplex c_d_U; //exp(-i V(x)*dt //

 cuDoubleComplex cN; 

 cN.x = N;

 cN.y = 0;

  if(n<N){ 

   c_d_U.x = cos(dt*d_V[n]);

   c_d_U.y = -sin(dt*d_V[n]);

   d_psi_x[n] = cuCdiv(cuCmul(d_psi_x[n],c_d_U),cN);

   }

 }

__global__ void momentum_evolution(int N,  double w_x, cuDoubleComplex *d_psi_k){

 int n = threadIdx.x + blockDim.x*blockIdx.x;

 cuDoubleComplex c_w_x;

if(n<N/2){

  c_w_x.x = cos(w_x*n*n);

  c_w_x.y = -sin(w_x*n*n);

  d_psi_k[n] = cuCmul(d_psi_k[n],c_w_x);

 }

 if(n>=N/2 && n<N){

   c_w_x.x = cos(w_x*(N-n)*(N-n));

   c_w_x.y = -sin(w_x*(N-n)*(N-n));

   d_psi_k[n] = cuCmul(d_psi_k[n],c_w_x); 

}

}

extern "C"{ 

void kernel_wrapper_(int *Np, int *BATCH_1, double *Lp,  double *t_1, int *index_max_1, double *M_1, double *h_V, cuDoubleComplex *h_psi_x, cuDoubleComplex *h_psi_k){

int N = *Np;         

int BATCH = *BATCH_1;

double L = *Lp;

double dx = L/(N*BATCH-1);

double t = *t_1;

double dt = pow(dx,2)*10.0;

int N_t = (t/dt + 1);

int index_max = *index_max_1;

int flag = (N_t/(double)index_max);

double M = *M_1;

double x,k;

double pi = acos(-1.0);

double w_x = 2*pow(pi,2)/(M*pow(L,2))*dt;

int time;

printf("L_x : %f, N : %d \n", L, N);

   printf("t   : %f, t : %d \n", t, N_t);

FILE  *f_1,*f_2;

   char plik_1[16],plik_2[16];

int ThreadsPerBlock = 256;

   int BlocksPerGrid = (N+ThreadsPerBlock-1)/ThreadsPerBlock; 

//initial data on device

cufftDoubleComplex *d_psi_x; 

cudaMalloc( (void **)&d_psi_x, sizeof(cufftDoubleComplex) * N * BATCH); 

cudaMemcpy( d_psi_x, h_psi_x, sizeof(cufftDoubleComplex) * N * BATCH, cudaMemcpyHostToDevice );

//

cufftDoubleComplex *d_psi_k;

cudaMalloc( (void **)&d_psi_k, sizeof(cufftDoubleComplex) * N * BATCH);

// potential on device

double *d_V;

cudaMalloc( (void **)&d_V, sizeof(double ) * N * BATCH);

cudaMemcpy( d_V, h_V, sizeof(double) * N * BATCH, cudaMemcpyHostToDevice); 

cufftHandle plan_f,plan_b ; 

cufftPlan1d( &plan_f, N, CUFFT_Z2Z, BATCH);

cufftPlan1d( &plan_b, N, CUFFT_Z2Z, BATCH);

for(int i = 0; i < N_t; i++){

cufftExecZ2Z(plan_f, d_psi_x, d_psi_k, CUFFT_FORWARD);

momentum_evolution<<<BlocksPerGrid,ThreadsPerBlock>>>(N*BATCH, w_x, d_psi_k);

cufftExecZ2Z(plan_b, d_psi_k, d_psi_x, CUFFT_INVERSE);

space_evolution <<<BlocksPerGrid,ThreadsPerBlock>>>(N*BATCH, dt, d_V, d_psi_x);

//writing out indirect results 

 if(i % flag == 0){

time = ceil(i*dt);

  printf("t: %d \n",time);

printf("bbb \n");

  cudaMemcpy( h_psi_x, d_psi_x, sizeof(cuDoubleComplex) * N * BATCH, cudaMemcpyDeviceToHost);

  cudaMemcpy( h_psi_k, d_psi_k, sizeof(cuDoubleComplex) * N * BATCH, cudaMemcpyDeviceToHost);

sprintf(plik_1,"psi_%d.txt",time);

  sprintf(plik_2,"spt_%d.txt",time);

f_1 = fopen(plik_1,"w");	

  f_2 = fopen(plik_2,"w");

for(int j = N/2*BATCH; j<N*BATCH;j++){

   x = -L/2 + j*dx;

   k = -2.0*pi/L*(N*BATCH-j);

   fprintf(f_1,"%g %lf \n",x,pow(cuCabs(h_psi_x[j]),2));

   fprintf(f_2,"%g %lf \n",k,pow(cuCabs(h_psi_k[j]),2)*pow(dx,2)/(2.0*pi));

  }

for(int j = 0; j<N/2*BATCH;j++){

   x = -L/2 + j*dx;

   k = 2.0*pi/L*j;

   fprintf(f_1,"%g %lf \n",x,pow(cuCabs(h_psi_x[j]),2));

   fprintf(f_2,"%g %lf \n",k,pow(cuCabs(h_psi_k[j]),2)*pow(dx,2)/(2.0*pi));

  }

fclose(f_1);

  fclose(f_2);

 }

}

cudaMemcpy( h_psi_x, d_psi_x, sizeof(cuDoubleComplex) * N * BATCH, cudaMemcpyDeviceToHost );

cudaMemcpy( h_psi_k, d_psi_k, sizeof(cuDoubleComplex) * N * BATCH, cudaMemcpyDeviceToHost );

cufftDestroy(plan_f);

cufftDestroy(plan_b);

cudaFree(d_psi_x);

cudaFree(d_psi_k);

cudaFree(d_V); 

return;

}

}

My question is:

  1. I don’t really understand what for is “BATCH” variable. The problem is with allocation of “N*BATCH” elements (when BATCH > 1). Does BATCH > 1 speed up calculation of FFT ? Why we should use it ?

  2. Can I use cuDoubleComplex instead of cufftDoubleComplex in C wrapper ? In fact my initial data are type of “double complex” (in fortran part) ?

Thanks for your help !

Sumert