Hello,
I’m a beginner in CUDA and I have run into some problems. I’m writing a source code that evolves an initial wave packet in time, using an operator-splitting method. My source code:
I. FORTRAN part
PROGRAM time_evolution
! Split-operator time evolution of an initial Gaussian wave packet.
! Writes the initial density to in_x.txt and the potential to V.txt,
! delegates the propagation to the GPU via kernel_wrapper, then writes
! the final density to out_x.txt.
IMPLICIT NONE
integer*4 :: i
! N is the FFT transform length; BATCH is the cuFFT batch count,
! so the full spatial grid has N*BATCH points.
integer, parameter :: N=2**11
integer, parameter :: BATCH = 2
integer index_max, N_x
double precision, Dimension(N*BATCH) :: V                 ! external potential on the grid
double complex, Dimension(N*BATCH) :: h_psi_x,h_psi_k     ! wave function in x- and k-space
double precision dx,L,x,M,t
double precision x_CM, k_CM, delta_k
! (removed unused declarations: w_x_1, dt, k_x, pi, omega, h_V)
L = 10                ! box length
M = 1                 ! particle mass
t = 1.0               ! total propagation time
x_CM = 0              ! initial packet center
delta_k = 1           ! width of the momentum distribution
k_CM = 0              ! initial center-of-mass momentum
N_x = N*BATCH         ! total number of grid points
print *, N_x
dx = L/dble(N_x-1)
index_max = 1         ! number of intermediate snapshots requested from the wrapper
open(unit=10,file="in_x.txt")
open(unit=11,file="V.txt")
! Initial wave packet: initial momentum k_CM, centered at x_CM,
! with delta_k width in the momentum distribution
DO i=1,N_x
x = -L/2.d0 + (i-1)*dx
V(i) =0               ! free particle: zero potential everywhere
h_psi_x(i) = dcmplx(exp(-delta_k**2.d0*abs(x-x_CM)**2/2.0)*abs(delta_k**2.d0)**0.25,0.d0)*dcmplx(cos(k_CM*x),sin(k_CM*x))
write(10,*) x,abs(h_psi_x(i))**2.d0
write(11,*) x,V(i)
END DO
close(10)
close(11)
write(*,*) "time evolution ..."
! Fortran appends an underscore, so this binds to kernel_wrapper_ in the C file.
CALL kernel_wrapper(N, BATCH, L, t, index_max, M, V, h_psi_x, h_psi_k)
open(unit=10,file="out_x.txt")
do i = 1, N_x
x = -L/2.d0 + (i-1)*dx
write(10,*) x, abs(h_psi_x(i))**2.d0
end do
close(10)
END PROGRAM
II. C part (wrapper)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "cuComplex.h"
#include "cufft.h"
#include <math.h>
// One thread per grid point: applies the potential phase exp(-i*V[n]*dt)
// and rescales by 1/N to undo cuFFT's unnormalized inverse transform.
// Launch with enough threads to cover all N elements; out-of-range threads exit.
// NOTE(review): the caller passes N = N_fft*BATCH, but cuFFT scales each
// batched transform by N_fft only — the 1/N factor looks off by a factor
// of BATCH when BATCH > 1; verify.
__global__ void space_evolution(int N, double dt, double *d_V, cuDoubleComplex *d_psi_x){
    int n = threadIdx.x + blockDim.x*blockIdx.x;
    if (n < N){
        // Fold the 1/N normalization into the phase factor: a real scalar
        // multiply replaces the original full complex division cuCdiv(.., (N,0)).
        double scale = 1.0/(double)N;
        double phase = dt*d_V[n];
        cuDoubleComplex u = make_cuDoubleComplex(cos(phase)*scale, -sin(phase)*scale);
        d_psi_x[n] = cuCmul(d_psi_x[n], u);
    }
}
// Multiplies each Fourier mode by the kinetic phase exp(-i * w_x * m^2),
// where m is the wrapped frequency index: m = n for the positive-frequency
// half (n < N/2) and m = N - n for the negative-frequency half.
// NOTE(review): this treats the whole array as ONE transform of length N;
// with batched transforms (BATCH > 1) the wrap should presumably happen per
// transform — verify against the cuFFT batch layout.
__global__ void momentum_evolution(int N, double w_x, cuDoubleComplex *d_psi_k){
    int idx = threadIdx.x + blockDim.x*blockIdx.x;
    if (idx < N){
        int m = (idx < N/2) ? idx : (N - idx);
        double phase = w_x*(double)m*(double)m;
        cuDoubleComplex rot = make_cuDoubleComplex(cos(phase), -sin(phase));
        d_psi_k[idx] = cuCmul(d_psi_k[idx], rot);
    }
}
extern "C"{
/* Fortran-callable GPU driver for split-operator time evolution.
 * All scalars arrive by reference (Fortran calling convention); the trailing
 * underscore matches gfortran's default name mangling of kernel_wrapper.
 *   Np          : FFT length of each transform
 *   BATCH_1     : cuFFT batch count (total grid = N*BATCH points)
 *   Lp          : box length
 *   t_1         : total propagation time
 *   index_max_1 : requested number of intermediate snapshots
 *   M_1         : particle mass
 *   h_V         : potential on the grid (input)
 *   h_psi_x     : wave function in position space (input/output)
 *   h_psi_k     : wave function in momentum space (output)
 */
void kernel_wrapper_(int *Np, int *BATCH_1, double *Lp, double *t_1, int *index_max_1, double *M_1, double *h_V, cuDoubleComplex *h_psi_x, cuDoubleComplex *h_psi_k){
    int N = *Np;
    int BATCH = *BATCH_1;
    double L = *Lp;
    double dx = L/(N*BATCH - 1);      /* grid spacing over the full grid */
    double t = *t_1;
    double dt = dx*dx*10.0;           /* time step tied to grid spacing  */
    int N_t = (int)(t/dt + 1);        /* number of time steps            */
    int index_max = *index_max_1;
    double M = *M_1;
    double x, k;
    double pi = acos(-1.0);
    /* kinetic phase coefficient; the n^2 factor is applied in-kernel */
    double w_x = 2.0*pi*pi/(M*L*L)*dt;
    int t_label;                      /* renamed from "time" (shadowed ::time) */

    /* Snapshot stride. BUG FIX: the original N_t/(double)index_max truncated
       to int and could become 0 when N_t < index_max, making "i % flag"
       divide by zero. Clamp to >= 1. */
    int flag = N_t/index_max;
    if (flag < 1) flag = 1;

    printf("L_x : %f, N : %d \n", L, N);
    printf("t : %f, N_t : %d \n", t, N_t);  /* fixed label: second value is N_t */

    FILE *f_1, *f_2;
    char plik_1[64], plik_2[64];      /* was 16: could overflow for large labels */

    int ThreadsPerBlock = 256;
    /* BUG FIX: the kernels process N*BATCH elements, but the grid was sized
       for N only — with BATCH > 1 half the array was never evolved. */
    int BlocksPerGrid = (N*BATCH + ThreadsPerBlock - 1)/ThreadsPerBlock;

    /* device buffers */
    cufftDoubleComplex *d_psi_x = NULL, *d_psi_k = NULL;
    double *d_V = NULL;
    cudaError_t err;
    if ((err = cudaMalloc((void **)&d_psi_x, sizeof(cufftDoubleComplex)*N*BATCH)) != cudaSuccess ||
        (err = cudaMalloc((void **)&d_psi_k, sizeof(cufftDoubleComplex)*N*BATCH)) != cudaSuccess ||
        (err = cudaMalloc((void **)&d_V, sizeof(double)*N*BATCH)) != cudaSuccess){
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        cudaFree(d_psi_x); cudaFree(d_psi_k); cudaFree(d_V);
        return;
    }
    cudaMemcpy(d_psi_x, h_psi_x, sizeof(cufftDoubleComplex)*N*BATCH, cudaMemcpyHostToDevice);
    cudaMemcpy(d_V, h_V, sizeof(double)*N*BATCH, cudaMemcpyHostToDevice);

    /* One Z2Z plan serves both directions (direction is chosen per exec call);
       the original created two identical plans. */
    cufftHandle plan;
    if (cufftPlan1d(&plan, N, CUFFT_Z2Z, BATCH) != CUFFT_SUCCESS){
        fprintf(stderr, "cufftPlan1d failed\n");
        cudaFree(d_psi_x); cudaFree(d_psi_k); cudaFree(d_V);
        return;
    }

    for (int i = 0; i < N_t; i++){
        /* kinetic half-step: FFT -> momentum phase -> inverse FFT */
        cufftExecZ2Z(plan, d_psi_x, d_psi_k, CUFFT_FORWARD);
        momentum_evolution<<<BlocksPerGrid, ThreadsPerBlock>>>(N*BATCH, w_x, d_psi_k);
        cufftExecZ2Z(plan, d_psi_k, d_psi_x, CUFFT_INVERSE);
        /* potential step; space_evolution also divides by its N argument to
           undo the unnormalized inverse FFT.
           NOTE(review): cuFFT scales each length-N transform by N, but the
           kernels receive N*BATCH — both the normalization and the frequency
           wrap look wrong by a factor/layout of BATCH when BATCH > 1; verify. */
        space_evolution<<<BlocksPerGrid, ThreadsPerBlock>>>(N*BATCH, dt, d_V, d_psi_x);

        /* write intermediate results every "flag" steps */
        if (i % flag == 0){
            t_label = (int)ceil(i*dt);
            printf("t: %d \n", t_label);
            cudaMemcpy(h_psi_x, d_psi_x, sizeof(cuDoubleComplex)*N*BATCH, cudaMemcpyDeviceToHost);
            cudaMemcpy(h_psi_k, d_psi_k, sizeof(cuDoubleComplex)*N*BATCH, cudaMemcpyDeviceToHost);
            snprintf(plik_1, sizeof(plik_1), "psi_%d.txt", t_label);
            snprintf(plik_2, sizeof(plik_2), "spt_%d.txt", t_label);
            f_1 = fopen(plik_1, "w");
            f_2 = fopen(plik_2, "w");
            if (f_1 == NULL || f_2 == NULL){
                fprintf(stderr, "cannot open snapshot files\n");
                if (f_1) fclose(f_1);
                if (f_2) fclose(f_2);
            } else {
                /* print negative frequencies (upper half) first, then positive.
                   NOTE(review): this treats the array as a single transform of
                   length N*BATCH — confirm the intended layout for BATCH > 1. */
                for (int j = N/2*BATCH; j < N*BATCH; j++){
                    x = -L/2 + j*dx;
                    k = -2.0*pi/L*(N*BATCH - j);
                    fprintf(f_1, "%g %lf \n", x, pow(cuCabs(h_psi_x[j]), 2));
                    fprintf(f_2, "%g %lf \n", k, pow(cuCabs(h_psi_k[j]), 2)*pow(dx, 2)/(2.0*pi));
                }
                for (int j = 0; j < N/2*BATCH; j++){
                    x = -L/2 + j*dx;
                    k = 2.0*pi/L*j;
                    fprintf(f_1, "%g %lf \n", x, pow(cuCabs(h_psi_x[j]), 2));
                    fprintf(f_2, "%g %lf \n", k, pow(cuCabs(h_psi_k[j]), 2)*pow(dx, 2)/(2.0*pi));
                }
                fclose(f_1);
                fclose(f_2);
            }
        }
    }
    /* surface any kernel-launch errors before the final copy-back */
    err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));

    cudaMemcpy(h_psi_x, d_psi_x, sizeof(cuDoubleComplex)*N*BATCH, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_psi_k, d_psi_k, sizeof(cuDoubleComplex)*N*BATCH, cudaMemcpyDeviceToHost);

    cufftDestroy(plan);
    cudaFree(d_psi_x);
    cudaFree(d_psi_k);
    cudaFree(d_V);
}
}
My question is:
-
I don’t really understand what the “BATCH” variable is for. The problem is with the allocation of “N*BATCH” elements (when BATCH > 1). Does BATCH > 1 speed up the FFT computation? Why should we use it?
-
Can I use cuDoubleComplex instead of cufftDoubleComplex in the C wrapper? My initial data are of type “double complex” (in the Fortran part).
Thanks for your help !
Sumert