Thank you!
This program is for a finite difference simulation of seismic wave.
(1)cuda version cuda-8.0
(2)GPU used titian XP
(3) operating system Centos 7
the code for cycling
for (int it = 0 ; it < timesteps ; it=it+8)
{
printf("\tt = %d ", it);
// launch the kernel
printf("launch kernel\n");
//RK step1
RK_init<<<dimGrid,dimBlock>>>(buW,bumW,butW,dimx,dimy,dimz);
LxB_LyB_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_begin<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[1],RK4b[0],dimx,dimy,dimz);
LxF_LyF_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[2],RK4b[1],dimx,dimy,dimz);
LxB_LyB_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[3],RK4b[2],dimx,dimy,dimz);
LxF_LyF_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_finish<<<dimGrid,dimBlock>>>(buW,buhW,butW,dt,RK4b[3],dimx,dimy,dimz);
// step2
RK_init<<<dimGrid,dimBlock>>>(buW,bumW,butW,dimx,dimy,dimz);
LxF_LyF_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_begin<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[1],RK4b[0],dimx,dimy,dimz);
LxB_LyB_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[2],RK4b[1],dimx,dimy,dimz);
LxF_LyF_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[3],RK4b[2],dimx,dimy,dimz);
LxB_LyB_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_finish<<<dimGrid,dimBlock>>>(buW,buhW,butW,dt,RK4b[3],dimx,dimy,dimz);
// step3
RK_init<<<dimGrid,dimBlock>>>(buW,bumW,butW,dimx,dimy,dimz);
LxF_LyF_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_begin<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[1],RK4b[0],dimx,dimy,dimz);
LxB_LyB_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[2],RK4b[1],dimx,dimy,dimz);
LxF_LyF_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[3],RK4b[2],dimx,dimy,dimz);
LxB_LyB_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_finish<<<dimGrid,dimBlock>>>(buW,buhW,butW,dt,RK4b[3],dimx,dimy,dimz);
// step 4
RK_init<<<dimGrid,dimBlock>>>(buW,bumW,butW,dimx,dimy,dimz);
LxB_LyB_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_begin<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[1],RK4b[0],dimx,dimy,dimz);
LxF_LyF_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[2],RK4b[1],dimx,dimy,dimz);
LxB_LyB_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[3],RK4b[2],dimx,dimy,dimz);
LxF_LyF_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_finish<<<dimGrid,dimBlock>>>(buW,buhW,butW,dt,RK4b[3],dimx,dimy,dimz);
// step 5
RK_init<<<dimGrid,dimBlock>>>(buW,bumW,butW,dimx,dimy,dimz);
LxB_LyF_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_begin<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[1],RK4b[0],dimx,dimy,dimz);
LxF_LyB_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[2],RK4b[1],dimx,dimy,dimz);
LxB_LyF_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[3],RK4b[2],dimx,dimy,dimz);
LxF_LyB_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_finish<<<dimGrid,dimBlock>>>(buW,buhW,butW,dt,RK4b[3],dimx,dimy,dimz);
//step 6
RK_init<<<dimGrid,dimBlock>>>(buW,bumW,butW,dimx,dimy,dimz);
LxF_LyB_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_begin<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[1],RK4b[0],dimx,dimy,dimz);
LxB_LyF_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[2],RK4b[1],dimx,dimy,dimz);
LxF_LyB_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[3],RK4b[2],dimx,dimy,dimz);
LxB_LyF_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_finish<<<dimGrid,dimBlock>>>(buW,buhW,butW,dt,RK4b[3],dimx,dimy,dimz);
// step 7
RK_init<<<dimGrid,dimBlock>>>(buW,bumW,butW,dimx,dimy,dimz);
LxF_LyB_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_begin<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[1],RK4b[0],dimx,dimy,dimz);
LxB_LyF_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[2],RK4b[1],dimx,dimy,dimz);
LxF_LyB_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[3],RK4b[2],dimx,dimy,dimz);
LxB_LyF_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_finish<<<dimGrid,dimBlock>>>(buW,buhW,butW,dt,RK4b[3],dimx,dimy,dimz);
// step 8
RK_init<<<dimGrid,dimBlock>>>(buW,bumW,butW,dimx,dimy,dimz);
LxB_LyF_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_begin<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[1],RK4b[0],dimx,dimy,dimz);
LxF_LyB_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[2],RK4b[1],dimx,dimy,dimz);
LxB_LyF_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[3],RK4b[2],dimx,dimy,dimz);
LxF_LyB_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
RK_finish<<<dimGrid,dimBlock>>>(buW,buhW,butW,dt,RK4b[3],dimx,dimy,dimz);
}
and for one of the kernal
__global__ void RK_begin(wave W, wave mW, wave hW, wave tW,float DT,float a, float b,float dimx,float dimy,float dimz){
bool validr= true;
const int gtidx = blockIdx.x * blockDim.x + threadIdx.x;
const int gtidy = blockIdx.y * blockDim.y + threadIdx.y;
const int stride_y = dimx + 2 * RADIUS;
const int stride_z = stride_y * (dimy + 2 * RADIUS);
int indx = 0;
indx += RADIUS * stride_y + RADIUS ;
indx += gtidy * stride_y + gtidx;
indx += stride_z*3;
if ((gtidx >= dimx + RADIUS ) || (gtidy >= dimy + RADIUS ))
validr = false;
float rka,rkb;
rka=a*DT;
rkb=b*DT;
for(int iz=0;iz<dimz;iz++){
if(validr){
W.Vx [indx] = mW.Vx [indx] + rka * hW.Vx [indx];
W.Vy [indx] = mW.Vy [indx] + rka * hW.Vy [indx];
W.Vz [indx] = mW.Vz [indx] + rka * hW.Vz [indx];
W.Txx[indx] = mW.Txx[indx] + rka * hW.Txx[indx];
W.Tyy[indx] = mW.Tyy[indx] + rka * hW.Tyy[indx];
W.Tzz[indx] = mW.Tzz[indx] + rka * hW.Tzz[indx];
W.Txy[indx] = mW.Txy[indx] + rka * hW.Txy[indx];
W.Txz[indx] = mW.Txz[indx] + rka * hW.Txz[indx];
W.Tyz[indx] = mW.Tyz[indx] + rka * hW.Tyz[indx];
tW.Vx [indx] = mW.Vx [indx] + rkb * hW.Vx [indx];
tW.Vy [indx] = mW.Vy [indx] + rkb * hW.Vy [indx];
tW.Vz [indx] = mW.Vz [indx] + rkb * hW.Vz [indx];
tW.Txx[indx] = mW.Txx[indx] + rkb * hW.Txx[indx];
tW.Tyy[indx] = mW.Tyy[indx] + rkb * hW.Tyy[indx];
tW.Tzz[indx] = mW.Tzz[indx] + rkb * hW.Tzz[indx];
tW.Txy[indx] = mW.Txy[indx] + rkb * hW.Txy[indx];
tW.Txz[indx] = mW.Txz[indx] + rkb * hW.Txz[indx];
tW.Tyz[indx] = mW.Tyz[indx] + rkb * hW.Tyz[indx];
indx += stride_z;
}
}
}