Dear Sirs,
I have a problem of passing sets of arry constant from host to device. I define these sets of array constant in Main program and pass to subroutine, then pass these sets of array constant from subroutine to Kernel.
There is no compiling problem. But when I tried to run the compiled program in GPU, the Kernel seems not running. The program was completed ripdly. The result was same as initial value. Then I compile the program with emulation mode and run it in CPU. The result is correct.
I try to define these sets of array constant by each thread themselves and declare them in shared memory. It works fine in CPU and GPU. The program runs for reasonable time. The result is right also. But the efficiency of the program is degarded
Anyone could help me? I have tried to make the program as short as possible, but it is still so long.
I have CUDA 3.0, PGI compiler 10.4 and using Tesla C1060 running on Redhat OS.
Note, the declaration array constant is located around line 20.
I compile the program in this way:
pgfortran -o c -Mcuda=fastmath constant.CUF
! ******************* Program Start **************************
module new_mod
use cudafor
contains
! *********************** Kernel K2 ************************** ---------------------------------------------K2
attributes(global) subroutine K2_kernel&
&(AddG,Nx,Ny,Loop,RSF,Q_pts,&
&Rc,pi,DelT,DelX,D_2,&
&xx,yy,absci,weight,CaseS,&
&XiX,XiY,XiR,YiR,CXiX,CXiY)
!**************************************************************
integer :: io, jo, tx, ty
!**************************************************************
integer :: Lx,Ly,Lxy,LL,K,U
integer,value :: AddG,Nx,Ny,Loop,RSF,Q_pts
integer, constant :: CaseS(Q_pts,Q_pts)
real, constant :: Rc,pi,DelT,DelX,D_2
real, constant :: xx(Nx),yy(Ny),absci(Q_pts),weight(Q_pts)
!************************************************************* --------------------------------CONSTANT ARRY
real, constant :: XiX(Q_pts,Q_pts,5),XiY(Q_pts,Q_pts,5)
real, constant :: XiR(Q_pts,Q_pts,5),YiR(Q_pts,Q_pts,5)
real, constant :: CXiX(Q_pts,Q_pts,5),CXiY(Q_pts,Q_pts,5)
! real, shared :: XiX(8,8,5),XiY(8,8,5)
! real, shared :: XiR(8,8,5),YiR(8,8,5)
! real, shared :: CXiX(8,8,5),CXiY(8,8,5)
!**************************************************************
! Submatrices are Declared to be in CUDA Shared Memory
!**************************************************************
integer :: CHK,Qr,Ir,Jr,Ixr,Jyr,I1,J1,I2,J2
real :: C0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11
real :: C12,C13,C14,C15,C16,C17,C18,C19,C20,C21
real :: Dummy1,Dummy2,Dummy3,Dummy4
real :: rij,uij,vij,Temij
real :: rxij,Temxij,uxij,vxij
real :: ryij,Temyij,uyij,vyij
!**************************************************************
! Start Execution, First Get My Thread Indices
!**************************************************************
tx = threadidx%x
ty = threadidx%y
io = (blockidx%x-1) * 16 + tx
jo = (blockidx%y-1) * 16 + ty
!***************************************************************
IF ((io.EQ.1).or.(jo.EQ.1).or.(io.EQ.Nx).or.(jo.EQ.Ny)) THEN
ELSE
!***************************************************************************************
!********************************** Useful Coefficient *********************************
! DO Lx=1,Q_pts;DO Ly=1,Q_pts
! XiX(Lx,Ly,1)=absci(Lx);XiY(Lx,Ly,1)=absci(Ly)
! XiR(Lx,Ly,1)=XiX(Lx,Ly,1)/XiY(Lx,Ly,1);YiR(Lx,Ly,1)=XiY(Lx,Ly,1)/XiX(Lx,Ly,1)
! CXiX(Lx,Ly,1)=2*Rc*XiX(Lx,Ly,1)*XiX(Lx,Ly,1)
! CXiY(Lx,Ly,1)=2*Rc*XiY(Lx,Ly,1)*XiY(Lx,Ly,1)
! XiX(Lx,Ly,2)=absci(Lx)-0.05;XiY(Lx,Ly,2)=absci(Ly)
! XiR(Lx,Ly,2)=XiX(Lx,Ly,2)/XiY(Lx,Ly,2);YiR(Lx,Ly,2)=XiY(Lx,Ly,2)/XiX(Lx,Ly,2)
! CXiX(Lx,Ly,2)=2*Rc*XiX(Lx,Ly,2)*XiX(Lx,Ly,2)
! CXiY(Lx,Ly,2)=2*Rc*XiY(Lx,Ly,2)*XiY(Lx,Ly,2)
! XiX(Lx,Ly,3)=absci(Lx)+0.05;XiY(Lx,Ly,3)=absci(Ly)
! XiR(Lx,Ly,3)=XiX(Lx,Ly,3)/XiY(Lx,Ly,3);YiR(Lx,Ly,3)=XiY(Lx,Ly,3)/XiX(Lx,Ly,3)
! CXiX(Lx,Ly,3)=2*Rc*XiX(Lx,Ly,3)*XiX(Lx,Ly,3)
! CXiY(Lx,Ly,3)=2*Rc*XiY(Lx,Ly,3)*XiY(Lx,Ly,3)
! XiX(Lx,Ly,4)=absci(Lx);XiY(Lx,Ly,4)=absci(Ly)-0.05
! XiR(Lx,Ly,4)=XiX(Lx,Ly,4)/XiY(Lx,Ly,4);YiR(Lx,Ly,4)=XiY(Lx,Ly,4)/XiX(Lx,Ly,4)
! CXiX(Lx,Ly,4)=2*Rc*XiX(Lx,Ly,4)*XiX(Lx,Ly,4)
! CXiY(Lx,Ly,4)=2*Rc*XiY(Lx,Ly,4)*XiY(Lx,Ly,4)
! XiX(Lx,Ly,5)=absci(Lx);XiY(Lx,Ly,5)=absci(Ly)+0.05
! XiR(Lx,Ly,5)=XiX(Lx,Ly,5)/XiY(Lx,Ly,5);YiR(Lx,Ly,5)=XiY(Lx,Ly,5)/XiX(Lx,Ly,5)
! CXiX(Lx,Ly,5)=2*Rc*XiX(Lx,Ly,5)*XiX(Lx,Ly,5)
! CXiY(Lx,Ly,5)=2*Rc*XiY(Lx,Ly,5)*XiY(Lx,Ly,5)
! ENDDO;ENDDO
!***************************************************************
Dummy1=0;Dummy2=0
Dummy3=0;Dummy4=0
DO Lx=1,Q_pts !"QUADRATURE LOOP START
DO Ly=1,Q_pts
Qr=1
I1=100;I2=101
J1=100;J2=101
rij=0;uij=0
vij=0;Temij=0
rxij=0;ryij=0
uxij=0;uyij=0
vxij=0;vyij=0
Temxij=0;Temyij=0
SELECT CASE (Qr)
! ****** QUADRATURE-I ******
CASE(1)
C0=-(1/DelT)-uxij;C1=-uyij
C2=((xx(io)-xx(I1))/DelT)-uij
C3=-vxij;C4=-(1/DelT)-vyij
C5=((yy(jo)-yy(J1))/DelT)-vij
! **************************
SELECT CASE (CaseS(Lx,Ly))
! ******* Ex=Ey=0 *********
CASE(0)
C6=C1*C3-C4*C0;C14=(C2*C4-C5*C1)/C6;C15=(C5*C0-C2*C3)/C6
! ******* JACOBIAN *******
C6=-C3/C4;C7=-C5/C4;C8=C0+C1*C6;C9=C2+C1*C7;C10=C8*C8
C11=2*C8*C9-CXiX(Lx,Ly,3)*((Temxij)+(Temyij)*C6)
C12=C9*C9-CXiX(Lx,Ly,3)*((Temij)+(Temyij)*C7)
C13=sqrt(C11*C11-4*C10*C12);C16=(-C11+C13)/(2*C10);C17=C6*C16+C7
if(sign(1.0,absci(Lx)).ne.sign(1.0,C0*C16+C1*C17+C2))then
C16=(-C11-C13)/(2*C10);C17=C6*C16+C7;endif
C18=(C16-C14)/0.05;C20=(C17-C15)/0.05
C6=-C1/C0;C7=-C2/C0;C8=C4+C3*C6;C9=C5+C3*C7;C10=C8*C8
C11=2*C8*C9-CXiY(Lx,Ly,5)*((Temyij)+(Temxij)*C6)
C12=C9*C9-CXiY(Lx,Ly,5)*((Temij)+(Temxij)*C7)
C13=sqrt(C11*C11-4*C10*C12);C17=(-C11+C13)/(2*C10);C16=C6*C17+C7
if(sign(1.0,absci(Ly)).ne.sign(1.0,C3*C16+C4*C17+C5))then
C17=(-C11-C13)/(2*C10);C16=C6*C17+C7;endif
C19=(C16-C14)/0.05;C21=(C17-C15)/0.05
! ******** Ey>Ex *********
CASE(1)
C6=-(C1-XiR(Lx,Ly,1)*C4)/(C0-XiR(Lx,Ly,1)*C3)
C7=-(C2-XiR(Lx,Ly,1)*C5)/(C0-XiR(Lx,Ly,1)*C3)
C8=C4+C3*C6
C9=C5+C3*C7
C10=C8*C8
C11=2*C8*C9-CXiY(Lx,Ly,1)*((Temyij)+(Temxij)*C6)
C12=C9*C9-CXiY(Lx,Ly,1)*((Temij)+(Temxij)*C7)
C13=sqrt(C11*C11-4*C10*C12)
C15=(-C11+C13)/(2*C10)
C14=C6*C15+C7
Qr=1
if(sign(1.0,absci(Ly)).ne.sign(1.0,C3*C14+C4*C15+C5))then
C15=(-C11-C13)/(2*C10)
C14=C6*C15+C7
Qr=-1;endif
! ******* JACOBIAN *******
C6=-(C1-XiR(Lx,Ly,3)*C4)/(C0-XiR(Lx,Ly,3)*C3)
C7=-(C2-XiR(Lx,Ly,3)*C5)/(C0-XiR(Lx,Ly,3)*C3)
C8=C4+C3*C6
C9=C5+C3*C7
C10=C8*C8
C11=2*C8*C9-CXiY(Lx,Ly,3)*((Temyij)+(Temxij)*C6)
C12=C9*C9-CXiY(Lx,Ly,3)*((Temij)+(Temxij)*C7)
C17=(-C11+Qr*sqrt(C11*C11-4*C10*C12))/(2*C10)
C16=C6*C17+C7
C18=(C16-C14)/0.05
C20=(C17-C15)/0.05
C6=-(C1-XiR(Lx,Ly,5)*C4)/(C0-XiR(Lx,Ly,5)*C3)
C7=-(C2-XiR(Lx,Ly,5)*C5)/(C0-XiR(Lx,Ly,5)*C3)
C8=C4+C3*C6
C9=C5+C3*C7
C10=C8*C8
C11=2*C8*C9-CXiY(Lx,Ly,5)*((Temyij)+(Temxij)*C6)
C12=C9*C9-CXiY(Lx,Ly,5)*((Temij)+(Temxij)*C7)
C17=(-C11+Qr*sqrt(C11*C11-4*C10*C12))/(2*C10)
C16=C6*C17+C7
C19=(C16-C14)/0.05
C21=(C17-C15)/0.05
! ******** Ex>Ey *********
CASE(2)
C6=-(YiR(Lx,Ly,1)*C0-C3)/(YiR(Lx,Ly,1)*C1-C4)
C7=-(YiR(Lx,Ly,1)*C2-C5)/(YiR(Lx,Ly,1)*C1-C4)
C8=C0+C1*C6
C9=C2+C1*C7
C10=C8*C8
C11=2*C8*C9-CXiX(Lx,Ly,1)*((Temxij)+(Temyij)*C6)
C12=C9*C9-CXiX(Lx,Ly,1)*((Temij)+(Temyij)*C7)
C13=sqrt(C11*C11-4*C10*C12)
C14=(-C11+C13)/(2*C10)
C15=C6*C14+C7
Qr=1
if(sign(1.0,absci(Lx)).ne.sign(1.0,C0*C14+C1*C15+C2))then
C14=(-C11-C13)/(2*C10)
C15=C6*C14+C7
Qr=-1;endif
! ******* JACOBIAN *******
C6=-(YiR(Lx,Ly,3)*C0-C3)/(YiR(Lx,Ly,3)*C1-C4)
C7=-(YiR(Lx,Ly,3)*C2-C5)/(YiR(Lx,Ly,3)*C1-C4)
C8=C0+C1*C6
C9=C2+C1*C7
C10=C8*C8
C11=2*C8*C9-CXiX(Lx,Ly,3)*((Temxij)+(Temyij)*C6)
C12=C9*C9-CXiX(Lx,Ly,3)*((Temij)+(Temyij)*C7)
C16=(-C11+Qr*sqrt(C11*C11-4*C10*C12))/(2*C10)
C17=C6*C16+C7
C18=(C16-C14)/0.05
C20=(C17-C15)/0.05
C6=-(YiR(Lx,Ly,5)*C0-C3)/(YiR(Lx,Ly,5)*C1-C4)
C7=-(YiR(Lx,Ly,5)*C2-C5)/(YiR(Lx,Ly,5)*C1-C4)
C8=C0+C1*C6
C9=C2+C1*C7
C10=C8*C8
C11=2*C8*C9-CXiX(Lx,Ly,5)*((Temxij)+(Temyij)*C6)
C12=C9*C9-CXiX(Lx,Ly,5)*((Temij)+(Temyij)*C7)
C16=(-C11+Qr*sqrt(C11*C11-4*C10*C12))/(2*C10)
C17=C6*C16+C7
C19=(C16-C14)/0.05
C21=(C17-C15)/0.05
ENDSELECT
! ****** QUADRATURE-II ******
CASE(2)
C0=(1/DelT)+uxij
C1=-uyij
C2=((xx(io)-xx(I2))/DelT)-uij
C3=vxij
C4=-(1/DelT)-vyij
C5=((yy(jo)-yy(J1))/DelT)-vij
! **************************
SELECT CASE (CaseS(Lx,Ly))
! ******* Ex=Ey=0 ********
CASE(0)
C6=C1*C3-C4*C0
C14=(C2*C4-C5*C1)/C6
C15=(C5*C0-C2*C3)/C6
! ******* JACOBIAN *******
C6=-C3/C4
C7=-C5/C4
C8=C0+C1*C6
C9=C2+C1*C7
C10=C8*C8
C11=2*C8*C9-CXiX(Lx,Ly,3)*(-Temxij+Temyij*C6)
C12=C9*C9-CXiX(Lx,Ly,3)*(Temij+Temyij*C7)
C13=sqrt(C11*C11-4*C10*C12)
C16=(-C11+C13)/(2*C10)
C17=C6*C16+C7
if(sign(1.0,absci(Lx)).ne.sign(1.0,C0*C16+C1*C17+C2))then
C16=(-C11-C13)/(2*C10)
C17=C6*C16+C7;endif
C18=(C14-C16)/0.05
C20=(C17-C15)/0.05
C6=-C1/C0
C7=-C2/C0
C8=C4+C3*C6
C9=C5+C3*C7
C10=C8*C8
C11=2*C8*C9-CXiY(Lx,Ly,5)*(Temyij+(-Temxij)*C6)
C12=C9*C9-CXiY(Lx,Ly,5)*(Temij+(-Temxij)*C7)
C13=sqrt(C11*C11-4*C10*C12)
C17=(-C11+C13)/(2*C10)
C16=C6*C17+C7
if(sign(1.0,absci(Ly)).ne.sign(1.0,C3*C16+C4*C17+C5))then
C17=(-C11-C13)/(2*C10)
C16=C6*C17+C7;endif
C19=(C14-C16)/0.05
C21=(C17-C15)/0.05
! ******** Ey>Ex *********
CASE(1)
C6=-(C1-XiR(Lx,Ly,1)*C4)/(C0-XiR(Lx,Ly,1)*C3)
C7=-(C2-XiR(Lx,Ly,1)*C5)/(C0-XiR(Lx,Ly,1)*C3)
C8=C4+C3*C6
C9=C5+C3*C7
C10=C8*C8
C11=2*C8*C9-CXiY(Lx,Ly,1)*(Temyij+(-Temxij)*C6)
C12=C9*C9-CXiY(Lx,Ly,1)*(Temij+(-Temxij)*C7)
C13=sqrt(C11*C11-4*C10*C12)
C15=(-C11+C13)/(2*C10)
C14=C6*C15+C7
Qr=1
if(sign(1.0,absci(Ly)).ne.sign(1.0,C3*C14+C4*C15+C5))then
C15=(-C11-C13)/(2*C10)
C14=C6*C15+C7
Qr=-1;endif
! ******* JACOBIAN *******
C6=-(C1-XiR(Lx,Ly,3)*C4)/(C0-XiR(Lx,Ly,3)*C3)
C7=-(C2-XiR(Lx,Ly,3)*C5)/(C0-XiR(Lx,Ly,3)*C3)
C8=C4+C3*C6
C9=C5+C3*C7
C10=C8*C8
C11=2*C8*C9-CXiY(Lx,Ly,3)*(Temyij+(-Temxij)*C6)
C12=C9*C9-CXiY(Lx,Ly,3)*(Temij+(-Temxij)*C7)
C17=(-C11+Qr*sqrt(C11*C11-4*C10*C12))/(2*C10)
C16=C6*C17+C7
C18=(C14-C16)/0.05
C20=(C17-C15)/0.05
C6=-(C1-XiR(Lx,Ly,5)*C4)/(C0-XiR(Lx,Ly,5)*C3)
C7=-(C2-XiR(Lx,Ly,5)*C5)/(C0-XiR(Lx,Ly,5)*C3)
C8=C4+C3*C6
C9=C5+C3*C7
C10=C8*C8
C11=2*C8*C9-CXiY(Lx,Ly,5)*(Temyij+(-Temxij)*C6)
C12=C9*C9-CXiY(Lx,Ly,5)*(Temij+(-Temxij)*C7)
C17=(-C11+Qr*sqrt(C11*C11-4*C10*C12))/(2*C10)
C16=C6*C17+C7;
C19=(C14-C16)/0.05
C21=(C17-C15)/0.05
! ******** Ex>Ey *********
CASE(2)
C6=-(YiR(Lx,Ly,1)*C0-C3)/(YiR(Lx,Ly,1)*C1-C4)
C7=-(YiR(Lx,Ly,1)*C2-C5)/(YiR(Lx,Ly,1)*C1-C4)
C8=C0+C1*C6
C9=C2+C1*C7
C10=C8*C8
C11=2*C8*C9-CXiX(Lx,Ly,1)*(-Temxij+Temyij*C6)
C12=C9*C9-CXiX(Lx,Ly,1)*(Temij+Temyij*C7)
C13=sqrt(C11*C11-4*C10*C12)
C14=(-C11+C13)/(2*C10)
C15=C6*C14+C7
Qr=1
if(sign(1.0,absci(Lx)).ne.sign(1.0,C0*C14+C1*C15+C2))then
C14=(-C11-C13)/(2*C10)
C15=C6*C14+C7
Qr=-1;endif
! ******* JACOBIAN *******
C6=-(YiR(Lx,Ly,3)*C0-C3)/(YiR(Lx,Ly,3)*C1-C4)
C7=-(YiR(Lx,Ly,3)*C2-C5)/(YiR(Lx,Ly,3)*C1-C4)
C8=C0+C1*C6
C9=C2+C1*C7
C10=C8*C8
C11=2*C8*C9-CXiX(Lx,Ly,3)*(-Temxij+Temyij*C6)
C12=C9*C9-CXiX(Lx,Ly,3)*(Temij+Temyij*C7)
C16=(-C11+Qr*sqrt(C11*C11-4*C10*C12))/(2*C10)
C17=C6*C16+C7
C18=(C14-C16)/0.05
C20=(C17-C15)/0.05
C6=-(YiR(Lx,Ly,5)*C0-C3)/(YiR(Lx,Ly,5)*C1-C4)
C7=-(YiR(Lx,Ly,5)*C2-C5)/(YiR(Lx,Ly,5)*C1-C4)
C8=C0+C1*C6
C9=C2+C1*C7
C10=C8*C8
C11=2*C8*C9-CXiX(Lx,Ly,5)*(-Temxij+Temyij*C6)
C12=C9*C9-CXiX(Lx,Ly,5)*(Temij+Temyij*C7)
C16=(-C11+Qr*sqrt(C11*C11-4*C10*C12))/(2*C10)
C17=C6*C16+C7
C19=(C14-C16)/0.05
C21=(C17-C15)/0.05
ENDSELECT
! ****** QUADRATURE-III ******
CASE(3)
C0=(1/DelT)+uxij
C1=uyij
C2=((xx(io)-xx(I2))/DelT)-uij
C3=vxij
C4=(1/DelT)+vyij
C5=((yy(jo)-yy(J2))/DelT)-vij
! **************************
SELECT CASE (CaseS(Lx,Ly))
! ******* Ex=Ey=0 **********
CASE(0)
C6=C1*C3-C4*C0
C14=(C2*C4-C5*C1)/C6
C15=(C5*C0-C2*C3)/C6
! ******* JACOBIAN *******
C6=-C3/C4
C7=-C5/C4
C8=C0+C1*C6
C9=C2+C1*C7
C10=C8*C8
C11=2*C8*C9-CXiX(Lx,Ly,3)*((-Temxij)+(-Temyij)*C6)
C12=C9*C9-CXiX(Lx,Ly,3)*(Temij+(-Temyij)*C7)
C13=sqrt(C11*C11-4*C10*C12)
C16=(-C11+C13)/(2*C10)
C17=C6*C16+C7
if(sign(1.0,absci(Lx)).ne.sign(1.0,C0*C16+C1*C17+C2))then
C16=(-C11-C13)/(2*C10)
C17=C6*C16+C7;endif
C18=(C14-C16)/0.05
C20=(C15-C17)/0.05
C6=-C1/C0
C7=-C2/C0
C8=C4+C3*C6
C9=C5+C3*C7
C10=C8*C8
C11=2*C8*C9-CXiY(Lx,Ly,5)*((-Temyij)+(-Temxij)*C6)
C12=C9*C9-CXiY(Lx,Ly,5)*(Temij+(-Temxij)*C7)
C13=sqrt(C11*C11-4*C10*C12)
C17=(-C11+C13)/(2*C10)
C16=C6*C17+C7
if(sign(1.0,absci(Ly)).ne.sign(1.0,C3*C16+C4*C17+C5))then
C17=(-C11-C13)/(2*C10)
C16=C6*C17+C7;endif
C19=(C14-C16)/0.05
C21=(C15-C17)/0.05
! ******** Ey>Ex *********
CASE(1)
C6=-(C1-XiR(Lx,Ly,1)*C4)/(C0-XiR(Lx,Ly,1)*C3)
C7=-(C2-XiR(Lx,Ly,1)*C5)/(C0-XiR(Lx,Ly,1)*C3)
C8=C4+C3*C6
C9=C5+C3*C7
C10=C8*C8
C11=2*C8*C9-CXiY(Lx,Ly,1)*((-Temyij)+(-Temxij)*C6)
C12=C9*C9-CXiY(Lx,Ly,1)*(Temij+(-Temxij)*C7)
C13=sqrt(C11*C11-4*C10*C12)
C15=(-C11+C13)/(2*C10)
C14=C6*C15+C7
Qr=1
if(sign(1.0,absci(Ly)).ne.sign(1.0,C3*C14+C4*C15+C5))then
C15=(-C11-C13)/(2*C10)
C14=C6*C15+C7
Qr=-1;endif
! ******* JACOBIAN *******
C6=-(C1-XiR(Lx,Ly,3)*C4)/(C0-XiR(Lx,Ly,3)*C3)
C7=-(C2-XiR(Lx,Ly,3)*C5)/(C0-XiR(Lx,Ly,3)*C3)
C8=C4+C3*C6
C9=C5+C3*C7
C10=C8*C8
C11=2*C8*C9-CXiY(Lx,Ly,3)*((-Temyij)+(-Temxij)*C6)
C12=C9*C9-CXiY(Lx,Ly,3)*(Temij+(-Temxij)*C7)
C17=(-C11+Qr*sqrt(C11*C11-4*C10*C12))/(2*C10)
C16=C6*C17+C7
C18=(C14-C16)/0.05
C20=(C15-C17)/0.05
C6=-(C1-XiR(Lx,Ly,5)*C4)/(C0-XiR(Lx,Ly,5)*C3)
C7=-(C2-XiR(Lx,Ly,5)*C5)/(C0-XiR(Lx,Ly,5)*C3)
C8=C4+C3*C6
C9=C5+C3*C7
C10=C8*C8
C11=2*C8*C9-CXiY(Lx,Ly,5)*((-Temyij)+(-Temxij)*C6)
C12=C9*C9-CXiY(Lx,Ly,5)*(Temij+(-Temxij)*C7)
C17=(-C11+Qr*sqrt(C11*C11-4*C10*C12))/(2*C10)
C16=C6*C17+C7
C19=(C14-C16)/0.05
C21=(C15-C17)/0.05
! ******** Ex>Ey *********
CASE(2)
C6=-(YiR(Lx,Ly,1)*C0-C3)/(YiR(Lx,Ly,1)*C1-C4)
C7=-(YiR(Lx,Ly,1)*C2-C5)/(YiR(Lx,Ly,1)*C1-C4)
C8=C0+C1*C6
C9=C2+C1*C7
C10=C8*C8
C11=2*C8*C9-CXiX(Lx,Ly,1)*((-Temxij)+(-Temyij)*C6)
C12=C9*C9-CXiX(Lx,Ly,1)*(Temij+(-Temyij)*C7)
C13=sqrt(C11*C11-4*C10*C12)
C14=(-C11+C13)/(2*C10)
C15=C6*C14+C7
Qr=1
if(sign(1.0,absci(Lx)).ne.sign(1.0,C0*C14+C1*C15+C2))then
C14=(-C11-C13)/(2*C10)
C15=C6*C14+C7
Qr=-1;endif
! ******* JACOBIAN *******
C6=-(YiR(Lx,Ly,3)*C0-C3)/(YiR(Lx,Ly,3)*C1-C4)
C7=-(YiR(Lx,Ly,3)*C2-C5)/(YiR(Lx,Ly,3)*C1-C4)
C8=C0+C1*C6
C9=C2+C1*C7
C10=C8*C8
C11=2*C8*C9-CXiX(Lx,Ly,3)*((-Temxij)+(-Temyij)*C6)
C12=C9*C9-CXiX(Lx,Ly,3)*(Temij+(-Temyij)*C7)
C16=(-C11+Qr*sqrt(C11*C11-4*C10*C12))/(2*C10)
C17=C6*C16+C7
C18=(C14-C16)/0.05
C20=(C15-C17)/0.05
C6=-(YiR(Lx,Ly,5)*C0-C3)/(YiR(Lx,Ly,5)*C1-C4)
C7=-(YiR(Lx,Ly,5)*C2-C5)/(YiR(Lx,Ly,5)*C1-C4)
C8=C0+C1*C6
C9=C2+C1*C7
C10=C8*C8
C11=2*C8*C9-CXiX(Lx,Ly,5)*((-Temxij)+(-Temyij)*C6)
C12=C9*C9-CXiX(Lx,Ly,5)*(Temij+(-Temyij)*C7)
C16=(-C11+Qr*sqrt(C11*C11-4*C10*C12))/(2*C10)
C17=C6*C16+C7
C19=(C14-C16)/0.05
C21=(C15-C17)/0.05
ENDSELECT
! ****** QUADRATURE-IV ******
CASE(4)
C0=-(1/DelT)-uxij
C1=uyij
C2=((xx(io)-xx(I1))/DelT)-uij
C3=-vxij
C4=(1/DelT)+vyij
C5=((yy(jo)-yy(J2))/DelT)-vij
! **************************
SELECT CASE (CaseS(Lx,Ly))
! ******* Ex=Ey=0 **********
CASE(0)
C6=C1*C3-C4*C0
C14=(C2*C4-C5*C1)/C6
C15=(C5*C0-C2*C3)/C6
! ******* JACOBIAN *******
C6=-C3/C4
C7=-C5/C4
C8=C0+C1*C6
C9=C2+C1*C7
C10=C8*C8
C11=2*C8*C9-CXiX(Lx,Ly,3)*(Temxij+(-Temyij)*C6)
C12=C9*C9-CXiX(Lx,Ly,3)*(Temij+(-Temyij)*C7)
C13=sqrt(C11*C11-4*C10*C12)
C16=(-C11+C13)/(2*C10)
C17=C6*C16+C7
if(sign(1.0,absci(Lx)).ne.sign(1.0,C0*C16+C1*C17+C2))then
C16=(-C11-C13)/(2*C10)
C17=C6*C16+C7;endif
C18=(C16-C14)/0.05
C20=(C15-C17)/0.05
C6=-C1/C0
C7=-C2/C0
C8=C4+C3*C6
C9=C5+C3*C7
C10=C8*C8
C11=2*C8*C9-CXiY(Lx,Ly,5)*((-Temyij)+Temxij*C6)
C12=C9*C9-CXiY(Lx,Ly,5)*(Temij+Temxij*C7)
C13=sqrt(C11*C11-4*C10*C12)
C17=(-C11+C13)/(2*C10)
C16=C6*C17+C7
if(sign(1.0,absci(Ly)).ne.sign(1.0,C3*C16+C4*C17+C5))then
C17=(-C11-C13)/(2*C10)
C16=C6*C17+C7;endif
C19=(C16-C14)/0.05
C21=(C15-C17)/0.05
! ******** Ey>Ex *********
CASE(1)
C6=-(C1-XiR(Lx,Ly,1)*C4)/(C0-XiR(Lx,Ly,1)*C3)
C7=-(C2-XiR(Lx,Ly,1)*C5)/(C0-XiR(Lx,Ly,1)*C3)
C8=C4+C3*C6
C9=C5+C3*C7
C10=C8*C8
C11=2*C8*C9-CXiY(Lx,Ly,1)*((-Temyij)+Temxij*C6)
C12=C9*C9-CXiY(Lx,Ly,1)*(Temij+Temxij*C7)
C13=sqrt(C11*C11-4*C10*C12)
C15=(-C11+C13)/(2*C10)
C14=C6*C15+C7
Qr=1
if(sign(1.0,absci(Ly)).ne.sign(1.0,C3*C14+C4*C15+C5))then
C15=(-C11-C13)/(2*C10)
C14=C6*C15+C7
Qr=-1;endif
! ******* JACOBIAN *******
C6=-(C1-XiR(Lx,Ly,3)*C4)/(C0-XiR(Lx,Ly,3)*C3)
C7=-(C2-XiR(Lx,Ly,3)*C5)/(C0-XiR(Lx,Ly,3)*C3)
C8=C4+C3*C6
C9=C5+C3*C7
C10=C8*C8
C11=2*C8*C9-CXiY(Lx,Ly,3)*((-Temyij)+Temxij*C6)
C12=C9*C9-CXiY(Lx,Ly,3)*(Temij+Temxij*C7)
C17=(-C11+Qr*sqrt(C11*C11-4*C10*C12))/(2*C10)
C16=C6*C17+C7
C18=(C16-C14)/0.05
C20=(C15-C17)/0.05
C6=-(C1-XiR(Lx,Ly,5)*C4)/(C0-XiR(Lx,Ly,5)*C3)
C7=-(C2-XiR(Lx,Ly,5)*C5)/(C0-XiR(Lx,Ly,5)*C3)
C8=C4+C3*C6
C9=C5+C3*C7
C10=C8*C8
C11=2*C8*C9-CXiY(Lx,Ly,5)*((-Temyij)+Temxij*C6)
C12=C9*C9-CXiY(Lx,Ly,5)*(Temij+Temxij*C7)
C17=(-C11+Qr*sqrt(C11*C11-4*C10*C12))/(2*C10)
C16=C6*C17+C7;
C19=(C16-C14)/0.05
C21=(C15-C17)/0.05
! ******** Ex>Ey *********
CASE(2)
C6=-(YiR(Lx,Ly,1)*C0-C3)/(YiR(Lx,Ly,1)*C1-C4)
C7=-(YiR(Lx,Ly,1)*C2-C5)/(YiR(Lx,Ly,1)*C1-C4)
C8=C0+C1*C6
C9=C2+C1*C7
C10=C8*C8
C11=2*C8*C9-CXiX(Lx,Ly,1)*(Temxij+(-Temyij)*C6)
C12=C9*C9-CXiX(Lx,Ly,1)*(Temij+(-Temyij)*C7)
C13=sqrt(C11*C11-4*C10*C12)
C14=(-C11+C13)/(2*C10)
C15=C6*C14+C7
Qr=1.0
if(sign(1.0,absci(Lx)).ne.sign(1.0,C0*C14+C1*C15+C2))then
C14=(-C11-C13)/(2.0*C10)
C15=C6*C14+C7
Qr=-1;endif
! ******* JACOBIAN *******
C6=-(YiR(Lx,Ly,3)*C0-C3)/(YiR(Lx,Ly,3)*C1-C4)
C7=-(YiR(Lx,Ly,3)*C2-C5)/(YiR(Lx,Ly,3)*C1-C4)
C8=C0+C1*C6
C9=C2+C1*C7
C10=C8*C8
C11=2.0*C8*C9-CXiX(Lx,Ly,3)*(Temxij+(-Temyij)*C6)
C12=C9*C9-CXiX(Lx,Ly,3)*(Temij+(-Temyij)*C7)
C16=(-C11+Qr*sqrt(C11*C11-4.0*C10*C12))/(2.0*C10)
C17=C6*C16+C7
C18=(C16-C14)/0.05
C20=(C15-C17)/0.05
C6=-(YiR(Lx,Ly,5)*C0-C3)/(YiR(Lx,Ly,5)*C1-C4)
C7=-(YiR(Lx,Ly,5)*C2-C5)/(YiR(Lx,Ly,5)*C1-C4)
C8=C0+C1*C6
C9=C2+C1*C7
C10=C8*C8
C11=2.0*C8*C9-CXiX(Lx,Ly,5)*(Temxij+(-Temyij)*C6)
C12=C9*C9-CXiX(Lx,Ly,5)*(Temij+(-Temyij)*C7)
C16=(-C11+Qr*sqrt(C11*C11-4.0*C10*C12))/(2.0*C10)
C17=C6*C16+C7
C19=(C16-C14)/0.05
C21=(C15-C17)/0.05
ENDSELECT
ENDSELECT
ENDDO
ENDDO !"QUADRATURE LOOP END"
ENDIF
!****************************************************************!
END SUBROUTINE K2_kernel
! ********** The Host Routine to Drive the Kernel ************ ---------------------------------------------SUBROUTINE
subroutine SUB_FULL(AddG,Nx,Ny,Loop,Q_pts,RSF,&
Rc,pi,DelT,DelX,D_2,xx,yy,absci,weight,&
&CaseS,XiX,XiY,XiR,YiR,CXiX,CXiY)
! assumed shape input arrays
integer AddG,Loop,Q_pts,RSF
real Rc,pi,DelT,DelX,D_2
real,dimension(:) :: xx,yy,absci,weight
real,dimension(:,:) :: rho,velx,vely,Temp
integer, dimension(:,:) :: CaseS
real, dimension(:,:,:) :: XiX,XiY
real, dimension(:,:,:) :: XiR,YiR
real, dimension(:,:,:) :: CXiX,CXiY
! Array dimensions
integer :: Nx, Ny, K, Lx, Ly
! allocatable device arrays
integer, device, allocatable, dimension(:,:) :: CaseSdev
real, device :: Rcdev,pidev,DelTdev,DelXdev,D_2dev
real, device, allocatable, dimension(:) :: xxdev,yydev,abscidev,weightdev
real, device, allocatable, dimension(:,:,:) :: XiXdev,XiYdev
real, device, allocatable, dimension(:,:,:) :: XiRdev,YiRdev
real, device, allocatable, dimension(:,:,:) :: CXiXdev,CXiYdev
! dim3 variables to define the grid and block shapes
type(dim3) :: dimGrid, dimBlock
integer :: r
! Get the array sizes
integer c1, c2, c3, c4
! Begin execution, first determine the sizes of the input arrays
Nx = size( xx )
Ny = size( yy )
! Start data xfer-inclusive timer and allocate the device arrays using F90 ALLOCATE
call system_clock( count=c1 )
allocate(CaseSdev(Q_pts,Q_pts))
allocate(xxdev(Nx),yydev(Ny),abscidev(Q_pts),weightdev(Q_pts))
allocate(XiXdev(Q_pts,Q_pts,5),XiYdev(Q_pts,Q_pts,5))
allocate(XiRdev(Q_pts,Q_pts,5),YiRdev(Q_pts,Q_pts,5))
allocate(CXiXdev(Q_pts,Q_pts,5),CXiYdev(Q_pts,Q_pts,5))
! Copy Variables to the Device using F90 Array Assignments
xxdev=xx(1:Nx)
yydev=yy(1:Ny)
abscidev=absci(1:Q_pts)
weightdev=weight(1:Q_pts)
Rcdev=Rc
pidev=pi
DelTdev=DelT
DelXdev=DelX
D_2dev=D_2
CaseSdev=CaseS(1:Q_pts,1:Q_pts)
XiXdev=XiX(1:Q_pts,1:Q_pts,1:5)
XiYdev=XiY(1:Q_pts,1:Q_pts,1:5)
XiRdev=XiR(1:Q_pts,1:Q_pts,1:5)
YiRdev=YiR(1:Q_pts,1:Q_pts,1:5)
CXiXdev=CXiX(1:Q_pts,1:Q_pts,1:5)
CXiYdev=CXiY(1:Q_pts,1:Q_pts,1:5)
call system_clock( count=c2 )
DO K=1,Loop/4
! ********** Call K2 KERNEL to Execute on GPU **************************!-----------------------------------TK2
dimGrid = dim3(Nx/16,Ny/16,1)
dimBlock = dim3(16,16,1)
! PRINT*,'[Grid] ',dimGrid
! PRINT*,'[Block]',dimBlock;PRINT*,''
! PRINT*,'***** CALLING KERNEL K2 ******'
! PRINT*,''
call K2_kernel<<<dimGrid,dimBlock>>>&
&(AddG,Nx,Ny,Loop,RSF,Q_pts,&
&Rcdev,pidev,DelTdev,DelXdev,D_2dev,&
&xxdev,yydev,abscidev,weightdev,CaseSdev,&
&XiXdev,XiYdev,XiRdev,YiRdev,CXiXdev,CXiYdev)
r = cudathreadsynchronize()
! **********************************************************************!
PRINT*,'TIME-STEP:',K
ENDDO
! ****************************** GPU Timer *******************************
call system_clock( count=c3 )
call system_clock( count=c4 )
PRINT*,''
PRINT*,'Time[Cal|Cop|All]:',(c3-c2)/1e6,(c4-c3)/1e6,(c4-c1)/1e6
PRINT*,''
! ************************************************************************
! Deallocate device arrays and exit
deallocate(CaseSdev,xxdev,yydev,abscidev,weightdev,&
&XiXdev,XiYdev,XiRdev,YiRdev,CXiXdev,CXiYdev)
END SUBROUTINE SUB_FULL
END MODULE new_mod
! ***************************** The main ******************************-----------------------------------MAIN PROGRAM
! Main program to initialize arrays, invoke add, check results
program add
use new_mod
integer,parameter :: AddG=0
integer Nx,Ny,I,J,U,Lx,Ly,Loop,test_nos,Q_pts,RSF
integer,dimension(:,:),allocatable :: CaseS
real,parameter :: Rc=0.002667,pi=3.14159265
real DelT,DelX,D_2,CFL,comptime,TIME,DTIME
real,dimension(:),allocatable :: xx,yy,absci,weight
real,dimension(:,:),allocatable :: rho,velx,vely,Temp
real,dimension(:,:,:),allocatable :: XiX,XiY,XiR,YiR,CXiX,CXiY
integer idevice, istat
Nx=800
Ny=800
CFL=0.25
comptime=0.25
Q_pts=2
RSF=1
!-----Allocate Array Size
Nx=Nx+AddG;Ny=Ny+AddG
allocate(xx(Nx),yy(Ny),rho(Nx,Ny),velx(Nx,Ny),vely(Nx,Ny),Temp(Nx,Ny))
!-----Grid-Size, Time-Step, Loop-Nos
DelX=(1.0/Nx);D_2=0.5*DelX
DelT=CFL*DelX;Loop=FLOOR(comptime/DelT)
!-----Allocate Array Size of Hermite-Guass Quadrature Points
allocate(CaseS(Q_pts,Q_pts),absci(Q_pts),weight(Q_pts))
allocate(XiX(Q_pts,Q_pts,5),XiY(Q_pts,Q_pts,5))
allocate(XiR(Q_pts,Q_pts,5),YiR(Q_pts,Q_pts,5))
allocate(CXiX(Q_pts,Q_pts,5),CXiY(Q_pts,Q_pts,5))
SELECTCASE (Q_pts)
CASE (1)
absci(1) =0;weight(1)=1.77245385091
CASE (2)
absci(1)= 0.70710678118654752438
absci(2)=-0.707106781186547524382
weight(1)=0.886226925452758013655
weight(2)=0.886226925452758013655
CASE (3)
absci(1)= 0
absci(2)= 1.22474487139158904909
absci(3)=-1.22474487139158904909
weight(1)=1.18163590060367735158
weight(2)=0.295408975150919337894
weight(3)=0.295408975150919337894
CASE (4)
absci(1)= 0.524647623275290317900
absci(2)=-0.524647623275290317900
absci(3)= 1.65068012388578455585
absci(4)=-1.65068012388578455585
weight(1)=0.804914090005512836482
weight(2)=0.804914090005512836482
weight(3)=0.0813128354472451771398
weight(4)=0.0813128354472451771398
CASE (5)
absci(1)= 0
absci(2)= 0.958572464613818507097
absci(3)=-0.958572464613818507097
absci(4)= 2.02018287045608563291
absci(5)=-2.02018287045608563291
weight(1)=0.945308720482941881207
weight(2)=0.393619323152241159825
weight(3)=0.393619323152241159825
weight(4)=0.0199532420590459132082
weight(5)=0.0199532420590459132082
CASE (6)
absci(1)= 0.436077411927616508688
absci(2)=-0.436077411927616508688
absci(3)= 1.33584907401369694976
absci(4)=-1.33584907401369694976
absci(5)= 2.35060497367449222281
absci(6)=-2.35060497367449222281
weight(1)=0.724629595224392524086
weight(2)=0.724629595224392524086
weight(3)=0.157067320322856643914
weight(4)=0.157067320322856643914
weight(5)=0.00453000990550884564102
weight(6)=0.00453000990550884564102
CASE (8)
absci(1)= 0.3811869902073222
absci(2)=-0.3811869902073222
absci(3)= 1.157193712446780
absci(4)=-1.157193712446780
absci(5)= 1.981656756695843
absci(6)=-1.981656756695843
absci(7)= 2.930637420257244
absci(8)=-2.930637420257244
weight(1)=0.6611470125561838
weight(2)=0.6611470125561838
weight(3)=0.2078023258142453
weight(4)=0.2078023258142453
weight(5)=0.01707798300736033
weight(6)=0.01707798300736033
weight(7)=0.0001996040722107463
weight(8)=0.0001996040722107463
ENDSELECT
!********************************** Useful Coefficient *********************************
DO Lx=1,Q_pts;DO Ly=1,Q_pts
XiX(Lx,Ly,1)=absci(Lx);XiY(Lx,Ly,1)=absci(Ly)
XiR(Lx,Ly,1)=XiX(Lx,Ly,1)/XiY(Lx,Ly,1);YiR(Lx,Ly,1)=XiY(Lx,Ly,1)/XiX(Lx,Ly,1)
CXiX(Lx,Ly,1)=2*Rc*XiX(Lx,Ly,1)*XiX(Lx,Ly,1)
CXiY(Lx,Ly,1)=2*Rc*XiY(Lx,Ly,1)*XiY(Lx,Ly,1)
XiX(Lx,Ly,2)=absci(Lx)-0.05;XiY(Lx,Ly,2)=absci(Ly)
XiR(Lx,Ly,2)=XiX(Lx,Ly,2)/XiY(Lx,Ly,2);YiR(Lx,Ly,2)=XiY(Lx,Ly,2)/XiX(Lx,Ly,2)
CXiX(Lx,Ly,2)=2*Rc*XiX(Lx,Ly,2)*XiX(Lx,Ly,2)
CXiY(Lx,Ly,2)=2*Rc*XiY(Lx,Ly,2)*XiY(Lx,Ly,2)
XiX(Lx,Ly,3)=absci(Lx)+0.05;XiY(Lx,Ly,3)=absci(Ly)
XiR(Lx,Ly,3)=XiX(Lx,Ly,3)/XiY(Lx,Ly,3);YiR(Lx,Ly,3)=XiY(Lx,Ly,3)/XiX(Lx,Ly,3)
CXiX(Lx,Ly,3)=2*Rc*XiX(Lx,Ly,3)*XiX(Lx,Ly,3)
CXiY(Lx,Ly,3)=2*Rc*XiY(Lx,Ly,3)*XiY(Lx,Ly,3)
XiX(Lx,Ly,4)=absci(Lx);XiY(Lx,Ly,4)=absci(Ly)-0.05
XiR(Lx,Ly,4)=XiX(Lx,Ly,4)/XiY(Lx,Ly,4);YiR(Lx,Ly,4)=XiY(Lx,Ly,4)/XiX(Lx,Ly,4)
CXiX(Lx,Ly,4)=2*Rc*XiX(Lx,Ly,4)*XiX(Lx,Ly,4)
CXiY(Lx,Ly,4)=2*Rc*XiY(Lx,Ly,4)*XiY(Lx,Ly,4)
XiX(Lx,Ly,5)=absci(Lx);XiY(Lx,Ly,5)=absci(Ly)+0.05
XiR(Lx,Ly,5)=XiX(Lx,Ly,5)/XiY(Lx,Ly,5);YiR(Lx,Ly,5)=XiY(Lx,Ly,5)/XiX(Lx,Ly,5)
CXiX(Lx,Ly,5)=2*Rc*XiX(Lx,Ly,5)*XiX(Lx,Ly,5)
CXiY(Lx,Ly,5)=2*Rc*XiY(Lx,Ly,5)*XiY(Lx,Ly,5)
ENDDO;ENDDO
!***************************************************************************************
DO Lx=1,Q_pts;DO Ly=1,Q_pts;IF((abs(absci(Lx))+abs(absci(Ly))).LT.1E-7)THEN
CaseS(Lx,Ly)=0;ELSEIF(abs(absci(Lx)).LT.abs(absci(Ly)))THEN
CaseS(Lx,Ly)=1;ELSEIF(abs(absci(Ly)).LT.abs(absci(Lx)))THEN
CaseS(Lx,Ly)=2;ELSE;CaseS(Lx,Ly)=2
ENDIF;ENDDO;ENDDO
!***************************************************************************************
!-----Print Setting
PRINT*,''
PRINT*,'|Iteration-Nos|',Loop;PRINT*,''
!-----Define Node Position
xx(1)=0;do I=2,Nx;xx(I)=xx(I-1)+DelX;end do
yy(1)=0;do J=2,Ny;yy(J)=yy(J-1)+DelX;end do
!-----Start iteration
print *, 'Simulation Start !!!'
print *, ''
! **************** Initialize CPU Device *********************
idevice=0;istat=cudaSetDevice(idevice)
! ************************************************************
! ********** Call Subroutine to Execute on GPU *************** ---------------------------------------------TSUBROUTINE
CALL CPU_TIME(TIME)
PRINT*,'***** CALLING SUBROUTINE *****'
PRINT*,''
call SUB_FULL(AddG,Nx,Ny,Loop,Q_pts,RSF,Rc,pi,&
&DelT,DelX,D_2,xx,yy,absci,weight,&
&CaseS,XiX,XiY,XiR,YiR,CXiX,CXiY)
CALL CPU_TIME(DTIME)
PRINT*, 'TOTAL GPU(s)=',DTIME-TIME
PRINT*,''
! ************************************************************
end program
Many Thanks!
sinsin