Hi Mat, thanks for sending those two links, I found them very informative and helpful.
I was talking about acmdn subroutine since tridag is already a device subroutine. But I understood your point.
Here is what I want to do now. I moved all do loops into gpu. I also created a data region begins at the start of “reyneq” subroutine in Reynolds.f90 until the end of the subroutine. I am doing this to reduce execution time (now I have only 2.5X faster code than the original). I hope by eliminating data copy back and forth, I can increase speed even more than 2.5X.
here is an updated version that I just finished:
subroutine reyneq
use Dyn_sizes_module
use master_module
use config_module
use sl_mesh_module
use filmtk_module
use flows_module
use average_module
use bearing_module
use roughness_module
use reynolds_module
use dynamic0_module
implicit none
real(8) :: H2IP_B, AP2, AP3, AP1, AP4, APJ_B, RATIO, DEXP, AMI_B, RAK, H2JP, AW4, AW3, AW2, AW1, &
BJ, QNJP_B, PEMI_B, PEPJ, H2IM, APJ, H2IP, API, H2JP_B, DABS, PEPJ_B, H2JM, PEMJ_B, DMI, DMJ, &
AN4, AN1, AN2, AN3, PEPI_B, DPI_B, AMI, AMJ, PN_B, API_B, AMJ_B, PN, H2JM_B, AJ, RAKPRE, &
FPJ, AE1, AE3, AE2, AE4, QNIM_B, P2JM, FMI_B, QNJP, DELYJM2, DELYJM1, QNH2JM, AMULT, DPJ_B, &
FMJ_B, DJ, P2IM, DELXIM2, DELXIM1, FPJ_B, P2IP, QNH2IM_B, AS3, CJ, AS1, DMI_B, AS4, &
QNH2JP, FMJ, FMI, ZTAM, QNH2JP_B, ETAP, FPI_B, DELYJP1, DELYJP2, ETAM, AR1, AR2, AR3, AR4, &
AS2, QNJM_B, XMXP, DPJ, DPI, P2JP, QNH2IM, DELXIP1, DELXIP2, QNIP, QNH2JM_B, H2IM_B, QNIM, QNH2IP, &
PEPI, QNH2IP_B, AE, DMJ_B, FPI, AN, AP, AS, AR, AW, DP4, DP3, DP2, DP1, QNIP_B, YMYP, PEMJ, &
ZTAP, PEMI, QNJM
real(8) :: aj1,bj1,cj1,dj1
integer :: NY1M1, IP1, NY4, JP1, NX5, NX4, NX1, NX3, NX2, I1, I2, IM1, KINT, JM1, I, K, J, L, &
IDIRECT, NX1M1, NY2, NY3, NY1, J1, J2, NY5
dimension aw(nxmax,nymax),ap(nxmax,nymax),ae(nxmax,nymax), &
ar(nxmax,nymax),as(nxmax,nymax),an(nxmax,nymax), &
aj(nxmax),bj(nxmax),cj(nxmax),dj(nxmax)
dimension aw1(nxmax1,nymax1),ap1(nxmax1,nymax1), &
ae1(nxmax1,nymax1),as1(nxmax1,nymax1), &
an1(nxmax1,nymax1),ar1(nxmax1,nymax1), &
dp1(nxmax1,nymax1)
dimension aw2(nxmax2,nymax2),ap2(nxmax2,nymax2), &
ae2(nxmax2,nymax2),as2(nxmax2,nymax2), &
an2(nxmax2,nymax2),ar2(nxmax2,nymax2), &
dp2(nxmax2,nymax2)
dimension aw3(nxmax3,nymax3),ap3(nxmax3,nymax3), &
ae3(nxmax3,nymax3),as3(nxmax3,nymax3), &
an3(nxmax3,nymax3),ar3(nxmax3,nymax3), &
dp3(nxmax3,nymax3)
dimension aw4(nxmax4,nymax4),ap4(nxmax4,nymax4), &
ae4(nxmax4,nymax4),as4(nxmax4,nymax4), &
an4(nxmax4,nymax4),ar4(nxmax4,nymax4), &
dp4(nxmax4,nymax4)
!$acc routine(tridag) seq
!$acc routine(flow) seq
nx1 = 2 + (nx-2)/2
ny1 = 2 + (ny-2)/2
nx2 = 2 + (nx1-2)/2
ny2 = 2 + (ny1-2)/2
nx3 = 2 + (nx2-2)/2
ny3 = 2 + (ny2-2)/2
nx4 = 2 + (nx3-2)/2
ny4 = 2 + (ny3-2)/2
nreyneq=0
5 nreyneq=nreyneq+1
!$acc data copyin(xref,yref,aw,ae,as,an,ap,ar,bearx,beary, &
!$acc hydnew,hxlnew,hyunew,hxrnew,hxrnew,zta,eta,p,pold,hnew,omega0,dt) &
!$acc copyout(p)
!$acc update device (t1, t2, t3, t4, gama, pir, pit, d0, zdat,qndat, &
!$acc nter, icoe, iqpo)
!$acc parallel loop gang vector collapse(2)
!acc private(ami_b,dmi_b,pemi_b,api_b,dpi_b,pepi_b,amj_b,dmj_b,pemj_b, &
!acc apj_b,dpj_b,pepj_b,pn,pn_b,qnim,qnim_b,qnip,qnip_b,qnjp_b,qnjp,qnjm,qnjm_b)
do 300 j=2,nym1
do 250 i=2,nxm1
jm1=j-1
jp1=j+1
delyjp1=yref(jp1)-yref(j )
delyjm1=yref(j )-yref(jm1)
if(j.ne.nym1) delyjp2=yref(j+2)-yref(jp1)
if(j.ne.2) delyjm2=yref(jm1)-yref(j-2)
ymyp =(yref(jp1)-yref(jm1))/2.d0
im1=i-1
ip1=i+1
delxip1=xref(ip1)-xref(i )
delxim1=xref(i )-xref(im1)
if(i.ne.nxm1) delxip2=xref(i+2)-xref(ip1)
if(i.ne.2) delxim2=xref(im1)-xref(i-2)
xmxp =(xref(ip1)-xref(im1))/2.d0
h2im = hydnew(i ,j )
h2ip = hydnew(i+1,j )
h2jm = hxlnew(i ,j )
h2jp = hxlnew(i ,j+1)
h2im_b = hyunew(i ,j )
h2ip_b = hyunew(i+1,j )
h2jm_b = hxrnew(i ,j )
h2jp_b = hxrnew(i ,j+1)
ztam=zta(i ,j )
ztap=zta(i ,j+1)
etam=eta(i ,j )
etap=eta(i+1,j )
p2im = (p(im1,j)+p(i,j))/2.d0
p2ip = (p(ip1,j)+p(i,j))/2.d0
p2jm = (p(i,jm1)+p(i,j))/2.d0
p2jp = (p(i,jp1)+p(i,j))/2.d0
pn=p2im*h2im
call flow(pn,qnim)
qnh2im = qnim*h2im*h2im
fmi = h2im*(bearx(im1,j)+bearx(i,j))/2.d0
dmi=qnh2im/delxim1
pemi=fmi/dmi
if(etam.eq.1.d0) then
ami_b=0.d0
fmi_b=0.d0
else
pn_b=p2im*h2im_b
call flow(pn_b,qnim_b)
qnh2im_b = qnim_b*h2im_b*h2im_b
fmi_b = h2im_b*(bearx(im1,j)+bearx(i,j))/2.d0
dmi_b=qnh2im_b/delxim1
pemi_b=fmi_b/dmi_b
endif
pn=p2ip*h2ip
call flow(pn,qnip)
qnh2ip = qnip*h2ip*h2ip
fpi = h2ip*(bearx(ip1,j)+bearx(i,j))/2.d0
dpi=qnh2ip/delxip1
pepi=fpi/dpi
if(etap.eq.1.d0) then
api_b=0.d0
fpi_b=0.d0
else
pn_b=p2ip*h2ip_b
call flow(pn_b,qnip_b)
qnh2ip_b = qnip_b*h2ip_b*h2ip_b
fpi_b = h2ip_b*(bearx(ip1,j)+bearx(i,j))/2.d0
dpi_b=qnh2ip_b/delxip1
pepi_b=fpi_b/dpi_b
endif
pn=p2jm*h2jm
call flow(pn,qnjm)
qnh2jm = qnjm*h2jm*h2jm
fmj = h2jm*(beary(i,jm1)+beary(i,j))/2.d0
dmj=qnh2jm/delyjm1
pemj=fmj/dmj
if(ztam.eq.1.d0) then
amj_b=0.d0
fmj_b=0.d0
else
pn_b=p2jm*h2jm_b
call flow(pn_b,qnjm_b)
qnh2jm_b = qnjm_b*h2jm_b*h2jm_b
fmj_b = h2jm_b*(beary(i,jm1)+beary(i,j))/2.d0
dmj_b=qnh2jm_b/delyjm1
pemj_b=fmj_b/dmj_b
endif
pn=p2jp*h2jp
call flow(pn,qnjp)
qnh2jp = qnjp*h2jp*h2jp
fpj = h2jp*(beary(i,jp1)+beary(i,j))/2.d0
dpj=qnh2jp/delyjp1
pepj=fpj/dpj
if(ztap.eq.1.d0) then
apj_b=0.d0
fpj_b=0.d0
else
pn_b=p2jp*h2jp_b
call flow(pn_b,qnjp_b)
qnh2jp_b = qnjp_b*h2jp_b*h2jp_b
fpj_b = h2jp_b*(beary(i,jp1)+beary(i,j))/2.d0
dpj_b=qnh2jp_b/delyjp1
pepj_b=fpj_b/dpj_b
endif
go to (11,12,13,14,15),idisc
11 ami=dmax1(0.d0,dmi*(1.d0-0.1d0*dabs(pemi))**5)
api=dmax1(0.d0,dpi*(1.d0-0.1d0*dabs(pepi))**5)
amj=dmax1(0.d0,dmj*(1.d0-0.1d0*dabs(pemj))**5)
apj=dmax1(0.d0,dpj*(1.d0-0.1d0*dabs(pepj))**5)
if(etam.ne.1.d0) then
ami_b=dmax1(0.d0,dmi_b*(1.d0-0.1d0*dabs(pemi_b))**5)
endif
if(etap.ne.1.d0) then
api_b=dmax1(0.d0,dpi_b*(1.d0-0.1d0*dabs(pepi_b))**5)
endif
if(ztam.ne.1.d0) then
amj_b=dmax1(0.d0,dmj_b*(1.d0-0.1d0*dabs(pemj_b))**5)
endif
if(ztap.ne.1.d0) then
apj_b=dmax1(0.d0,dpj_b*(1.d0-0.1d0*dabs(pepj_b))**5)
endif
goto 20
12 ami=dmi*(1.d0-0.5d0*dabs(pemi))
api=dpi*(1.d0-0.5d0*dabs(pepi))
amj=dmj*(1.d0-0.5d0*dabs(pemj))
apj=dpj*(1.d0-0.5d0*dabs(pepj))
if(etam.ne.1.d0) then
ami_b=dmi_b*(1.d0-0.5d0*dabs(pemi_b))
endif
if(etap.ne.1.d0) then
api_b=dpi_b*(1.d0-0.5d0*dabs(pepi_b))
endif
if(ztam.ne.1.d0) then
amj_b=dmj_b*(1.d0-0.5d0*dabs(pemj_b))
endif
if(ztap.ne.1.d0) then
apj_b=dpj_b*(1.d0-0.5d0*dabs(pepj_b))
endif
goto 20
13 ami=dmi
api=dpi
amj=dmj
apj=dpj
if(etam.ne.1.d0) then
ami_b=dmi_b
endif
if(etap.ne.1.d0) then
api_b=dpi_b
endif
if(ztam.ne.1.d0) then
amj_b=dmj_b
endif
if(ztap.ne.1.d0) then
apj_b=dpj_b
endif
goto 20
14 ami=dmax1(0.d0,dmi*(1.0d0-0.5d0*dabs(pemi)))
api=dmax1(0.d0,dpi*(1.0d0-0.5d0*dabs(pepi)))
amj=dmax1(0.d0,dmj*(1.0d0-0.5d0*dabs(pemj)))
apj=dmax1(0.d0,dpj*(1.0d0-0.5d0*dabs(pepj)))
if(etam.ne.1.d0) then
ami_b=dmax1(0.d0,dmi_b*(1.0d0-0.5d0*dabs(pemi_b)))
endif
if(etap.ne.1.d0) then
api_b=dmax1(0.d0,dpi_b*(1.0d0-0.5d0*dabs(pepi_b)))
endif
if(ztam.ne.1.d0) then
amj_b=dmax1(0.d0,dmj_b*(1.0d0-0.5d0*dabs(pemj_b)))
endif
if(ztap.ne.1.d0) then
apj_b=dmax1(0.d0,dpj_b*(1.0d0-0.5d0*dabs(pepj_b)))
endif
goto 20
15 ami=dmi*dabs(pemi)/(dexp(dabs(pemi))-1.d0)
api=dpi*dabs(pepi)/(dexp(dabs(pepi))-1.d0)
amj=dmj*dabs(pemj)/(dexp(dabs(pemj))-1.d0)
apj=dpj*dabs(pepj)/(dexp(dabs(pepj))-1.d0)
if(etam.ne.1.d0) then
ami_b=dmi_b*dabs(pemi_b)/(dexp(dabs(pemi_b))-1.d0)
endif
if(etap.ne.1.d0) then
api_b=dpi_b*dabs(pepi_b)/(dexp(dabs(pepi_b))-1.d0)
endif
if(ztam.ne.1.d0) then
amj_b=dmj_b*dabs(pemj_b)/(dexp(dabs(pemj_b))-1.d0)
endif
if(ztap.ne.1.d0) then
apj_b=dpj_b*dabs(pepj_b)/(dexp(dabs(pepj_b))-1.d0)
endif
20 continue
aw(im1,jm1)=-ymyp*(etam*(ami+dmax1( fmi,0.d0)) + (1.d0-etam)*(ami_b+dmax1( fmi_b,0.d0)))
ae(im1,jm1)=-ymyp*(etap*(api+dmax1(-fpi,0.d0)) + (1.d0-etap)*(api_b+dmax1(-fpi_b,0.d0)))
as(im1,jm1)=-xmxp*(ztam*(amj+dmax1( fmj,0.d0)) + (1.d0-ztam)*(amj_b+dmax1( fmj_b,0.d0)))
an(im1,jm1)=-xmxp*(ztap*(apj+dmax1(-fpj,0.d0)) + (1.d0-ztap)*(apj_b+dmax1(-fpj_b,0.d0)))
ap(im1,jm1)=-aw(im1,jm1)-ae(im1,jm1) &
-as(im1,jm1)-an(im1,jm1) &
-ymyp*(etam*fmi+(1.d0-etam)*fmi_b) &
+ymyp*(etap*fpi+(1.d0-etap)*fpi_b) &
-xmxp*(ztam*fmj+(1.d0-ztam)*fmj_b) &
+xmxp*(ztap*fpj+(1.d0-ztap)*fpj_b)
if(idirect.ne.1) then
amult = xmxp * ymyp * si / dt / omega0
ar(im1,jm1)=amult*h(i,j)*pold(i,j)
ap(im1,jm1)=ap(im1,jm1)+amult*hnew(i,j)
else
ar(im1,jm1)=0.d0
endif
250 continue
300 continue
akd=0.d0
akn=0.d0
!$acc parallel loop reduction(+:akn,akd)
do i=2,nxm1
im1=i-1
ip1=i+1
!$acc loop vector reduction(+:akn,akd)
do j=2,nym1
jm1=j-1
jp1=j+1
aj1=aw(im1,jm1)
bj1=ap(im1,jm1)
cj1=ae(im1,jm1)
dj1=ar(im1,jm1)-as(im1,jm1)*p(i,jm1) - an(im1,jm1)*p(i,jp1)
if(i.eq.2) then
dj1=dj1-aw(im1,jm1)*p(im1,j)
aj1=0.d0
else if (i.eq.nxm1) then
dj1=dj1-ae(im1,jm1)*p(ip1,j)
cj1=0.d0
endif
akd=akd+dabs(aj1*p(im1,j)+bj1*p(i,j)+cj1*p(ip1,j)-dj1)
akn=akn+dabs(bj1*p(i,j))
enddo
enddo
ak=akd/akn
if(nreyneq.gt.1.and.(ak.lt.akmax.or.nreyneq.gt.nitmax))return
rakpre=ak
kint=0
50 kint=kint+1
!$acc kernels loop gang vector independent private(aj,bj,cj,dj)
do 500 j=2,nym1
jm1=j-1
jp1=j+1
ymyp=(yref(jp1)-yref(jm1))/2.d0
!$acc loop vector independent
do 450 i=2,nxm1
im1=i-1
ip1=i+1
xmxp=(xref(ip1)-xref(im1))/2.d0
aj(im1)=aw(im1,jm1)
bj(im1)=ap(im1,jm1)
cj(im1)=ae(im1,jm1)
dj(im1)=ar(im1,jm1)-as(im1,jm1)*p(i,jm1) - an(im1,jm1)*p(i,jp1)
if(i.eq.2) then
dj(im1)=dj(im1)-aw(im1,jm1)*p(im1,j)
aj(im1)=0.d0
else if (i.eq.nxm1) then
dj(im1)=dj(im1)-ae(im1,jm1)*p(ip1,j)
cj(im1)=0.d0
endif
450 continue
call tridag(nxm2,aj,bj,cj,dj)
p(2:nxm1,j)=dj(1:nxm2)
500 continue
!$acc parallel loop gang
do i=2,nxm1
im1=i-1
ip1=i+1
xmxp=(xref(ip1)-xref(im1))/2.d0
aj1 = 0.d0
bj1 = 0.d0
cj1 = 0.d0
dj1 = 0.d0
!$acc loop vector reduction(+:aj1,bj1,cj1,dj1)
do j=2,nym1
jm1=j-1
jp1=j+1
ymyp=(yref(jp1)-yref(jm1))/2.d0
aj1=aj1+aw(im1,jm1)
bj1=bj1+ap(im1,jm1)
if(j.ne.2) bj1=bj1+as(im1,jm1)
if(j.ne.nym1) bj1=bj1+an(im1,jm1)
cj1=cj1+ae(im1,jm1)
dj1=dj1+ar(im1,jm1)-aw(im1,jm1)*p(im1,j) &
-ae(im1,jm1)*p(ip1,j)-as(im1,jm1)*p(i,jm1) &
-an(im1,jm1)*p(i,jp1)-ap(im1,jm1)*p(i ,j)
enddo
aj(im1)= aj1
bj(im1)= bj1
cj(im1)= cj1
dj(im1)= dj1
enddo
aj(1)=0.d0
cj(nxm2)=0.d0
call tridag(nxm2,aj,bj,cj,dj)
!$acc parallel loop gang vector collapse(2)
do i=1,nxm2
do j=1,nym2
p(i+1,j+1)=p(i+1,j+1)+dj(i)
enddo
enddo
!$acc kernels loop gang vector independent private(aj,bj,cj,dj)
do 700 i=2,nxm1
im1=i-1
ip1=i+1
xmxp=(xref(ip1)-xref(im1))/2.d0
!$acc loop vector independent
do 650 j=2,nym1
jm1=j-1
jp1=j+1
ymyp=(yref(jp1)-yref(jm1))/2.d0!
aj(jm1)=as(im1,jm1)
bj(jm1)=ap(im1,jm1)
cj(jm1)=an(im1,jm1)
dj(jm1)=ar(im1,jm1)-aw(im1,jm1)*p(im1,j) - ae(im1,jm1)*p(ip1,j)
if(j.eq.2) then
dj(jm1)=dj(jm1)-as(im1,jm1)*p(i,jm1)
aj(jm1)=0.d0
else if(j.eq.nym1) then
dj(jm1)=dj(jm1)-an(im1,jm1)*p(i,jp1)
cj(jm1)=0.d0
end if
650 continue
call tridag(nym2,aj,bj,cj,dj)
p(i,2:nym1)=dj(1:nym2)
700 continue
!$acc parallel loop gang
do j=2,nym1
jm1=j-1
jp1=j+1
ymyp=(yref(jp1)-yref(jm1))/2.d0
aj1 = 0.d0
bj1 = 0.d0
cj1 = 0.d0
dj1 = 0.d0
!$acc loop vector reduction(+:aj1,bj1,cj1,dj1)
do i=2,nxm1
im1=i-1
ip1=i+1
xmxp=(xref(ip1)-xref(im1))/2.d0
aj1=aj1+as(im1,jm1)
bj1=bj1+ap(im1,jm1)
if(i.ne.2) bj1=bj1+aw(im1,jm1) ! do something about this!
if(i.ne.nxm1) bj1=bj1+ae(im1,jm1) ! and this
cj1=cj1+an(im1,jm1)
dj1=dj1+ar(im1,jm1)-aw(im1,jm1)*p(im1,j) &
-ae(im1,jm1)*p(ip1,j)-as(im1,jm1)*p(i,jm1) &
-an(im1,jm1)*p(i,jp1)-ap(im1,jm1)*p(i ,j)
enddo
aj(jm1)= aj1
bj(jm1)= bj1
cj(jm1)= cj1
dj(jm1)= dj1
enddo
aj(1)=0.d0
cj(nym2)=0.d0
call tridag(nym2,aj,bj,cj,dj)
!$acc parallel loop gang vector collapse(2)
do j=1,nym2
do i=1,nxm2
p(i+1,j+1)=p(i+1,j+1)+dj(j)
enddo
enddo
akd=0.d0
akn=0.d0
!$acc parallel loop reduction(+:akn,akd)
do i=2,nxm1
im1=i-1
ip1=i+1
!$acc loop vector reduction(+:akn,akd)
do j=2,nym1
jm1=j-1
jp1=j+1
aj1=aw(im1,jm1)
bj1=ap(im1,jm1)
cj1=ae(im1,jm1)
dj1=ar(im1,jm1)-as(im1,jm1)*p(i,jm1) - an(im1,jm1)*p(i,jp1)
if(i.eq.2) then ! do something about this..
dj1=dj1-aw(im1,jm1)*p(im1,j)
aj1=0.d0
else if (i.eq.nxm1) then ! and this..
dj1=dj1-ae(im1,jm1)*p(ip1,j)
cj1=0.d0
endif
akd=akd+dabs(aj1*p(im1,j)+bj1*p(i,j) + cj1*p(ip1,j)-dj1)
akn=akn+dabs(bj1*p(i,j))
enddo
enddo
rak=akd/akn
ratio=rak/rakpre
rakpre=rak
if(ratio.le.0.5d0) goto 50
call acmdn(aw,ae,as,an,ap,ar,p,aw1,ae1,as1,an1,ap1,ar1,dp1, &
nx,ny,nx1,ny1,nxmax,nymax,nxmax1,nymax1,1)
call acmdn(aw1,ae1,as1,an1,ap1,ar1,dp1,aw2,ae2,as2,an2,ap2, &
ar2,dp2,nx1,ny1,nx2,ny2,nxmax1,nymax1,nxmax2,nymax2,2)
call acmdn(aw2,ae2,as2,an2,ap2,ar2,dp2,aw3,ae3,as3,an3,ap3, &
ar3,dp3,nx2,ny2,nx3,ny3,nxmax2,nymax2,nxmax3,nymax3,4)
call acmdn(aw3,ae3,as3,an3,ap3,ar3,dp3,aw4,ae4,as4,an4,ap4, &
ar4,dp4,nx3,ny3,nx4,ny4,nxmax3,nymax3,nxmax4,nymax4,16)
call acmup(aw3,ae3,as3,an3,ap3,ar3,dp3,dp4,nx3,ny3, &
nx4,ny4,nxmax3,nymax3,nxmax4,nymax4,16)
call acmup(aw2,ae2,as2,an2,ap2,ar2,dp2,dp3,nx2,ny2, &
nx3,ny3,nxmax2,nymax2,nxmax3,nymax3,16)
call acmup(aw1,ae1,as1,an1,ap1,ar1,dp1,dp2,nx1,ny1, &
nx2,ny2,nxmax1,nymax1,nxmax2,nymax2,2)
nx1m1=nx1-1
ny1m1=ny1-1
!$acc parallel loop gang vector collapse(2)
do k=2,nx1m1
do l=2,ny1m1
i1=2*(k-1)
i2=2*(k-1)+1
j1=2*(l-1)
j2=2*(l-1)+1
p(i1,j1)=p(i1,j1)+dp1(k,l)
p(i2,j1)=p(i2,j1)+dp1(k,l)
p(i1,j2)=p(i1,j2)+dp1(k,l)
p(i2,j2)=p(i2,j2)+dp1(k,l)
end do
end do
!$acc kernels loop gang vector independent private(aj,bj,cj,dj)
do 800 j=2,nym1
jm1=j-1
jp1=j+1
ymyp=(yref(jp1)-yref(jm1))/2.d0
!$acc loop vector independent
do 750 i=2,nxm1
im1=i-1
ip1=i+1
xmxp=(xref(ip1)-xref(im1))/2.d0
aj(im1)=aw(im1,jm1)
bj(im1)=ap(im1,jm1)
cj(im1)=ae(im1,jm1)
dj(im1)=ar(im1,jm1)-as(im1,jm1)*p(i,jm1) - an(im1,jm1)*p(i,jp1)
if(i.eq.2) then
dj(im1)=dj(im1)-aw(im1,jm1)*p(im1,j)
aj(im1)=0.d0
else if (i.eq.nxm1) then
dj(im1)=dj(im1)-ae(im1,jm1)*p(ip1,j)
cj(im1)=0.d0
endif
750 continue
call tridag(nxm2,aj,bj,cj,dj)
p(2:nxm1,j)=dj(1:nxm2)
800 continue
!$acc parallel loop gang
do i=2,nxm1
im1=i-1
ip1=i+1
xmxp=(xref(ip1)-xref(im1))/2.d0
aj1 = 0.d0
bj1 = 0.d0
cj1 = 0.d0
dj1 = 0.d0
!$acc loop vector reduction(+:aj1,bj1,cj1,dj1)
do j=2,nym1
jm1=j-1
jp1=j+1
ymyp=(yref(jp1)-yref(jm1))/2.d0
aj1=aj1+aw(im1,jm1)
bj1=bj1+ap(im1,jm1)
if(j.ne.2) bj1=bj1+as(im1,jm1)
if(j.ne.nym1) bj1=bj1+an(im1,jm1)
cj1=cj1+ae(im1,jm1)
dj1=dj1+ar(im1,jm1)-aw(im1,jm1)*p(im1,j) &
-ae(im1,jm1)*p(ip1,j)-as(im1,jm1)*p(i,jm1) &
-an(im1,jm1)*p(i,jp1)-ap(im1,jm1)*p(i ,j)
enddo
aj(im1)= aj1
bj(im1)= bj1
cj(im1)= cj1
dj(im1)= dj1
enddo
aj(1)=0.d0
cj(nxm2)=0.d0
call tridag(nxm2,aj,bj,cj,dj)
!$acc parallel loop gang vector collapse(2)
do i=1,nxm2
do j=1,nym2
p(i+1,j+1)=p(i+1,j+1)+dj(i)
enddo
enddo
!$acc kernels loop gang vector independent private(aj,bj,cj,dj)
do 900 i=2,nxm1
im1=i-1
ip1=i+1
xmxp=(xref(ip1)-xref(im1))/2.d0
!$acc loop vector independent
do 850 j=2,nym1
jm1=j-1
jp1=j+1
ymyp=(yref(jp1)-yref(jm1))/2.d0
aj(jm1)=as(im1,jm1)
bj(jm1)=ap(im1,jm1)
cj(jm1)=an(im1,jm1)
dj(jm1)=ar(im1,jm1)-aw(im1,jm1)*p(im1,j) - ae(im1,jm1)*p(ip1,j)
if(j.eq.2) then
dj(jm1)=dj(jm1)-as(im1,jm1)*p(i,jm1)
aj(jm1)=0.d0
else if(j.eq.nym1) then
dj(jm1)=dj(jm1)-an(im1,jm1)*p(i,jp1)
cj(jm1)=0.d0
end if
850 continue
call tridag(nym2,aj,bj,cj,dj)
p(i,2:nym1)=dj(1:nym2)
900 continue
!$acc parallel loop gang
do j=2,nym1
jm1=j-1
jp1=j+1
ymyp=(yref(jp1)-yref(jm1))/2.d0
aj1 = 0.d0
bj1 = 0.d0
cj1 = 0.d0
dj1 = 0.d0
!$acc loop vector reduction(+:aj1,bj1,cj1,dj1)
do i=2,nxm1
im1=i-1
ip1=i+1
xmxp=(xref(ip1)-xref(im1))/2.d0
aj1=aj1+as(im1,jm1)
bj1=bj1+ap(im1,jm1)
if(i.ne.2) bj1=bj1+aw(im1,jm1)
if(i.ne.nxm1) bj1=bj1+ae(im1,jm1)
cj1=cj1+an(im1,jm1)
dj1=dj1+ar(im1,jm1)-aw(im1,jm1)*p(im1,j) &
-ae(im1,jm1)*p(ip1,j)-as(im1,jm1)*p(i,jm1) &
-an(im1,jm1)*p(i,jp1)-ap(im1,jm1)*p(i ,j)
enddo
aj(jm1)=aj1
bj(jm1)=bj1
cj(jm1)=cj1
dj(jm1)=dj1
enddo
aj(1)=0.d0
cj(nym2)=0.d0
call tridag(nym2,aj,bj,cj,dj)
!$acc parallel loop gang vector collapse(2)
do j=1,nym2
do i=1,nxm2
p(i+1,j+1)=p(i+1,j+1)+dj(j)
if(p(i+1,j+1).lt.0.d0) p(i+1,j+1)=5.d-2
enddo
enddo
akd=0.d0
akn=0.d0
!$acc parallel loop reduction(+:akn,akd)
do j=2,nym1
jm1=j-1
jp1=j+1
!$acc loop vector reduction(+:akn,akd)
do i=2,nxm1
im1=i-1
ip1=i+1
aj1=aw(im1,jm1)
bj1=ap(im1,jm1)
cj1=ae(im1,jm1)
dj1=ar(im1,jm1)-as(im1,jm1)*p(i,jm1) - an(im1,jm1)*p(i,jp1)
if(i.eq.2) then
dj1=dj1-aw(im1,jm1)*p(im1,j)
aj1=0.d0
else if (i.eq.nxm1) then
dj1=dj1-ae(im1,jm1)*p(ip1,j)
cj1=0.d0
endif
akd=akd+dabs(aj1*p(im1,j)+bj1*p(i,j) + cj1*p(ip1,j)-dj1)
akn=akn+dabs(bj1*p(i,j))
enddo
enddo
rak=akd/akn
rakpre=rak
if(rak.gt.akmax.and.kint.lt.nitmax) goto 50
goto 5
!$acc update host(akd,akn)
!$acc end data
end
subroutine flow(pn,qn)
use Dyn_sizes_module
use master_module
use config_module
use flows_module
use dynamic0_module
implicit none
!$acc routine seq
real(8) :: C22, Q2, QN1, QN2, S1, U1, FF0, XI, DEL, AIJ, C12, C11, DEXP, DLOG10, DFLOAT, &
V1, V2, V3, V4, V5, PN, Z1, Z2, A, B, R1, R2, V, EV, F1, F2, Z0, Z, DLOG, ALZ, QN
integer :: I, LOC
dimension a(20),b(20)
if (iqpo.eq.0) then
qn = pn
return
else if (iqpo.eq.1) then
qn = pn + t1
return
else if (iqpo.eq.2) then
r1 = t2/pn
r2 = r1/pn
qn = pn + t1 + r1
return
else if (iqpo.eq.3) then
r1 = t2/pn
s1 = t3/(pn*pn)
u1 = t4/(pn*pn*pn)
qn = pn + t1 + r1 - s1 - u1
return
else if (iqpo.eq.4) then
z = d0*pn
if (icoe.eq.1) go to 15
a(1) = 0.d0
a(2) = -1.d0
b(1) = -pir
b(2) = 1.5d0*(1.d0-gama)
do 10 i = 3,nter
xi = dfloat(i)
aij = xi*(xi-1.d0)*(xi-2.d0)
a(i) = (-2.d0*a(i-2))/aij
b(i) = (-2.d0*b(i-2)-(3.d0*xi*xi-6.d0*xi+2.d0)*a(i))/aij
10 continue
icoe = 1
15 if (z.ge.1.1d0) then
v = 1.889881575d0*z**0.66666666666666666d0
v1 = 12.d0*v
v2 = 24.d0*v*v1
v3 = 36.d0*v*v2
v4 = 48.d0*v*v3
v5 = 60.d0*v*v4
ev = pit*dexp(-v)
ff0 = ev*(1.d0-1.d0/v1+25.d0/v2-1225.d0/v3 + 89425.d0/v4-7263025.d0/v5)
f1 = ev*dsqrt(v/3.d0)*(1.d0+5.d0/v1-35.d0/v2 + 665.d0/v3+9625.d0/v4-9284275.d0/v5)
f2 = ev*(v/3.d0)*(1.d0+17.d0/v1-35.d0/v2 + 1925.d0/v3-175175.d0/v4+22247225.d0/v5)
else
ff0 = 0.d0
f1 = 0.d0
f2 = 0.d0
alz = dlog(z)
do 20 i = nter,1,-1
xi = dfloat(i)
aij = 1.d0/(1.d0+xi)
if (i.eq.1) then
ff0 = ((alz*xi+1.d0)*a(i) + b(i)*xi + ff0)
go to 12
end if
ff0 = ((alz*xi+1.d0)*a(i) + b(i)*xi + ff0)*z
12 f1 = (a(i)*alz + b(i) + f1)*z
f2 = (aij*((alz-aij)*a(i)+b(i)) + f2)*z
20 continue
ff0 = -0.5d0*ff0
f1 = 0.5d0*(f1 + 1.d0)
f2 = pir/4.d0 - 0.5d0*(f2 + 1.d0)*z
end if
c11 = 8.d0-z*z*z*(pir/12.d0-z/16.d0)-2.d0*z*(4.d0+z*z)*ff0 - (16.d0+z*z*(8.d0+z*z/8.d0))*f1-z*(16.d0+z*z)*f2
c22 = 1.d0- 2.d0*f1
c12 = 2.d0 - (pir/2.d0)*z + z*z/4.d0-2.d0*z*ff0 - (4.d0 +z*z/2.d0)*f1 - 2.d0*z*f2
del = c11*c22 - c12*c12
q2 = (pir/del)*(c11 - z*z*(c12/6.d0-z*z*c22/144.d0))
qn = (-1.d0/z+q2)*(6.d0/z)*pn
return
else if (iqpo.eq.5) then
z0=d0*pn
loc=dint(200.d0*dlog10(z0)+401)
if (loc.ge.801) goto 55
if(loc.lt.1)goto 56
z1=zdat(loc)
qn1=qndat(loc)
z2=zdat(loc+1)
qn2=qndat(loc+1)
qn=(qn2-qn1)*(z0-z1)/(z2-z1)+qn1
return
55 continue
r1 = t2/pn
s1 = t3/(pn*pn)
u1 = t4/(pn*pn*pn)
qn = pn + t1 + r1 - s1 - u1
return
56 continue
qn=-pn*6.d0/pir*dlog(z0)/z0
return
end if
end
subroutine tridag(n,a,b,c,d)
use Dyn_sizes_module
implicit real*8(a-h,o-z)
!$ACC routine seq
dimension a(1),b(1),c(1),d(1)
dimension beta(402),gamma(402)
beta(1)=b(1)
gamma(1)=d(1)/beta(1)
do 10 i=2,n
im1=i-1
beta(i)=b(i)-a(i)*c(im1)/beta(im1)
gamma(i)=(d(i)-a(i)*gamma(im1))/beta(i)
10 continue
d(n)=gamma(n)
do 20 i=n-1,1,-1
d(i)=gamma(i)-c(i)*d(i+1)/beta(i)
20 continue
return
end
subroutine acmdn(aw1,ae1,as1,an1,ap1,ar1,p1,aw2,ae2,as2, &
an2,ap2,ar2,dp2,nx1,ny1,nx2,ny2,nx1max,ny1max,nx2max, &
ny2max,nlimit)
use Dyn_sizes_module
implicit none
real(8) :: AR1, AR2, DJ, CJ, AP2, AP1, BJ, AKN, AN2, RATIO, AKPRE, AKD, AS2, AS1, P1
real(8) :: AK, AJ, AW2, AW1, AE1, AE2, DP2, DABS, AN1
real(8) :: aj1,bj1,cj1,dj1
integer :: K1, NX2M2, NX2M1, NX1, NX2, I1, I2, KINT, NY1MAX, K, L, NY2M1, NY2M2
integer :: L1, NX1MAX, NLIMIT, NY2, NY1, J1, J2, NY2MAX, NX2MAX
dimension aw1(nx1max,ny1max),ae1(nx1max,ny1max), &
as1(nx1max,ny1max),an1(nx1max,ny1max), &
ap1(nx1max,ny1max),ar1(nx1max,ny1max), &
aw2(nx2max,ny2max),ae2(nx2max,ny2max), &
as2(nx2max,ny2max),an2(nx2max,ny2max), &
ap2(nx2max,ny2max),ar2(nx2max,ny2max), &
p1(nx1max,ny1max), dp2(nx2max,ny2max)
dimension aj(nxmax),bj(nxmax),cj(nxmax),dj(nxmax)
!$acc routine(tridag) seq
nx2m1=nx2-1
ny2m1=ny2-1
nx2m2=nx2-2
ny2m2=ny2-2
! added by Dolf 7/22/2016
!$acc parallel loop gang vector collapse(2)
do k=2,nx2m1
do l=2,ny2m1
k1=k-1
l1=l-1
i1=2*(k-1)-1
i2=2*(k-1)+1-1
j1=2*(l-1)-1
j2=2*(l-1)+1-1
ap2(k1,l1)=ap1(i1,j1)+ap1(i2,j1)+ap1(i1,j2)+ap1(i2,j2)+ &
ae1(i1,j1)+ae1(i1,j2)+aw1(i2,j1)+aw1(i2,j2)+ &
as1(i1,j2)+as1(i2,j2)+an1(i1,j1)+an1(i2,j1)
aw2(k1,l1)=aw1(i1,j1)+aw1(i1,j2)
ae2(k1,l1)=ae1(i2,j1)+ae1(i2,j2)
as2(k1,l1)=as1(i1,j1)+as1(i2,j1)
an2(k1,l1)=an1(i1,j2)+an1(i2,j2)
ar2(k1,l1)=ar1(i1,j1)+ar1(i2,j1)+ar1(i1,j2)+ar1(i2,j2)- &
ap1(i1,j1)*p1(i1+1,j1+1)-ap1(i2,j1)*p1(i2+1,j1+1)- &
ap1(i1,j2)*p1(i1+1,j2+1)-ap1(i2,j2)*p1(i2+1,j2+1)- &
aw1(i1,j1)*p1(i1 ,j1+1)-aw1(i1,j2)*p1(i1 ,j2+1)- &
ae1(i1,j1)*p1(i2+1,j1+1)-ae1(i1,j2)*p1(i2+1,j2+1)- &
as1(i1,j1)*p1(i1+1,j1 )-as1(i1,j2)*p1(i1+1,j1+1)- &
an1(i1,j1)*p1(i1+1,j2+1)-an1(i1,j2)*p1(i1+1,j2+2)- &
aw1(i2,j1)*p1(i1+1,j1+1)-aw1(i2,j2)*p1(i1+1,j2+1)- &
ae1(i2,j1)*p1(i2+2,j1+1)-ae1(i2,j2)*p1(i2+2,j2+1)- &
as1(i2,j1)*p1(i2+1,j1 )-as1(i2,j2)*p1(i2+1,j1+1)- &
an1(i2,j1)*p1(i2+1,j2+1)-an1(i2,j2)*p1(i2+1,j2+2)
end do
end do
dp2(1:nx2,1:ny2) = 0.d0
kint=0
50 kint=kint+1
!$acc kernels loop gang vector independent private(aj,bj,cj,dj)
do 200 l=2,ny2m1
l1=l-1
!$acc loop vector independent
do 100 k=2,nx2m1
k1=k-1
aj(k1)=aw2(k1,l1)
bj(k1)=ap2(k1,l1)
cj(k1)=ae2(k1,l1)
dj(k1)=-as2(k1,l1)*dp2(k,l1)-an2(k1,l1)*dp2(k,l+1) + ar2(k1,l1)
100 continue
aj(1)=0.d0
cj(nx2m2)=0.d0
call tridag(nx2m2,aj,bj,cj,dj)
dp2(2:nx2m1,l)=dj(1:nx2m2)
200 continue
!$acc parallel loop gang copyout(aj,bj,cj,dj)
do k=2,nx2m1
k1=k-1
aj1 = 0.d0
bj1 = 0.d0
cj1 = 0.d0
dj1 = 0.d0
!$acc loop vector reduction(+:aj1,bj1,cj1,dj1)
do l=2,ny2m1
l1=l-1
aj1=aj1+aw2(k1,l1)
bj1=bj1+ap2(k1,l1)
if(l.ne.2) bj1=bj1+as2(k1,l1)
if(l.ne.ny2m1) bj1=bj1+an2(k1,l1)
cj1=cj1+ae2(k1,l1)
dj1=dj1+ar2(k1,l1)-aw2(k1,l1)*dp2(k1,l) &
-ae2(k1,l1)*dp2(k+1,l)-as2(k1,l1)*dp2(k,l1) &
-an2(k1,l1)*dp2(k,l+1)-ap2(k1,l1)*dp2(k,l)
enddo
aj(k1)=aj1
bj(k1)=bj1
cj(k1)=cj1
dj(k1)=dj1
enddo
aj(1)=0.d0
cj(nx2m2)=0.d0
call tridag(nx2m2,aj,bj,cj,dj)
!$acc parallel loop gang vector collapse(2)
do k=1,nx2m2
do l=1,ny2m2
dp2(k+1,l+1)=dp2(k+1,l+1)+dj(k)
enddo
enddo
!$acc kernels loop gang vector independent private(aj,bj,cj,dj)
do 400 k=2,nx2m1
k1=k-1
!$acc loop vector independent
do 300 l=2,ny2m1
l1=l-1
aj(l1)=as2(k1,l1)
bj(l1)=ap2(k1,l1)
cj(l1)=an2(k1,l1)
dj(l1)=-aw2(k1,l1)*dp2(k1,l)-ae2(k1,l1)*dp2(k+1,l) + ar2(k1,l1)
300 continue
aj(1)=0.d0
cj(ny2m2)=0.d0
call tridag(ny2m2,aj,bj,cj,dj)
dp2(k,2:ny2m1)=dj(1:ny2m2)
400 continue
!$acc parallel loop gang copyout(aj,bj,cj,dj)
do l=2,ny2m1
l1=l-1
aj1 = 0.d0
bj1 = 0.d0
cj1 = 0.d0
dj1 = 0.d0
!$acc loop vector reduction(+:aj1,bj1,cj1,dj1)
do k=2,nx2m1
k1=k-1
aj1=aj1+as2(k1,l1)
bj1=bj1+ap2(k1,l1)
if(k.ne.2) bj1=bj1+aw2(k1,l1)
if(k.ne.nx2m1) bj1=bj1+ae2(k1,l1)
cj1=cj1+an2(k1,l1)
dj1=dj1+ar2(k1,l1)-aw2(k1,l1)*dp2(k1,l) &
-ae2(k1,l1)*dp2(k+1,l)-as2(k1,l1)*dp2(k,l1) &
-an2(k1,l1)*dp2(k,l+1)-ap2(k1,l1)*dp2(k,l)
enddo
aj(l1)=aj1
bj(l1)=bj1
cj(l1)=cj1
dj(l1)=dj1
enddo
aj(1)=0.d0
cj(ny2m2)=0.d0
call tridag(ny2m2,aj,bj,cj,dj)
!$acc parallel loop gang vector collapse(2)
do l=1,ny2m2
do k=1,nx2m2
dp2(k+1,l+1)=dp2(k+1,l+1)+dj(l)
enddo
enddo
akd=0.d0
akn=0.d0
!$acc parallel loop reduction(+:akn,akd)
do k=2,nx2m1
k1=k-1
!$acc loop vector reduction(+:akn,akd)
do l=2,ny2m1
l1=l-1
aj(l1)=as2(k1,l1)
bj(l1)=ap2(k1,l1)
cj(l1)=an2(k1,l1)
dj(l1)=-aw2(k1,l1)*dp2(k1,l)-ae2(k1,l1)*dp2(k+1,l) + ar2(k1,l1)
akd=akd+dabs(aj(l1)*dp2(k,l1)+bj(l1)*dp2(k,l) + cj(l1)*dp2(k,l+1)-dj(l1))
akn=akn+dabs(bj(l1)*dp2(k,l))
enddo
enddo
ak=akd/akn
if(kint.eq.1) then
akpre=ak
goto 50
end if
ratio=ak/akpre
akpre=ak
return
end
subroutine acmup(aw2,ae2,as2,an2,ap2,ar2,dp2,dp3, &
nx2,ny2,nx3,ny3,nx2max,ny2max,nx3max,ny3max,nsweep)
use Dyn_sizes_module
implicit none
real(8) :: AS2, AR2, AE2, DJ, DP2, CJ, AP2, AJ, AW2, BJ, AN2, DP3
real(8) :: aj1,bj1,cj1,dj1
integer :: I2, NX3M1, KSWEEP, NX2M2, NX2M1, NX3, NX2, I1, NX3MAX, NY3M1, K, L, &
NY2M1, NY2M2, NSWEEP, NY3MAX, L1, NY2, NY3, J1, J2, NY2MAX, NX2MAX, K1
dimension aw2(nx2max,ny2max),ae2(nx2max,ny2max), &
as2(nx2max,ny2max),an2(nx2max,ny2max), &
ap2(nx2max,ny2max),ar2(nx2max,ny2max), &
dp2(nx2max,ny2max),dp3(nx3max,ny3max)
dimension aj(nxmax),bj(nxmax),cj(nxmax),dj(nxmax)
nx2m1=nx2-1
ny2m1=ny2-1
nx2m2=nx2-2
ny2m2=ny2-2
nx3m1=nx3-1
ny3m1=ny3-1
do k=2,nx3m1
do l=2,ny3m1
i1=2*(k-1)
i2=2*(k-1)+1
j1=2*(l-1)
j2=2*(l-1)+1
dp2(i1,j1)=dp2(i1,j1)+dp3(k,l)
dp2(i2,j1)=dp2(i2,j1)+dp3(k,l)
dp2(i1,j2)=dp2(i1,j2)+dp3(k,l)
dp2(i2,j2)=dp2(i2,j2)+dp3(k,l)
end do
end do
do 999 ksweep=1,nsweep
do 200 l=2,ny2m1
l1=l-1
do 100 k=2,nx2m1
k1=k-1
aj(k1)=aw2(k1,l1)
bj(k1)=ap2(k1,l1)
cj(k1)=ae2(k1,l1)
dj(k1)=-as2(k1,l1)*dp2(k,l1)-an2(k1,l1)*dp2(k,l+1) + ar2(k1,l1)
100 continue
aj(1)=0.d0
cj(nx2m2)=0.d0
call tridag(nx2m2,aj,bj,cj,dj)
do 150 k=1,nx2m2
dp2(k+1,l)=dj(k)
150 continue
200 continue
do k=2,nx2m1
k1=k-1
aj(k1)=0.d0
bj(k1)=0.d0
cj(k1)=0.d0
dj(k1)=0.d0
do l=2,ny2m1
l1=l-1
aj(k1)=aj(k1)+aw2(k1,l1)
bj(k1)=bj(k1)+ap2(k1,l1)
if(l.ne.2) bj(k1)=bj(k1)+as2(k1,l1)
if(l.ne.ny2m1) bj(k1)=bj(k1)+an2(k1,l1)
cj(k1)=cj(k1)+ae2(k1,l1)
dj(k1)=dj(k1)+ar2(k1,l1)-aw2(k1,l1)*dp2(k1,l) &
-ae2(k1,l1)*dp2(k+1,l)-as2(k1,l1)*dp2(k,l1) &
-an2(k1,l1)*dp2(k,l+1)-ap2(k1,l1)*dp2(k,l)
enddo
enddo
aj(1)=0.d0
cj(nx2m2)=0.d0
call tridag(nx2m2,aj,bj,cj,dj)
do k=1,nx2m2
do l=1,ny2m2
dp2(k+1,l+1)=dp2(k+1,l+1)+dj(k)
enddo
enddo
do 400 k=2,nx2m1
k1=k-1
do 300 l=2,ny2m1
l1=l-1
aj(l1)=as2(k1,l1)
bj(l1)=ap2(k1,l1)
cj(l1)=an2(k1,l1)
dj(l1)=-aw2(k1,l1)*dp2(k1,l)-ae2(k1,l1)*dp2(k+1,l) + ar2(k1,l1)
300 continue
aj(1)=0.d0
cj(ny2m2)=0.d0
call tridag(ny2m2,aj,bj,cj,dj)
do 350 l=1,ny2m2
dp2(k,l+1)=dj(l)
350 continue
400 continue
do l=2,ny2m1
l1=l-1
aj(l1)=0.d0
bj(l1)=0.d0
cj(l1)=0.d0
dj(l1)=0.d0
do k=2,nx2m1
k1=k-1
aj(l1)=aj(l1)+as2(k1,l1)
bj(l1)=bj(l1)+ap2(k1,l1)
if(k.ne.2) bj(l1)=bj(l1)+aw2(k1,l1)
if(k.ne.nx2m1) bj(l1)=bj(l1)+ae2(k1,l1)
cj(l1)=cj(l1)+an2(k1,l1)
dj(l1)=dj(l1)+ar2(k1,l1)-aw2(k1,l1)*dp2(k1,l) &
-ae2(k1,l1)*dp2(k+1,l)-as2(k1,l1)*dp2(k,l1) &
-an2(k1,l1)*dp2(k,l+1)-ap2(k1,l1)*dp2(k,l)
enddo
enddo
aj(1)=0.d0
cj(ny2m2)=0.d0
call tridag(ny2m2,aj,bj,cj,dj)
do l=1,ny2m2
do k=1,nx2m2
dp2(k+1,l+1)=dp2(k+1,l+1)+dj(l)
enddo
enddo
999 continue
return
end
------ Build started: Project: Dyn5d, Configuration: Release x64 ------
Compiling Project ...
Common.f90
Dynamic.F90
FESusp.F90
Force.F90
Grid.F90
Init.F90
allocatearrays:
1394, Generating update device(nxmax)
Mass.F90
Misc.F90
Modes.f90
NewmarkBeta.f90
OptionalParams.f90
Output.f90
Rail.F90
Reynolds.F90
C:\D Drive\CML codes\Dyn5d\Reynolds.F90(81) : warning W0155 : Invalid accelerator data region: branching into or out of region is not allowed
reyneq:
81, Invalid accelerator data region: branching into or out of region is not allowed
84, Generating update device(iqpo,icoe,nter,qndat(:),zdat(:),d0,pit,pir,gama,t4,t3,t2,t1)
86, Accelerator kernel generated
Generating Tesla code
89, !$acc loop gang, vector(128) collapse(2) ! blockidx%
90, ! blockidx%x threadidx%x collapsed
86, Generating copy(ap(:,:),an(:,:),as(:,:),ae(:,:),aw(:,:))
Generating copyin(yref(:),xref(:),hydnew(:,:),hxlnew(:,:),hyunew(:,:),hxrnew(:,:),zta(:,:),eta(:,:),p(:,:))
Generating copy(pn,qnim)
Generating copyin(bearx(:,:))
Generating copy(qnim_b,pn_b,qnip,qnip_b,qnjm)
Generating copyin(beary(:,:))
Generating copy(qnjm_b,qnjp,qnjp_b)
Generating copyin(pold(:,:),h(:,:))
Generating copy(ar(:,:))
Generating copyin(hnew(:,:))
344, Accelerator kernel generated
Generating Tesla code
344, Generating reduction(+:akn,akd)
345, !$acc loop gang ! blockidx%x
349, !$acc loop vector(128) ! threadidx%x
Generating reduction(+:akn,akd)
344, Generating copyin(p(1:nxm1+1,1:nym1+1),aw(:nxm1-1,:nym1-1),ap(:nxm1-1,:nym1-1),ae(:nxm1-1,:nym1-1),ar(:nxm1-1,:nym1-1),as(:nxm1-1,:nym1-1),an(:nxm1-1,:nym1-1))
349, Loop is parallelizable
390, Generating copy(nxm2)
Generating copyin(an(:,:),as(:,:))
Generating copy(p(:,:))
Generating copyin(ar(:,:),ae(:,:),ap(:,:),aw(:,:))
391, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
391, !$acc loop gang ! blockidx%x
396, !$acc loop vector(128) ! threadidx%x
420, !$acc loop seq
396, Loop is parallelizable
420, Loop is parallelizable
429, Accelerator kernel generated
Generating Tesla code
430, !$acc loop gang ! blockidx%x
439, !$acc loop vector(128) ! threadidx%x
Generating reduction(+:aj1,bj1,cj1,dj1)
429, Generating copyin(as(:nxm1-1,:nym1-1),aw(:nxm1-1,:nym1-1),ap(:nxm1-1,:nym1-1),an(:nxm1-1,:nym1-1),ae(:nxm1-1,:nym1-1),ar(:nxm1-1,:nym1-1),p(1:nxm1+1,1:nym1+1))
Generating copyout(aj(:nxm1-1),bj(:nxm1-1),cj(:nxm1-1),dj(:nxm1-1))
439, Loop is parallelizable
462, Accelerator kernel generated
Generating Tesla code
463, !$acc loop gang, vector(128) collapse(2) ! blockidx%
464, ! blockidx%x threadidx%x collapsed
462, Generating copy(p(2:nxm2+1,2:nym2+1))
Generating copyin(dj(:nxm2))
471, Generating copy(nym2)
Generating copyin(ae(:,:),aw(:,:))
Generating copy(p(:,:))
Generating copyin(ar(:,:),an(:,:),ap(:,:),as(:,:))
472, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
472, !$acc loop gang ! blockidx%x
477, !$acc loop vector(128) ! threadidx%x
497, !$acc loop seq
477, Loop is parallelizable
497, Loop is parallelizable
503, Accelerator kernel generated
Generating Tesla code
504, !$acc loop gang ! blockidx%x
513, !$acc loop vector(128) ! threadidx%x
Generating reduction(+:aj1,bj1,cj1,dj1)
503, Generating copyin(aw(:nxm1-1,:nym1-1),as(:nxm1-1,:nym1-1),ap(:nxm1-1,:nym1-1),ae(:nxm1-1,:nym1-1),an(:nxm1-1,:nym1-1),ar(:nxm1-1,:nym1-1),p(1:nxm1+1,1:nym1+1))
Generating copyout(aj(:nym1-1),bj(:nym1-1),cj(:nym1-1),dj(:nym1-1))
513, Loop is parallelizable
534, Accelerator kernel generated
Generating Tesla code
535, !$acc loop gang, vector(128) collapse(2) ! blockidx%
536, ! blockidx%x threadidx%x collapsed
534, Generating copyin(dj(:nym2))
Generating copy(p(2:nxm2+1,2:nym2+1))
544, Accelerator kernel generated
Generating Tesla code
544, Generating reduction(+:akn,akd)
545, !$acc loop gang ! blockidx%x
549, !$acc loop vector(128) ! threadidx%x
Generating reduction(+:akn,akd)
544, Generating copyin(p(1:nxm1+1,1:nym1+1),aw(:nxm1-1,:nym1-1),ap(:nxm1-1,:nym1-1),ae(:nxm1-1,:nym1-1),ar(:nxm1-1,:nym1-1),as(:nxm1-1,:nym1-1),an(:nxm1-1,:nym1-1))
549, Loop is parallelizable
618, Accelerator kernel generated
Generating Tesla code
619, !$acc loop gang, vector(128) collapse(2) ! blockidx%
620, ! blockidx%x threadidx%x collapsed
618, Generating copy(p(:,:))
Generating copyin(dp1(2:nx1-1,2:ny1-1))
634, Generating copy(nxm2)
Generating copyin(an(:,:),as(:,:))
Generating copy(p(:,:))
Generating copyin(ar(:,:),ae(:,:),ap(:,:),aw(:,:))
635, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
635, !$acc loop gang ! blockidx%x
640, !$acc loop vector(128) ! threadidx%x
660, !$acc loop seq
640, Loop is parallelizable
660, Loop is parallelizable
665, Accelerator kernel generated
Generating Tesla code
666, !$acc loop gang ! blockidx%x
675, !$acc loop vector(128) ! threadidx%x
Generating reduction(+:aj1,bj1,cj1,dj1)
665, Generating copyin(as(:nxm1-1,:nym1-1),aw(:nxm1-1,:nym1-1),ap(:nxm1-1,:nym1-1),an(:nxm1-1,:nym1-1),ae(:nxm1-1,:nym1-1),ar(:nxm1-1,:nym1-1),p(1:nxm1+1,1:nym1+1))
Generating copyout(aj(:nxm1-1),bj(:nxm1-1),cj(:nxm1-1),dj(:nxm1-1))
675, Loop is parallelizable
696, Accelerator kernel generated
Generating Tesla code
697, !$acc loop gang, vector(128) collapse(2) ! blockidx%
698, ! blockidx%x threadidx%x collapsed
696, Generating copy(p(2:nxm2+1,2:nym2+1))
Generating copyin(dj(:nxm2))
705, Generating copy(nym2)
Generating copyin(ae(:,:),aw(:,:))
Generating copy(p(:,:))
Generating copyin(ar(:,:),an(:,:),ap(:,:),as(:,:))
706, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
706, !$acc loop gang ! blockidx%x
711, !$acc loop vector(128) ! threadidx%x
731, !$acc loop seq
711, Loop is parallelizable
731, Loop is parallelizable
736, Accelerator kernel generated
Generating Tesla code
737, !$acc loop gang ! blockidx%x
746, !$acc loop vector(128) ! threadidx%x
Generating reduction(+:aj1,bj1,cj1,dj1)
736, Generating copyin(aw(:nxm1-1,:nym1-1),as(:nxm1-1,:nym1-1),ap(:nxm1-1,:nym1-1),ae(:nxm1-1,:nym1-1),an(:nxm1-1,:nym1-1),ar(:nxm1-1,:nym1-1),p(1:nxm1+1,1:nym1+1))
Generating copyout(aj(:nym1-1),bj(:nym1-1),cj(:nym1-1),dj(:nym1-1))
746, Loop is parallelizable
769, Accelerator kernel generated
Generating Tesla code
770, !$acc loop gang, vector(128) collapse(2) ! blockidx%
771, ! blockidx%x threadidx%x collapsed
769, Generating copy(p(2:nxm2+1,2:nym2+1))
Generating copyin(dj(:nym2))
785, Accelerator kernel generated
Generating Tesla code
785, Generating reduction(+:akn,akd)
786, !$acc loop gang ! blockidx%x
790, !$acc loop vector(128) ! threadidx%x
Generating reduction(+:akn,akd)
785, Generating copyin(p(1:nxm1+1,1:nym1+1),aw(:nxm1-1,:nym1-1),ap(:nxm1-1,:nym1-1),ae(:nxm1-1,:nym1-1),ar(:nxm1-1,:nym1-1),as(:nxm1-1,:nym1-1),an(:nxm1-1,:nym1-1))
790, Loop is parallelizable
0 inform, 1 warnings, 0 severes, 0 fatal for reyneq
acmdn:
850, Accelerator kernel generated
Generating Tesla code
851, !$acc loop gang, vector(128) collapse(2) ! blockidx%
852, ! blockidx%x threadidx%x collapsed
850, Generating copyout(ar2(:,:ny2-2))
Generating copyin(p1(:,:),ar1(:,:))
Generating copyout(an2(:,:ny2-2),as2(:,:ny2-2),ae2(:,:ny2-2),aw2(:,:ny2-2),ap2(:,:ny2-2))
Generating copyin(ap1(:,:),ae1(:,:),aw1(:,:),as1(:,:),an1(:,:))
892, Generating copy(nx2m2)
Generating copyin(aw2(:nx2-2,:ny2-2),ap2(:nx2-2,:ny2-2),ae2(:nx2-2,:ny2-2))
Generating copyin(dp2(2:nx2-1,:ny2))
Generating copyout(dp2(2:nx2-1,2:ny2-1))
Generating copyin(as2(:nx2-2,:ny2-2),an2(:nx2-2,:ny2-2),ar2(:nx2-2,:ny2-2))
893, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
893, !$acc loop gang ! blockidx%x
896, !$acc loop vector(128) ! threadidx%x
909, !$acc loop seq
896, Loop is parallelizable
909, Loop is parallelizable
914, Generating copyout(aj(:),bj(:),cj(:),dj(:))
Accelerator kernel generated
Generating Tesla code
915, !$acc loop gang ! blockidx%x
922, !$acc loop vector(128) ! threadidx%x
Generating reduction(+:aj1,bj1,cj1,dj1)
914, Generating copyin(as2(:nx2-2,:ny2-2),aw2(:nx2-2,:ny2-2),ap2(:nx2-2,:ny2-2),an2(:nx2-2,:ny2-2),ae2(:nx2-2,:ny2-2),ar2(:nx2-2,:ny2-2),dp2(:nx2,:ny2))
922, Loop is parallelizable
943, Accelerator kernel generated
Generating Tesla code
944, !$acc loop gang, vector(128) collapse(2) ! blockidx%
945, ! blockidx%x threadidx%x collapsed
943, Generating copy(dp2(2:nx2m2+1,2:ny2m2+1))
Generating copyin(dj(:nx2m2))
952, Generating copy(ny2m2)
Generating copyin(as2(:nx2-2,:ny2-2),ap2(:nx2-2,:ny2-2),an2(:nx2-2,:ny2-2),ar2(:nx2-2,:ny2-2),aw2(:nx2-2,:ny2-2))
Generating copyin(dp2(:nx2,2:ny2-1))
Generating copyout(dp2(2:nx2-1,2:ny2-1))
Generating copyin(ae2(:nx2-2,:ny2-2))
953, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
953, !$acc loop gang ! blockidx%x
956, !$acc loop vector(128) ! threadidx%x
969, !$acc loop seq
956, Loop is parallelizable
969, Loop is parallelizable
974, Generating copyout(aj(:),bj(:),cj(:),dj(:))
Accelerator kernel generated
Generating Tesla code
975, !$acc loop gang ! blockidx%x
982, !$acc loop vector(128) ! threadidx%x
Generating reduction(+:aj1,bj1,cj1,dj1)
974, Generating copyin(aw2(:nx2-2,:ny2-2),as2(:nx2-2,:ny2-2),ap2(:nx2-2,:ny2-2),ae2(:nx2-2,:ny2-2),an2(:nx2-2,:ny2-2),ar2(:nx2-2,:ny2-2),dp2(:nx2,:ny2))
982, Loop is parallelizable
1003, Accelerator kernel generated
Generating Tesla code
1004, !$acc loop gang, vector(128) collapse(2) ! blockidx%
1005, ! blockidx%x threadidx%x collapsed
1003, Generating copyin(dj(:ny2m2))
Generating copy(dp2(2:nx2m2+1,2:ny2m2+1))
1013, Accelerator kernel generated
Generating Tesla code
1013, Generating reduction(+:akn,akd)
1014, !$acc loop gang ! blockidx%x
1017, !$acc loop vector(128) ! threadidx%x
Generating reduction(+:akn,akd)
1013, Generating copyin(as2(:nx2-2,:ny2-2))
Generating copyout(aj(:ny2-2))
Generating copyin(ap2(:nx2-2,:ny2-2))
Generating copyout(bj(:ny2-2))
Generating copyin(an2(:nx2-2,:ny2-2))
Generating copyout(cj(:ny2-2))
Generating copyin(ar2(:nx2-2,:ny2-2),aw2(:nx2-2,:ny2-2),dp2(:nx2,:ny2),ae2(:nx2-2,:ny2-2))
Generating copyout(dj(:ny2-2))
1017, Loop is parallelizable
flow:
1189, Generating acc routine seq
1244, !$acc do seq
1270, !$acc do seq
1270, Scalar last value needed after loop for ff0 at line 1285
Scalar last value needed after loop for f1 at line 1286
Scalar last value needed after loop for f2 at line 1287
Roughness.F90
Stiff.F90
String.f90
Suspension.F90
UserDefGeom.f90
Util.F90
tridag:
229, Generating acc routine seq
241, !$acc do seq
247, !$acc do seq
Linking...
libvcruntime.lib(_chandler_.obj) : warning LNK4229: invalid directive '/GUARDSYM:__C_specific_handler,S' encountered; ignored
C:\D Drive\CML codes\Dyn5d\x64\Release/Dyn5d.exf: warning: unknown section ".gfids" found, executable not stripped. Use the linker's /merge option to merge this section with another. Example: "/merge:.gfids=.data"
C:\D Drive\CML codes\Dyn5d\x64\Release/Dyn5d.exf: warning: unknown section ".00cfg" found, executable not stripped. Use the linker's /merge option to merge this section with another. Example: "/merge:.00cfg=.data"
Dyn5d build succeeded.
Build log was saved at "file://C:\D Drive\CML codes\Dyn5d\x64\Release\BuildLog.htm"
========== Build: 1 succeeded, 0 failed, 0 up-to-date, 0 skipped ==========
what do you think? am I doing the right thing? please advice. thanks, Dolf