Hi Mat,
this is what flow subroutine look like:
!=======================================================
subroutine flow (pn,qn)
!========================================================
use Dyn_sizes_module
use master_module
use config_module
use flows_module
use Dyn_module
implicit none
!$acc routine seq
real(8) :: C22, Q2, QN1, QN2, S1, U1, FF0, XI, DEL, AIJ, C12, C11, DEXP, DLOG10, DFLOAT, &
V1, V2, V3, V4, V5, PN, Z1, Z2, A, B, R1, R2, V, EV, F1, F2, Z0, Z, DLOG, ALZ, QN
integer :: I, LOC
dimension a(20),b(20)
!
!ha use the continuum model.....
if (iqpo.eq.0) then
qn = pn
return
!
!ha use the first order slip model....
else if (iqpo.eq.1) then
qn = pn + t1
return
!
!ha use the second order slip model.....
else if (iqpo.eq.2) then
r1 = t2/pn
r2 = r1/pn
qn = pn + t1 + r1
return
!
!ha use the asymtotic fk model.....
else if (iqpo.eq.3) then
r1 = t2/pn
s1 = t3/(pn*pn)
u1 = t4/(pn*pn*pn)
qn = pn + t1 + r1 - s1 - u1
return
!
!ha use the full fk model.....
else if (iqpo.eq.4) then
z = d0*pn
if (icoe.eq.1) go to 15
a(1) = 0.d0
a(2) = -1.d0
b(1) = -pir
b(2) = 1.5d0*(1.d0-gama)
do 10 i = 3,nter
xi = dfloat(i)
aij = xi*(xi-1.d0)*(xi-2.d0)
a(i) = (-2.d0*a(i-2))/aij
b(i) = (-2.d0*b(i-2)-(3.d0*xi*xi-6.d0*xi+2.d0)*a(i))/aij
10 continue
icoe = 1
15 if (z.ge.1.1d0) then
v = 1.889881575d0*z**0.66666666666666666d0
v1 = 12.d0*v
v2 = 24.d0*v*v1
v3 = 36.d0*v*v2
v4 = 48.d0*v*v3
v5 = 60.d0*v*v4
ev = pit*dexp(-v)
ff0 = ev*(1.d0-1.d0/v1+25.d0/v2-1225.d0/v3 + 89425.d0/v4-7263025.d0/v5)
f1 = ev*dsqrt(v/3.d0)*(1.d0+5.d0/v1-35.d0/v2 + 665.d0/v3+9625.d0/v4-9284275.d0/v5)
f2 = ev*(v/3.d0)*(1.d0+17.d0/v1-35.d0/v2 + 1925.d0/v3-175175.d0/v4+22247225.d0/v5)
else
ff0 = 0.d0
f1 = 0.d0
f2 = 0.d0
alz = dlog(z)
do 20 i = nter,1,-1
xi = dfloat(i)
aij = 1.d0/(1.d0+xi)
if (i.eq.1) then
ff0 = ((alz*xi+1.d0)*a(i) + b(i)*xi + ff0)
go to 12
end if
ff0 = ((alz*xi+1.d0)*a(i) + b(i)*xi + ff0)*z
12 f1 = (a(i)*alz + b(i) + f1)*z
f2 = (aij*((alz-aij)*a(i)+b(i)) + f2)*z
20 continue
ff0 = -0.5d0*ff0
f1 = 0.5d0*(f1 + 1.d0)
f2 = pir/4.d0 - 0.5d0*(f2 + 1.d0)*z
end if
c11 = 8.d0-z*z*z*(pir/12.d0-z/16.d0)-2.d0*z*(4.d0+z*z)*ff0 - (16.d0+z*z*(8.d0+z*z/8.d0))*f1-z*(16.d0+z*z)*f2
c22 = 1.d0- 2.d0*f1
c12 = 2.d0 - (pir/2.d0)*z + z*z/4.d0-2.d0*z*ff0 - (4.d0 +z*z/2.d0)*f1 - 2.d0*z*f2
del = c11*c22 - c12*c12
q2 = (pir/del)*(c11 - z*z*(c12/6.d0-z*z*c22/144.d0))
qn = (-1.d0/z+q2)*(6.d0/z)*pn
return
!
!
!ha use the data base of precalculated qp based on fk model.....
else if (iqpo.eq.5) then
z0=d0*pn
loc=dint(200.d0*dlog10(z0)+401)
if (loc.ge.801) goto 55
if(loc.lt.1)goto 56
z1=zdat(loc)
!
qn1=qndat(loc)
z2=zdat(loc+1)
qn2=qndat(loc+1)
!
!ha interpolate.....
qn=(qn2-qn1)*(z0-z1)/(z2-z1)+qn1
return
!
! if inverse knudsen no. is out of range of database
! use first order slip
55 continue
r1 = t2/pn
s1 = t3/(pn*pn)
u1 = t4/(pn*pn*pn)
qn = pn + t1 + r1 - s1 - u1
return
56 continue
qn=-pn*6.d0/pir*dlog(z0)/z0
return
end if
end
here is the reynolds subroutine:
!======================================================
subroutine reyneq(idirect)
!======================================================
use Dyn_sizes_module
use master_module
use config_module
use sl_mesh_module
use filmtk_module
use flows_module
use average_module
use bearing_module
use roughness_module
use reynolds_module
use Dyn_module
! device modules
use kernels_sub
use reyneq_Dev
use Q4_globals_Dev
use flows_Dev
use average_Dev
use filmtk_Dev
use bearing_Dev
use openacc
implicit none
!$acc routine(flow) seq
real(8) :: H2IP_B, AP2, AP3, AP1, AP4, APJ_B, RATIO, DEXP, AMI_B, RAK, H2JP, AW4, AW3, AW2, AW1, &
BJ, QNJP_B, PEMI_B, PEPJ, H2IM, APJ, H2IP, API, H2JP_B, DABS, PEPJ_B, H2JM, PEMJ_B, DMI, DMJ, &
AN4, AN1, AN2, AN3, PEPI_B, DPI_B, AMI, AMJ, PN_B, API_B, AMJ_B, PN, H2JM_B, AJ, RAKPRE, &
FPJ, AE1, AE3, AE2, AE4, QNIM_B, P2JM, FMI_B, QNJP, DELYJM2, DELYJM1, QNH2JM, AMULT, DPJ_B, &
FMJ_B, AKN, DJ, AKD, P2IM, DELXIM2, DELXIM1, FPJ_B, P2IP, QNH2IM_B, AS3, CJ, AS1, DMI_B, AS4, &
QNH2JP, FMJ, FMI, ZTAM, QNH2JP_B, ETAP, FPI_B, DELYJP1, DELYJP2, ETAM, AR1, AR2, AR3, AR4, &
AS2, QNJM_B, XMXP, DPJ, DPI, P2JP, QNH2IM, DELXIP1, DELXIP2, QNIP, QNH2JM_B, H2IM_B, QNIM, QNH2IP, &
PEPI, QNH2IP_B, AE, DMJ_B, FPI, AN, AP, AS, AR, AW, DP4, DP3, DP2, DP1, QNIP_B, YMYP, PEMJ, &
ZTAP, PEMI, QNJM
integer :: NY1M1, IP1, NY4, JP1, NX5, NX4, NX1, NX3, NX2, I1, I2, IM1, KINT, JM1, I, K, J, L, &
IDIRECT, NX1M1, NY2, NY3, NY1, J1, J2, NY5
real(8) :: a,b,akdh(nxmax,nymax),aknh(nxmax,nymax),ajh(nxmax,nymax),bjh(nxmax,nymax),cjh(nxmax,nymax),djh(nxmax,nymax)
real(8), device, allocatable :: aj1Dev(:),bj1Dev(:),cj1Dev(:),dj1Dev(:),ajaDev(:),awaDev(:,:)
real(8) :: aj1,bj1,cj1,dj1
real(8), allocatable :: awa(:,:),aja(:)
integer :: s
dimension a(nxmax,nymax),b(nxmax)
! integer, parameter :: nxmax1=520, nymax1=520, nxmax2=260, nymax2=260, &
! nxmax3=140, nymax3=140, nxmax4=70, nymax4=70, nxmax5=40, nymax5=40
! commented out by Dolf 7/23/15
dimension aw(nxmax,nymax),ap(nxmax,nymax),ae(nxmax,nymax), &
ar(nxmax,nymax),as(nxmax,nymax),an(nxmax,nymax), &
aj(nxmax),bj(nxmax),cj(nxmax),dj(nxmax)
dimension aw1(nxmax1,nymax1),ap1(nxmax1,nymax1), &
ae1(nxmax1,nymax1),as1(nxmax1,nymax1), &
an1(nxmax1,nymax1),ar1(nxmax1,nymax1), &
dp1(nxmax1,nymax1)
dimension aw2(nxmax2,nymax2),ap2(nxmax2,nymax2), &
ae2(nxmax2,nymax2),as2(nxmax2,nymax2), &
an2(nxmax2,nymax2),ar2(nxmax2,nymax2), &
dp2(nxmax2,nymax2)
dimension aw3(nxmax3,nymax3),ap3(nxmax3,nymax3), &
ae3(nxmax3,nymax3),as3(nxmax3,nymax3), &
an3(nxmax3,nymax3),ar3(nxmax3,nymax3), &
dp3(nxmax3,nymax3)
dimension aw4(nxmax4,nymax4),ap4(nxmax4,nymax4), &
ae4(nxmax4,nymax4),as4(nxmax4,nymax4), &
an4(nxmax4,nymax4),ar4(nxmax4,nymax4), &
dp4(nxmax4,nymax4)
! BC - Level 5 is no longer used
! dimension aw5(nxmax5,nymax5),ap5(nxmax5,nymax5), &
! ae5(nxmax5,nymax5),as5(nxmax5,nymax5), &
! an5(nxmax5,nymax5),ar5(nxmax5,nymax5), &
! dp5(nxmax5,nymax5)
!
nx1 = 2 + (nx-2)/2
ny1 = 2 + (ny-2)/2
nx2 = 2 + (nx1-2)/2
ny2 = 2 + (ny1-2)/2
nx3 = 2 + (nx2-2)/2
ny3 = 2 + (ny2-2)/2
nx4 = 2 + (nx3-2)/2
ny4 = 2 + (ny3-2)/2
!allocate(aj1Dev(nxmax),bj1Dev(nxmax),cj1Dev(nxmax),dj1Dev(nxmax))
!allocate(ajaDev(10))
!allocate(awaDev(10,10))
!allocate(awa(10,10),aja(10))
!awa = 1.0
!aja = 0.0
!awaDev = 1.0
!ajaDev = 0.0
! calculate:
! 1) the flow rarefaction factors qn
! 2) the average flow factor of rough bearing surfaces
!
pitDev = pit
pirDev = pir
gamaDev = gama
d0Dev = d0
siDev = si
dtDev = dt
omega0Dev = omega0
t1Dev = t1
t2Dev = t2
t3Dev = t3
t4Dev = t4
zdatDev = zdat
qndatDev = qndat
xrefDev = xref
yrefDev = yref
pDev = p
poldDev = pold
hDev = h
hyunew = hyunewDev
nreyneq=0
5 nreyneq=nreyneq+1
!
! added by Dolf 6/2/2015
! reyneq runs on GPU.
! p1_kernel starts here:
! disabling p1_kernel due to errors: 10/
!grid = dim3(ceiling(real(nx-1)/threads%x), &
! ceiling(real(ny-1)/threads%y),1) !ny-1
!call reyneq_p1_kernel<<<grid,threads>>>(nx,ny,nxmax,nymax,nter,iqpo,icoe,idisc, &
!idirect,pitDev,pirDev,gamaDev,d0Dev,siDev,dtDev,omega0Dev,t1Dev,t2Dev,t3Dev,t4Dev,&
!zdatDev,qndatDev,xrefDev,yrefDev,pDev,poldDev,hDev,hnewDev,awDev,aeDev,asDev,&
!anDev,arDev,apDev,bearxDev,bearyDev,hydnewDev,hyunewDev,hxlnewDev,hxrnewDev,ztaDev,etaDev)
!istat = cudaThreadSynchronize()
!istat = cudaDeviceSynchronize()
!if (istat .ne. 0) write(*,*) ' reyneq_p1 kernel error ',cudaGetErrorString(istat),istat
! reyeq_p1 starts here, OpenAcc directives
! reyenq_p1
!$acc data copyin(ar,aw,p,ae,as,an,ap,xref,yref,hydnew,hyunew,hxlnew,hxrnew,zta,eta,bearx,beary), copyout(aj,bj,cj,dj,p)
!$acc parallel loop gang
do 300 j=2,nym1
jm1=j-1
jp1=j+1
delyjp1=yref(jp1)-yref(j )
delyjm1=yref(j )-yref(jm1)
if(j.ne.nym1) delyjp2=yref(j+2)-yref(jp1)
if(j.ne.2) delyjm2=yref(jm1)-yref(j-2)
ymyp =(yref(jp1)-yref(jm1))/2.d0
!
do 250 i=2,nxm1
im1=i-1
ip1=i+1
delxip1=xref(ip1)-xref(i )
delxim1=xref(i )-xref(im1)
if(i.ne.nxm1) delxip2=xref(i+2)-xref(ip1)
if(i.ne.2) delxim2=xref(im1)-xref(i-2)
xmxp =(xref(ip1)-xref(im1))/2.d0
!
h2im = hydnew(i ,j )
h2ip = hydnew(i+1,j )
h2jm = hxlnew(i ,j )
h2jp = hxlnew(i ,j+1)
!
h2im_b = hyunew(i ,j )
h2ip_b = hyunew(i+1,j )
h2jm_b = hxrnew(i ,j )
h2jp_b = hxrnew(i ,j+1)
!
ztam=zta(i ,j )
ztap=zta(i ,j+1)
etam=eta(i ,j )
etap=eta(i+1,j )
!
p2im = (p(im1,j)+p(i,j))/2.d0
p2ip = (p(ip1,j)+p(i,j))/2.d0
p2jm = (p(i,jm1)+p(i,j))/2.d0
p2jp = (p(i,jp1)+p(i,j))/2.d0
!
pn=p2im*h2im
call flow(pn,qnim)
qnh2im = qnim*h2im*h2im
fmi = h2im*(bearx(im1,j)+bearx(i,j))/2.d0
dmi=qnh2im/delxim1
pemi=fmi/dmi
!
if(etam.eq.1.d0) then
ami_b=0.d0
fmi_b=0.d0
else
pn_b=p2im*h2im_b
call flow(pn_b,qnim_b)
qnh2im_b = qnim_b*h2im_b*h2im_b
fmi_b = h2im_b*(bearx(im1,j)+bearx(i,j))/2.d0
dmi_b=qnh2im_b/delxim1
pemi_b=fmi_b/dmi_b
endif
!
pn=p2ip*h2ip
call flow(pn,qnip)
qnh2ip = qnip*h2ip*h2ip
fpi = h2ip*(bearx(ip1,j)+bearx(i,j))/2.d0
dpi=qnh2ip/delxip1
pepi=fpi/dpi
!
if(etap.eq.1.d0) then
api_b=0.d0
fpi_b=0.d0
else
pn_b=p2ip*h2ip_b
call flow(pn_b,qnip_b)
qnh2ip_b = qnip_b*h2ip_b*h2ip_b
fpi_b = h2ip_b*(bearx(ip1,j)+bearx(i,j))/2.d0
dpi_b=qnh2ip_b/delxip1
pepi_b=fpi_b/dpi_b
endif
!
pn=p2jm*h2jm
call flow(pn,qnjm)
qnh2jm = qnjm*h2jm*h2jm
fmj = h2jm*(beary(i,jm1)+beary(i,j))/2.d0
dmj=qnh2jm/delyjm1
pemj=fmj/dmj
!
if(ztam.eq.1.d0) then
amj_b=0.d0
fmj_b=0.d0
else
pn_b=p2jm*h2jm_b
call flow(pn_b,qnjm_b)
qnh2jm_b = qnjm_b*h2jm_b*h2jm_b
fmj_b = h2jm_b*(beary(i,jm1)+beary(i,j))/2.d0
dmj_b=qnh2jm_b/delyjm1
pemj_b=fmj_b/dmj_b
endif
!
pn=p2jp*h2jp
call flow(pn,qnjp)
qnh2jp = qnjp*h2jp*h2jp
fpj = h2jp*(beary(i,jp1)+beary(i,j))/2.d0
dpj=qnh2jp/delyjp1
pepj=fpj/dpj
!
if(ztap.eq.1.d0) then
apj_b=0.d0
fpj_b=0.d0
else
pn_b=p2jp*h2jp_b
call flow(pn_b,qnjp_b)
qnh2jp_b = qnjp_b*h2jp_b*h2jp_b
fpj_b = h2jp_b*(beary(i,jp1)+beary(i,j))/2.d0
dpj_b=qnh2jp_b/delyjp1
pepj_b=fpj_b/dpj_b
endif
!
go to (11,12,13,14,15),idisc
!
! power law
!
11 ami=dmax1(0.d0,dmi*(1.d0-0.1d0*dabs(pemi))**5)
api=dmax1(0.d0,dpi*(1.d0-0.1d0*dabs(pepi))**5)
amj=dmax1(0.d0,dmj*(1.d0-0.1d0*dabs(pemj))**5)
apj=dmax1(0.d0,dpj*(1.d0-0.1d0*dabs(pepj))**5)
!
if(etam.ne.1.d0) then
ami_b=dmax1(0.d0,dmi_b*(1.d0-0.1d0*dabs(pemi_b))**5)
endif
if(etap.ne.1.d0) then
api_b=dmax1(0.d0,dpi_b*(1.d0-0.1d0*dabs(pepi_b))**5)
endif
if(ztam.ne.1.d0) then
amj_b=dmax1(0.d0,dmj_b*(1.d0-0.1d0*dabs(pemj_b))**5)
endif
if(ztap.ne.1.d0) then
apj_b=dmax1(0.d0,dpj_b*(1.d0-0.1d0*dabs(pepj_b))**5)
endif
goto 20
!
! central difference
!
12 ami=dmi*(1.d0-0.5d0*dabs(pemi))
api=dpi*(1.d0-0.5d0*dabs(pepi))
amj=dmj*(1.d0-0.5d0*dabs(pemj))
apj=dpj*(1.d0-0.5d0*dabs(pepj))
!
if(etam.ne.1.d0) then
ami_b=dmi_b*(1.d0-0.5d0*dabs(pemi_b))
endif
if(etap.ne.1.d0) then
api_b=dpi_b*(1.d0-0.5d0*dabs(pepi_b))
endif
if(ztam.ne.1.d0) then
amj_b=dmj_b*(1.d0-0.5d0*dabs(pemj_b))
endif
if(ztap.ne.1.d0) then
apj_b=dpj_b*(1.d0-0.5d0*dabs(pepj_b))
endif
goto 20
!
! 1st-order upwind: idisc = 2
!
13 ami=dmi
api=dpi
amj=dmj
apj=dpj
!
if(etam.ne.1.d0) then
ami_b=dmi_b
endif
if(etap.ne.1.d0) then
api_b=dpi_b
endif
if(ztam.ne.1.d0) then
amj_b=dmj_b
endif
if(ztap.ne.1.d0) then
apj_b=dpj_b
endif
goto 20
!
! hybrid
!
14 ami=dmax1(0.d0,dmi*(1.0d0-0.5d0*dabs(pemi)))
api=dmax1(0.d0,dpi*(1.0d0-0.5d0*dabs(pepi)))
amj=dmax1(0.d0,dmj*(1.0d0-0.5d0*dabs(pemj)))
apj=dmax1(0.d0,dpj*(1.0d0-0.5d0*dabs(pepj)))
!
if(etam.ne.1.d0) then
ami_b=dmax1(0.d0,dmi_b*(1.0d0-0.5d0*dabs(pemi_b)))
endif
if(etap.ne.1.d0) then
api_b=dmax1(0.d0,dpi_b*(1.0d0-0.5d0*dabs(pepi_b)))
endif
if(ztam.ne.1.d0) then
amj_b=dmax1(0.d0,dmj_b*(1.0d0-0.5d0*dabs(pemj_b)))
endif
if(ztap.ne.1.d0) then
apj_b=dmax1(0.d0,dpj_b*(1.0d0-0.5d0*dabs(pepj_b)))
endif
goto 20
!
! exponential
!
15 ami=dmi*dabs(pemi)/(dexp(dabs(pemi))-1.d0)
api=dpi*dabs(pepi)/(dexp(dabs(pepi))-1.d0)
amj=dmj*dabs(pemj)/(dexp(dabs(pemj))-1.d0)
apj=dpj*dabs(pepj)/(dexp(dabs(pepj))-1.d0)
!
if(etam.ne.1.d0) then
ami_b=dmi_b*dabs(pemi_b)/(dexp(dabs(pemi_b))-1.d0)
endif
if(etap.ne.1.d0) then
api_b=dpi_b*dabs(pepi_b)/(dexp(dabs(pepi_b))-1.d0)
endif
if(ztam.ne.1.d0) then
amj_b=dmj_b*dabs(pemj_b)/(dexp(dabs(pemj_b))-1.d0)
endif
if(ztap.ne.1.d0) then
apj_b=dpj_b*dabs(pepj_b)/(dexp(dabs(pepj_b))-1.d0)
endif
!
20 continue
!
aw(im1,jm1)=-ymyp*(etam*(ami+dmax1( fmi,0.d0)) + (1.d0-etam)*(ami_b+dmax1( fmi_b,0.d0)))
ae(im1,jm1)=-ymyp*(etap*(api+dmax1(-fpi,0.d0)) + (1.d0-etap)*(api_b+dmax1(-fpi_b,0.d0)))
as(im1,jm1)=-xmxp*(ztam*(amj+dmax1( fmj,0.d0)) + (1.d0-ztam)*(amj_b+dmax1( fmj_b,0.d0)))
an(im1,jm1)=-xmxp*(ztap*(apj+dmax1(-fpj,0.d0)) + (1.d0-ztap)*(apj_b+dmax1(-fpj_b,0.d0)))
ap(im1,jm1)=-aw(im1,jm1)-ae(im1,jm1) &
-as(im1,jm1)-an(im1,jm1) &
-ymyp*(etam*fmi+(1.d0-etam)*fmi_b) &
+ymyp*(etap*fpi+(1.d0-etap)*fpi_b) &
-xmxp*(ztam*fmj+(1.d0-ztam)*fmj_b) &
+xmxp*(ztap*fpj+(1.d0-ztap)*fpj_b)
!
if(idirect.ne.1) then
amult = xmxp * ymyp * si / dt / omega0
ar(im1,jm1)=amult*h(i,j)*pold(i,j)
ap(im1,jm1)=ap(im1,jm1)+amult*hnew(i,j)
else
ar(im1,jm1)=0.d0
endif
!
250 continue
300 continue
!$acc end parallel loop
! ! reyneq_p1_kernel ends here
akd=0.d0
akn=0.d0
akdDev = 0.d0
aknDev = 0.d0
!p = pDev
!aw = awDev
!ae = aeDev
!as = asDev
!an = anDev
!ap = apDev
!ar = arDev
!do j=1,ny
! write(105,'(10000e23.15)')(ap(i,j)-1.d0,i=1,nx)
!enddo
! added by Dolf 6/9/15
! reyneq_p2_kernel starts here
!grid = dim3(ceiling(real(nx-1)/threads%x), &
! ceiling(real(ny-1)/threads%y),1) !ny-1
!call reyneq_p2_kernel<<<grid,threads>>>(nx,ny,nxmax,nymax,pDev,ajDev,bjDev,cjDev,djDev, &
!awDev,aeDev,asDev,anDev,apDev,arDev,akdDev,aknDev)
!istat = cudaThreadSynchronize()
!istat = cudaDeviceSynchronize()
!if (istat .ne. 0) write(*,*) ' reyneq_p2 kernel error ',cudaGetErrorString(istat)
! reyeq_p2 starts here
!$acc parallel loop gang
do j=2,nym1
jm1=j-1
jp1=j+1
!$acc loop vector
do i=2,nxm1
im1=i-1
ip1=i+1
aj(im1)=aw(im1,jm1)
bj(im1)=ap(im1,jm1)
cj(im1)=ae(im1,jm1)
dj(im1)=ar(im1,jm1)-as(im1,jm1)*p(i,jm1) - an(im1,jm1)*p(i,jp1)
if(i.eq.2) then
dj(im1)=dj(im1)-aw(im1,jm1)*p(im1,j)
aj(im1)=0.d0
else if (i.eq.nxm1) then
dj(im1)=dj(im1)-ae(im1,jm1)*p(ip1,j)
cj(im1)=0.d0
endif
akd=akd+dabs(aj(im1)*p(im1,j)+bj(im1)*p(i,j) + cj(im1)*p(ip1,j)-dj(im1))
akn=akn+dabs(bj(im1)*p(i,j))
enddo
enddo
!$acc end parallel loop
!reyneq_p2 end
! reyneq_p2_kernel ends here
!
!akd = 0.d0
!akn = 0.d0
!akdh = akdDev
!aknh = aknDev
!do i=2,nx-1
! do j=2,ny-1
! akd = akd + akdh(i,j)
! akn = akn + aknh(i,j)
! enddo
! enddo
!akd = akdDev
!akn = aknDev
ak = akd/akn
!write(*,*)'idirect=',idirect
write(*,*)'ak=',ak
if((mod(nreyneq,5).eq.0)) then
write(6,990)t,nreyneq,ak
990 format(' T(s)=',g12.6,',Iteration=',i3,',Residual=',g12.6)
end if
!C if(ak.lt.akmax.or.nreyneq.gt.nitmax) return
! Bug Fix
if(nreyneq.gt.1.and.(ak.lt.akmax.or.nreyneq.gt.nitmax))return
!
rakpre=ak
kint=0
50 kint=kint+1
!
! first sweep in x-direction
!
! added by Dolf 6/11/15
! reyneq_p3_kernel starts here
! returned P3 here due to problems (race condition)
!grid = dim3(ceiling(real(nx-1)/threads%x), &
! ceiling(real(ny-1)/threads%y),1) !ny-1
!call reyneq_p3_kernel<<<grid,threads>>>(nx,ny,nxmax,nymax,pDev,xrefDev,yrefDev, &
!awDev,apDev,aeDev,arDev,asDev,anDev,ajDev,bjDev,cjDev,djDev,betaDev,gammaDev)
!istat = cudaDeviceSynchronize()
!if (istat .ne. 0) write(*,*) ' reyneq_p3 kernel error ',cudaGetErrorString(istat)
!open(100, file='aw0d5.dat', status='unknown')
!do j=1,ny
! write(100,'(10000e23.15)')(aw(i,j),i=1,nx)
!enddo
!close(100)
!open(100, file='ae0d5.dat', status='unknown')
!do j=1,ny
! write(100,'(10000e23.15)')(ae(i,j),i=1,nx)
!enddo
!close(100)
!open(100, file='an0d5.dat', status='unknown')
!do j=1,ny
! write(100,'(10000e23.15)')(an(i,j),i=1,nx)
!enddo
!close(100)
!open(100, file='as0d5.dat', status='unknown')
!do j=1,ny
! write(100,'(10000e23.15)')(as(i,j),i=1,nx)
!enddo
!close(100)
!open(100, file='ap0d5.dat', status='unknown')
!do j=1,ny
! write(100,'(10000e23.15)')(ap(i,j),i=1,nx)
!enddo
!close(100)
!open(100, file='ar0d5.dat', status='unknown')
!do j=1,ny
! write(100,'(10000e23.15)')(ar(i,j),i=1,nx)
!enddo
!close(100)
!reyneq_p3 start
aj = 0.d0
bj = 0.d0
cj = 0.d0
dj = 0.d0
do 500 j=2,nym1
jm1=j-1
jp1=j+1
ymyp=(yref(jp1)-yref(jm1))/2.d0
!
do 450 i=2,nxm1
im1=i-1
ip1=i+1
xmxp=(xref(ip1)-xref(im1))/2.d0
!
aj(im1)=aw(im1,jm1)
bj(im1)=ap(im1,jm1)
cj(im1)=ae(im1,jm1)
dj(im1)=ar(im1,jm1)-as(im1,jm1)*p(i,jm1) - an(im1,jm1)*p(i,jp1)
!
if(i.eq.2) then
dj(im1)=dj(im1)-aw(im1,jm1)*p(im1,j)
aj(im1)=0.d0
else if (i.eq.nxm1) then
dj(im1)=dj(im1)-ae(im1,jm1)*p(ip1,j)
cj(im1)=0.d0
endif
!
450 continue
call tridag(nxm2,aj,bj,cj,dj)
do 475 i=1,nxm2
p(i+1,j)=dj(i)
475 continue
500 continue
! end reyneq_p3
!p = pDev
! added by Dolf 6/12/15
! _p4_kernel starts here
pDev = p
!aj1Dev = aj
!bj1Dev = bj
!cj1Dev = cj
!dj1Dev = dj
!cpu version of p4
!open acc directives
! data copyin(aj,bj,cj,dj,ar,aw,p,ae,as,an,ap,xref,yref), copyout(aj,bj,cj,dj)
!$acc parallel loop gang
do i=2,nxm1
im1=i-1
ip1=i+1
xmxp=(xref(ip1)-xref(im1))/2.d0
!
aj(im1)=0.d0
bj(im1)=0.d0
cj(im1)=0.d0
dj(im1)=0.d0
aj1=0.d0
bj1=0.d0
cj1=0.d0
dj1=0.d0
!$acc loop vector
do j=2,nym1
jm1=j-1
jp1=j+1
ymyp=(yref(jp1)-yref(jm1))/2.d0
!
aj1=aj1+aw(im1,jm1)
bj1=bj1+ap(im1,jm1)
if(j.ne.2) bj1=bj1+as(im1,jm1)
if(j.ne.nym1) bj1=bj1+an(im1,jm1)
cj1=cj1+ae(im1,jm1)
dj1=dj1+ar(im1,jm1)-aw(im1,jm1)*p(im1,j) &
-ae(im1,jm1)*p(ip1,j)-as(im1,jm1)*p(i,jm1) &
-an(im1,jm1)*p(i,jp1)-ap(im1,jm1)*p(i ,j)
!
enddo
aj(im1)=aj1
bj(im1)=bj1
cj(im1)=cj1
dj(im1)=dj1
enddo
!$acc end parallel loop
!$acc end data
I’m assuming that “flow” doesn’t have an interface. If it did, then you’d put the directive in the interface instead.
what do you mean? please look at flow subroutine above and tell me if it’s interface or not.
The biggest problem I see is that you’re using a computed Goto.
why would it be a problem?
Though you might consider changing this obsolete feature to a case statement.
how would I change it to case statements?
here is the output of the compiler when I try to compile the code. It fails actually.
------ Rebuild All started: Project: Dynamic5, Configuration: Release x64 ------
Deleting intermediate and output files for project 'Dynamic5', configuration 'Release'
Compiling Project ...
Common.F90
Kernels.F90
Dynamic.F90
FESusp.F90
Force.F90
Grid.F90
Init.F90
Mass.F90
Misc.F90
Modes.F90
NewmarkBeta.F90
OptionalParams.F90
Output.F90
Rail.F90
Reynolds.F90
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(138) : error S0155 : Invalid accelerator data region: branching into or out of region is not allowed
reyneq:
138, Invalid accelerator data region: branching into or out of region is not allowed
139, Accelerator kernel generated
Generating Tesla code
140, !$acc loop gang ! blockidx%x
139, Generating copyin(yref(:),hnew(:,:))
Generating copy(ar(:,:))
Generating copyin(h(:,:),pold(:,:))
Generating copy(qnjp_b,qnjp,qnjm_b)
Generating copyin(beary(:,:))
Generating copy(qnjm,qnip_b,qnip,pn_b,qnim_b)
Generating copyin(bearx(:,:))
Generating copy(qnim,pn)
Generating copyin(p(:,:),eta(:,:),zta(:,:),hxrnew(:,:),hyunew(:,:),hxlnew(:,:),hydnew(:,:),xref(:))
Generating copy(aw(:,:),ae(:,:),as(:,:),an(:,:),ap(:,:))
140, Accelerator restriction: scalar variable live-out from loop: pn_b,pn
149, Loop carried scalar dependence for ami_b,dmi_b,pemi_b,api_b,dpi_b,pepi_b,amj_b,dmj_b,pemj_b,apj_b,dpj_b,pepj_b at line 307
Accelerator restriction: scalar variable live-out from loop: pn_b,pn
417, Accelerator kernel generated
Generating Tesla code
418, !$acc loop gang ! blockidx%x
422, !$acc loop vector(128) ! threadidx%x
417, Generating copyin(p(1:nxm1+1,1:nym1+1),aw(:nxm1-1,:nym1-1))
Generating copy(aj(:nxm1-1))
Generating copyin(ap(:nxm1-1,:nym1-1))
Generating copy(bj(:nxm1-1))
Generating copyin(ae(:nxm1-1,:nym1-1))
Generating copy(cj(:nxm1-1))
Generating copyin(ar(:nxm1-1,:nym1-1),as(:nxm1-1,:nym1-1),an(:nxm1-1,:nym1-1))
Generating copy(dj(:nxm1-1))
422, Loop is parallelizable
567, Accelerator kernel generated
Generating Tesla code
568, !$acc loop gang ! blockidx%x
583, !$acc loop vector(128) ! threadidx%x
567, Generating copyout(aj(:nxm1-1),bj(:nxm1-1),cj(:nxm1-1),dj(:nxm1-1))
Generating copyin(p(1:nxm1+1,1:nym1+1),ar(:nxm1-1,:nym1-1),ae(:nxm1-1,:nym1-1),an(:nxm1-1,:nym1-1),ap(:nxm1-1,:nym1-1),aw(:nxm1-1,:nym1-1),as(:nxm1-1,:nym1-1))
583, Scalar last value needed after loop for bj1 at line 599
Accelerator restriction: scalar variable live-out from loop: bj1
0 inform, 0 warnings, 1 severes, 0 fatal for reyneq
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(1489) : error S0155 : Module variables are not supported in acc routine - iqpo
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(1495) : error S0155 : Module variables are not supported in acc routine - t1
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(1500) : error S0155 : Module variables are not supported in acc routine - t2
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(1508) : error S0155 : Module variables are not supported in acc routine - t3
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(1509) : error S0155 : Module variables are not supported in acc routine - t4
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(1515) : error S0155 : Module variables are not supported in acc routine - d0
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(1516) : error S0155 : Module variables are not supported in acc routine - icoe
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(1519) : error S0155 : Module variables are not supported in acc routine - pir
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(1520) : error S0155 : Module variables are not supported in acc routine - gama
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(1521) : error S0155 : Module variables are not supported in acc routine - nter
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(1537) : error S0155 : Module variables are not supported in acc routine - pit
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(1583) : error S0155 : Module variables are not supported in acc routine - zdat
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(1583) : error S0155 : Module variables are not supported in acc routine - zdat
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(1585) : error S0155 : Module variables are not supported in acc routine - qndat
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(1585) : error S0155 : Module variables are not supported in acc routine - qndat
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(1586) : error S0155 : Module variables are not supported in acc routine - zdat
C:\D Drive\CML codes\Dynamic5a\Reynolds.F90(1587) : error S0155 : Module variables are not supported in acc routine - qndat
flow:
1466, Generating acc routine seq
1521, Loop is parallelizable
1547, Loop carried scalar dependence for ff0 at line 1551
Scalar last value needed after loop for ff0 at line 1562
Loop carried scalar dependence for f1 at line 1557
Scalar last value needed after loop for f1 at line 1563
Loop carried scalar dependence for f2 at line 1559
Scalar last value needed after loop for f2 at line 1564
0 inform, 0 warnings, 17 severes, 0 fatal for flow
Roughness.F90
Stiff.F90
String.F90
Suspension.F90
UserDefGeom.F90
Util.F90
Dynamic5 build failed.
Build log was saved at "file://C:\D Drive\CML codes\Dynamic5a\x64\Release\BuildLog.htm"
========== Rebuild All: 0 succeeded, 1 failed, 0 skipped ==========
thanks,
Dolf