Hi, I am running a code on a V100 using the following compiler and flags:
Compiler= pgfortran
Flags = -acc -fast -mcmodel=medium -ta=tesla:managed -Minfo=accel
While executing the whole code, one of the loop structures runs far slower than expected. Other loops with the same kind of loop scheduling and almost the same number of floating-point operations run about 2 times faster than this nested loop structure. I want to know why this one is running slowly.
Some of my observations:
My first suspect is the creation of the arrays Visc_Fflux, Visc_Gflux, Visc_Hflux and Total_Flux at the start of the region using the data clause. These arrays are needed only in that subroutine. I also noticed a similar problem in another subroutine: an array is created at the start, and that subroutine runs slowly in the same way as this one. Another observation is that both of these subroutines run fine for NK = 1 (other parameters remaining the same). I could declare these variables as global, but there are already many global variables.
I also noticed that the GPU is idle and no computations are running while the arrays (Visc_Fflux, Visc_Gflux, Visc_Hflux, Total_Flux) are being deallocated at the end, at "!$acc end data".
I think this may also be because of the large number of large 5D arrays that are present in it (Ix, Qp_I, Jx, Qp_J, Kx, ...). Is the latency in reading data from RAM the cause?
parameters:
nblocks = 1
NK = 59
NJ = 255
NI = 599
The OpenACC region that is running slowly:
! Device data region for the viscous-flux work arrays.
! create: device storage is requested for the four arrays; no host<->device copy is asked for.
! Each array has dimension NI x NJ x NK x nblocks x 4.
!$acc data create(Visc_Fflux,Visc_Gflux,Visc_Hflux, Total_Flux)
! NOTE(review): the "loop seq" directive below sits outside any compute construct, so the
! block loop is an ordinary host loop that launches one parallel region per block.
! Confirm the compiler treats the directive as a no-op; if so it can be dropped.
!$acc loop seq
DO nbl = 1,nblocks
  ! (k,j) are collapsed and spread over gangs; i (the first array index) is the vector loop.
  ! default(present): every array referenced in the kernel must already be on the device.
  !$acc parallel loop gang collapse(2) default(present)
  DO k = 1, NK(nbl)
    DO j = 1, NJ(nbl)
      !$acc loop vector
      DO i = 1, NI(nbl)
        ! Derivatives of Qp components 1..5 (named r,u,v,w,p here) in x, y and z, built as
        ! metric term (I*,J*,K*) times the matching Qp_I/Qp_J/Qp_K entry, summed over the
        ! three directions. Presumably a chain-rule transform from computational to
        ! Cartesian space - TODO confirm against the metric definitions.
        rx = Ix(i,j,k,nbl)*Qp_I(i,j,k,nbl,1) + Jx(i,j,k,nbl)*Qp_J(i,j,k,nbl,1) + Kx(i,j,k,nbl)*Qp_K(i,j,k,nbl,1)
        ux = Ix(i,j,k,nbl)*Qp_I(i,j,k,nbl,2) + Jx(i,j,k,nbl)*Qp_J(i,j,k,nbl,2) + Kx(i,j,k,nbl)*Qp_K(i,j,k,nbl,2)
        vx = Ix(i,j,k,nbl)*Qp_I(i,j,k,nbl,3) + Jx(i,j,k,nbl)*Qp_J(i,j,k,nbl,3) + Kx(i,j,k,nbl)*Qp_K(i,j,k,nbl,3)
        wx = Ix(i,j,k,nbl)*Qp_I(i,j,k,nbl,4) + Jx(i,j,k,nbl)*Qp_J(i,j,k,nbl,4) + Kx(i,j,k,nbl)*Qp_K(i,j,k,nbl,4)
        px = Ix(i,j,k,nbl)*Qp_I(i,j,k,nbl,5) + Jx(i,j,k,nbl)*Qp_J(i,j,k,nbl,5) + Kx(i,j,k,nbl)*Qp_K(i,j,k,nbl,5)
        ry = Iy(i,j,k,nbl)*Qp_I(i,j,k,nbl,1) + Jy(i,j,k,nbl)*Qp_J(i,j,k,nbl,1) + Ky(i,j,k,nbl)*Qp_K(i,j,k,nbl,1)
        uy = Iy(i,j,k,nbl)*Qp_I(i,j,k,nbl,2) + Jy(i,j,k,nbl)*Qp_J(i,j,k,nbl,2) + Ky(i,j,k,nbl)*Qp_K(i,j,k,nbl,2)
        vy = Iy(i,j,k,nbl)*Qp_I(i,j,k,nbl,3) + Jy(i,j,k,nbl)*Qp_J(i,j,k,nbl,3) + Ky(i,j,k,nbl)*Qp_K(i,j,k,nbl,3)
        wy = Iy(i,j,k,nbl)*Qp_I(i,j,k,nbl,4) + Jy(i,j,k,nbl)*Qp_J(i,j,k,nbl,4) + Ky(i,j,k,nbl)*Qp_K(i,j,k,nbl,4)
        py = Iy(i,j,k,nbl)*Qp_I(i,j,k,nbl,5) + Jy(i,j,k,nbl)*Qp_J(i,j,k,nbl,5) + Ky(i,j,k,nbl)*Qp_K(i,j,k,nbl,5)
        rz = Iz(i,j,k,nbl)*Qp_I(i,j,k,nbl,1) + Jz(i,j,k,nbl)*Qp_J(i,j,k,nbl,1) + Kz(i,j,k,nbl)*Qp_K(i,j,k,nbl,1)
        uz = Iz(i,j,k,nbl)*Qp_I(i,j,k,nbl,2) + Jz(i,j,k,nbl)*Qp_J(i,j,k,nbl,2) + Kz(i,j,k,nbl)*Qp_K(i,j,k,nbl,2)
        vz = Iz(i,j,k,nbl)*Qp_I(i,j,k,nbl,3) + Jz(i,j,k,nbl)*Qp_J(i,j,k,nbl,3) + Kz(i,j,k,nbl)*Qp_K(i,j,k,nbl,3)
        wz = Iz(i,j,k,nbl)*Qp_I(i,j,k,nbl,4) + Jz(i,j,k,nbl)*Qp_J(i,j,k,nbl,4) + Kz(i,j,k,nbl)*Qp_K(i,j,k,nbl,4)
        pz = Iz(i,j,k,nbl)*Qp_I(i,j,k,nbl,5) + Jz(i,j,k,nbl)*Qp_J(i,j,k,nbl,5) + Kz(i,j,k,nbl)*Qp_K(i,j,k,nbl,5)
        ! ===============================
        ! T from the ratio Qp(5)/Qp(1); mu_L from T.
        ! NOTE(review): the mu_L formula looks like a non-dimensional Sutherland law - confirm.
        T = Gamma*Mach**2*Qp(i,j,k,nbl,5)/Qp(i,j,k,nbl,1)
        mu_L = (T**1.5d0)*(1.4045988d0)/(T + 0.4045988d0)/Re
        ! ===============================
        ! Stress terms: -2/3 of the divergence (ux + vy + wz) is added to each diagonal entry.
        Blk_Visc_Term = (-2.d0/3.d0)*(ux + vy + wz)
        tau_xx = mu_L*(2.d0*ux + Blk_Visc_Term)
        tau_xy = mu_L*(uy + vx)
        tau_yy = mu_L*(2.d0*vy + Blk_Visc_Term)
        tau_yz = mu_L*(vz + wy)
        tau_zz = mu_L*(2.d0*wz + Blk_Visc_Term)
        tau_zx = mu_L*(wx + uz)
        ! Fourth-component terms: stress row times Qp(2:4), plus the
        ! mu_L*funGamma*(p?/Qp(1) - Qp(5)/Qp(1)**2 * r?) contribution.
        ! The square uses an integer exponent (as Mach**2 above) so that no general
        ! real-power evaluation is requested for a plain squaring.
        bx = (tau_xx*Qp(i,j,k,nbl,2) + tau_xy*Qp(i,j,k,nbl,3) + tau_zx*Qp(i,j,k,nbl,4)) &
        + mu_L*funGamma*(px/Qp(i,j,k,nbl,1) - Qp(i,j,k,nbl,5)/(Qp(i,j,k,nbl,1))**2*rx)
        by = (tau_xy*Qp(i,j,k,nbl,2) + tau_yy*Qp(i,j,k,nbl,3) + tau_yz*Qp(i,j,k,nbl,4)) &
        + mu_L*funGamma*(py/Qp(i,j,k,nbl,1) - Qp(i,j,k,nbl,5)/(Qp(i,j,k,nbl,1))**2*ry)
        bz = (tau_zx*Qp(i,j,k,nbl,2) + tau_yz*Qp(i,j,k,nbl,3) + tau_zz*Qp(i,j,k,nbl,4)) &
        + mu_L*funGamma*(pz/Qp(i,j,k,nbl,1) - Qp(i,j,k,nbl,5)/(Qp(i,j,k,nbl,1))**2*rz)
        ! Fluxes: stress/b terms weighted by the I-, J- and K-metrics and divided by Jac.
        ! Component 3 of the F and G fluxes is scaled by (1-f2D).
        Visc_Fflux(i,j,k,nbl,1) =(Ix(i,j,k,nbl)*tau_xx+Iy(i,j,k,nbl)*tau_xy + Iz(i,j,k,nbl)*tau_zx)/Jac(i,j,k,nbl)
        Visc_Fflux(i,j,k,nbl,2) =(Ix(i,j,k,nbl)*tau_xy+Iy(i,j,k,nbl)*tau_yy + Iz(i,j,k,nbl)*tau_yz)/Jac(i,j,k,nbl)
        Visc_Fflux(i,j,k,nbl,3) =(Ix(i,j,k,nbl)*tau_zx+Iy(i,j,k,nbl)*tau_yz + Iz(i,j,k,nbl)*tau_zz)/Jac(i,j,k,nbl)*(1-f2D)
        Visc_Fflux(i,j,k,nbl,4) =(Ix(i,j,k,nbl)*bx +Iy(i,j,k,nbl)*by + Iz(i,j,k,nbl)*bz )/Jac(i,j,k,nbl)
        Visc_Gflux(i,j,k,nbl,1) =(Jx(i,j,k,nbl)*tau_xx+Jy(i,j,k,nbl)*tau_xy + Jz(i,j,k,nbl)*tau_zx)/Jac(i,j,k,nbl)
        Visc_Gflux(i,j,k,nbl,2) =(Jx(i,j,k,nbl)*tau_xy+Jy(i,j,k,nbl)*tau_yy + Jz(i,j,k,nbl)*tau_yz)/Jac(i,j,k,nbl)
        Visc_Gflux(i,j,k,nbl,3) =(Jx(i,j,k,nbl)*tau_zx+Jy(i,j,k,nbl)*tau_yz + Jz(i,j,k,nbl)*tau_zz)/Jac(i,j,k,nbl)*(1-f2D)
        Visc_Gflux(i,j,k,nbl,4) =(Jx(i,j,k,nbl)*bx +Jy(i,j,k,nbl)*by + Jz(i,j,k,nbl)*bz )/Jac(i,j,k,nbl)
        ! NOTE(review): unlike the F and G fluxes, Visc_Hflux(...,3) carries no (1-f2D) factor.
        ! Left as in the original; confirm whether this is intentional.
        Visc_Hflux(i,j,k,nbl,1) =(Kx(i,j,k,nbl)*tau_xx+ Ky(i,j,k,nbl)*tau_xy+ Kz(i,j,k,nbl)*tau_zx)/Jac(i,j,k,nbl)
        Visc_Hflux(i,j,k,nbl,2) =(Kx(i,j,k,nbl)*tau_xy+ Ky(i,j,k,nbl)*tau_yy+ Kz(i,j,k,nbl)*tau_yz)/Jac(i,j,k,nbl)
        Visc_Hflux(i,j,k,nbl,3) =(Kx(i,j,k,nbl)*tau_zx+ Ky(i,j,k,nbl)*tau_yz+ Kz(i,j,k,nbl)*tau_zz)/Jac(i,j,k,nbl)
        Visc_Hflux(i,j,k,nbl,4) =(Kx(i,j,k,nbl)*bx + Ky(i,j,k,nbl)*by + Kz(i,j,k,nbl)*bz )/Jac(i,j,k,nbl)
      ENDDO
    ENDDO
  ENDDO
ENDDO
<some operations with Visc_Fflux,Visc_Gflux,Visc_Hflux, Total_Flux)
......
......>
!$acc end data