ok. its still running “fast”, but the GPU run is still not converging compared with the CPU multicore run, so I need to figure out what the difference is. The -Minfo=accel was useful though because it helped me to spot a bug which I hadn’t noticed before, and wasn’t being picked up by the compiler. So what else can I look for to spot the difference?
!$acc data copy(nei,oc,oleaf,wigg,u,v,w,p,r,t,m)
C-----start iterations
DO it = 1,nits
!$acc parallel loop present(r,u,v,w,p,t,m)
!$acc& private(Q,E1,E2,F1,F2,G1,G2)
!$acc& private(rn,un,vn,wn,pn,tn,mn)
!$acc& reduction(+: resid)
!$acc& reduction(max: maxvel,maxwig)
C-----loop through nleaf cells
DO n = 1,nleaf
C-----
C do all the calculations which
C give rise to new values for
C allocated flow variables (r,u,v,w,p,t,m)
C-----
ENDDO ! leaf cells loop
ENDDO ! main iteration
!$acc update self(r,u,v,w,p,t,m)
!$acc end data
flow_solve:
2032, Generating copy(m(:),nei(:,:),oc(:),w(:),wigg(:),t(:),u(:),v(:),oleaf(:),p(:),r(:)) [if not already present]
2048, Generating present(p(:),t(:),u(:),v(:),w(:),m(:),r(:))
Generating copy(resid(:)) [if not already present]
Generating implicit copy(maxwig) [if not already present]
Generating NVIDIA GPU code
2056, !$acc loop gang, vector(128) ! blockidx%x threadidx%x
Generating reduction(max:maxwig)
Generating reduction(+:resid(:))
Generating reduction(max:maxvel)
2071, !$acc loop seq
2162, !$acc loop seq
2558, !$acc loop seq
2048, Local memory used for pn,e1,e2,f1,f2,g1,resid,tn,un,wn,vn,mn,rn,g2,q
Generating implicit copy(maxvel) [if not already present]
2056, Generating implicit private(rsum,avg1,nb5,nb4,nb3,nb2,nb1,mwig,msum,zwig,ywig,xwig,mag2,dt,dotp,ccln,v2,qzz,tzz,tzy,tzx,tyz,txz,et,mm,magvel,pp,owid,tt,tsum,ww,wsum,vv,vsum,uu,usum,rr,qyy,tyy,tyx,txy,qxx,psum,txx,dtdz,dwdz,dvdz,dudz,dtdy,dwdy,dvdy,dudy,dtdx,dwdx,dvdx,dudx,dtloc,tweak,mag1,wnew,vnew,vmag,unew,diff1,cs,wnew1,vnew1,unew1,wnew2,vnew2,unew2,diff2,nb6,nrm2,nrm1,nmag,wnrm,vnrm,unrm,wold,vold,uold,srinv,avg2,olfn,nrm3,olev)
2071, Loop is parallelizable
Generating implicit private(nb)
2162, Generating implicit private(nb8,nb7,nb6,nb5,nb4,nb3,nb2,nb1,nb,cs,ma)
2302, Reference argument passing prevents parallelization:
2558, Loop is parallelizable
Generating implicit private(rhs,gflux,fflux,eflux)
2694, Generating update self(r(:),m(:),w(:),v(:),u(:),t(:),p(:))