Thanks Mat, I have corrected the problem and now it’s working flawlessly.

I have this subroutine, and I want to know which way I should make it run on GPU.

Question 1: since it has multiple nested do loops. what shall I do?

- make it run on GPU by adding !$acc routine seq at the start of the subroutine?
- go through each do loop and use !$acc parallel loop directives?

Question 2: what does !$acc routine seq do exactly?

Question 3: if I have data region (example code 1 below) and a GPU subroutine was called within that region, what should be the method used to run subroutine on GPU? Question 1.1 or 1.2? or it does not matter?

do we treat !$acc data like any other !$acc directive? in that case I have to use !$acc routine seq inside the subroutine (Q1.1).

Please advice,

thanks!

Dolf

here is the subroutine:

```
!================================================================
subroutine acmdn(aw1,ae1,as1,an1,ap1,ar1,p1,aw2,ae2,as2, &
an2,ap2,ar2,dp2,nx1,ny1,nx2,ny2,nx1max,ny1max,nx2max, &
ny2max,nlimit)
!================================================================
use Dyn_sizes_module
implicit none
real(8) :: AR1, AR2, DJ, CJ, AP2, AP1, BJ, AKN, AN2, RATIO, AKPRE, AKD, AS2, AS1, P1
real(8) :: AK, AJ, AW2, AW1, AE1, AE2, DP2, DABS, AN1
integer :: K1, NX2M2, NX2M1, NX1, NX2, I1, I2, KINT, NY1MAX, K, L, NY2M1, NY2M2
integer :: L1, NX1MAX, NLIMIT, NY2, NY1, J1, J2, NY2MAX, NX2MAX
dimension aw1(nx1max,ny1max),ae1(nx1max,ny1max), &
as1(nx1max,ny1max),an1(nx1max,ny1max), &
ap1(nx1max,ny1max),ar1(nx1max,ny1max), &
aw2(nx2max,ny2max),ae2(nx2max,ny2max), &
as2(nx2max,ny2max),an2(nx2max,ny2max), &
ap2(nx2max,ny2max),ar2(nx2max,ny2max), &
p1(nx1max,ny1max), dp2(nx2max,ny2max)
dimension aj(nxmax),bj(nxmax),cj(nxmax),dj(nxmax)
nx2m1=nx2-1
ny2m1=ny2-1
nx2m2=nx2-2
ny2m2=ny2-2
do k=2,nx2m1
k1=k-1
do l=2,ny2m1
l1=l-1
i1=2*(k-1)-1
i2=2*(k-1)+1-1
j1=2*(l-1)-1
j2=2*(l-1)+1-1
ap2(k1,l1)=ap1(i1,j1)+ap1(i2,j1)+ap1(i1,j2)+ap1(i2,j2)+ &
ae1(i1,j1)+ae1(i1,j2)+aw1(i2,j1)+aw1(i2,j2)+ &
as1(i1,j2)+as1(i2,j2)+an1(i1,j1)+an1(i2,j1)
aw2(k1,l1)=aw1(i1,j1)+aw1(i1,j2)
ae2(k1,l1)=ae1(i2,j1)+ae1(i2,j2)
as2(k1,l1)=as1(i1,j1)+as1(i2,j1)
an2(k1,l1)=an1(i1,j2)+an1(i2,j2)
ar2(k1,l1)=ar1(i1,j1)+ar1(i2,j1)+ar1(i1,j2)+ar1(i2,j2)- &
ap1(i1,j1)*p1(i1+1,j1+1)-ap1(i2,j1)*p1(i2+1,j1+1)- &
ap1(i1,j2)*p1(i1+1,j2+1)-ap1(i2,j2)*p1(i2+1,j2+1)- &
aw1(i1,j1)*p1(i1 ,j1+1)-aw1(i1,j2)*p1(i1 ,j2+1)- &
ae1(i1,j1)*p1(i2+1,j1+1)-ae1(i1,j2)*p1(i2+1,j2+1)- &
as1(i1,j1)*p1(i1+1,j1 )-as1(i1,j2)*p1(i1+1,j1+1)- &
an1(i1,j1)*p1(i1+1,j2+1)-an1(i1,j2)*p1(i1+1,j2+2)- &
aw1(i2,j1)*p1(i1+1,j1+1)-aw1(i2,j2)*p1(i1+1,j2+1)- &
ae1(i2,j1)*p1(i2+2,j1+1)-ae1(i2,j2)*p1(i2+2,j2+1)- &
as1(i2,j1)*p1(i2+1,j1 )-as1(i2,j2)*p1(i2+1,j1+1)- &
an1(i2,j1)*p1(i2+1,j2+1)-an1(i2,j2)*p1(i2+1,j2+2)
end do
end do
!
do k=1,nx2
do l=1,ny2
dp2(k,l)=0.d0
end do
end do
!
kint=0
50 kint=kint+1
do 200 l=2,ny2m1
l1=l-1
do 100 k=2,nx2m1
k1=k-1
aj(k1)=aw2(k1,l1)
bj(k1)=ap2(k1,l1)
cj(k1)=ae2(k1,l1)
dj(k1)=-as2(k1,l1)*dp2(k,l1)-an2(k1,l1)*dp2(k,l+1) + ar2(k1,l1)
100 continue
aj(1)=0.d0
cj(nx2m2)=0.d0
call tridag(nx2m2,aj,bj,cj,dj)
do 150 k=1,nx2m2
dp2(k+1,l)=dj(k)
150 continue
200 continue
do k=2,nx2m1
k1=k-1
!
aj(k1)=0.d0
bj(k1)=0.d0
cj(k1)=0.d0
dj(k1)=0.d0
!
do l=2,ny2m1
l1=l-1
aj(k1)=aj(k1)+aw2(k1,l1)
bj(k1)=bj(k1)+ap2(k1,l1)
if(l.ne.2) bj(k1)=bj(k1)+as2(k1,l1)
if(l.ne.ny2m1) bj(k1)=bj(k1)+an2(k1,l1)
cj(k1)=cj(k1)+ae2(k1,l1)
dj(k1)=dj(k1)+ar2(k1,l1)-aw2(k1,l1)*dp2(k1,l) &
-ae2(k1,l1)*dp2(k+1,l)-as2(k1,l1)*dp2(k,l1) &
-an2(k1,l1)*dp2(k,l+1)-ap2(k1,l1)*dp2(k,l)
enddo
enddo
aj(1)=0.d0
cj(nx2m2)=0.d0
call tridag(nx2m2,aj,bj,cj,dj)
do k=1,nx2m2
do l=1,ny2m2
dp2(k+1,l+1)=dp2(k+1,l+1)+dj(k)
enddo
enddo
do 400 k=2,nx2m1
k1=k-1
do 300 l=2,ny2m1
l1=l-1
aj(l1)=as2(k1,l1)
bj(l1)=ap2(k1,l1)
cj(l1)=an2(k1,l1)
dj(l1)=-aw2(k1,l1)*dp2(k1,l)-ae2(k1,l1)*dp2(k+1,l) + ar2(k1,l1)
300 continue
aj(1)=0.d0
cj(ny2m2)=0.d0
call tridag(ny2m2,aj,bj,cj,dj)
do 350 l=1,ny2m2
dp2(k,l+1)=dj(l)
350 continue
400 continue
do l=2,ny2m1
l1=l-1
!
aj(l1)=0.d0
bj(l1)=0.d0
cj(l1)=0.d0
dj(l1)=0.d0
!
do k=2,nx2m1
k1=k-1
aj(l1)=aj(l1)+as2(k1,l1)
bj(l1)=bj(l1)+ap2(k1,l1)
if(k.ne.2) bj(l1)=bj(l1)+aw2(k1,l1)
if(k.ne.nx2m1) bj(l1)=bj(l1)+ae2(k1,l1)
cj(l1)=cj(l1)+an2(k1,l1)
dj(l1)=dj(l1)+ar2(k1,l1)-aw2(k1,l1)*dp2(k1,l) &
-ae2(k1,l1)*dp2(k+1,l)-as2(k1,l1)*dp2(k,l1) &
-an2(k1,l1)*dp2(k,l+1)-ap2(k1,l1)*dp2(k,l)
enddo
enddo
aj(1)=0.d0
cj(ny2m2)=0.d0
call tridag(ny2m2,aj,bj,cj,dj)
do l=1,ny2m2
do k=1,nx2m2
dp2(k+1,l+1)=dp2(k+1,l+1)+dj(l)
enddo
enddo
!
akd=0.d0
akn=0.d0
do k=2,nx2m1
k1=k-1
do l=2,ny2m1
l1=l-1
aj(l1)=as2(k1,l1)
bj(l1)=ap2(k1,l1)
cj(l1)=an2(k1,l1)
dj(l1)=-aw2(k1,l1)*dp2(k1,l)-ae2(k1,l1)*dp2(k+1,l) + ar2(k1,l1)
akd=akd+dabs(aj(l1)*dp2(k,l1)+bj(l1)*dp2(k,l) + cj(l1)*dp2(k,l+1)-dj(l1))
akn=akn+dabs(bj(l1)*dp2(k,l))
enddo
enddo
!
ak=akd/akn
if(kint.eq.1) then
akpre=ak
goto 50
end if
ratio=ak/akpre
akpre=ak
return
end
```

```
!=======================================================
subroutine tridag(n,a,b,c,d)
!=======================================================
!....invert a tri-diagonal matrix
use Dyn_sizes_module
implicit real*8(a-h,o-z)
!$ACC routine seq
dimension a(1),b(1),c(1),d(1)
dimension beta(402),gamma(402)
beta(1)=b(1)
gamma(1)=d(1)/beta(1)
do 10 i=2,n
im1=i-1
beta(i)=b(i)-a(i)*c(im1)/beta(im1)
gamma(i)=(d(i)-a(i)*gamma(im1))/beta(i)
10 continue
d(n)=gamma(n)
do 20 i=n-1,1,-1
d(i)=gamma(i)-c(i)*d(i+1)/beta(i)
20 continue
return
end
```

```
!$acc data copyin(p,aw,ae......) &
!$acc copyout(p)
call acmdn(........)
!$acc end data
```