Hello,

I work on a Fortran MPI application that I want to accelerate using OpenACC directives.

I compile the code like this (pgf90 version 16.4, running on Linux):

pgf90 -O3 -Mvect -Munroll -Mextend -Mmpi=sgimpi -mcmodel=medium -traceback -Mpreprocess -acc -Minfo -Minfo=accel -DGM_GPU -c ./SRC/inter.f

I get the following output from the compiler :

```
PGF90-S-0155-Compiler failed to translate accelerator region (see -Minfo messages): Unexpected flow graph (./SRC/inter.f: 89)
inter:
55, Generating present(q(:,:,:,:),qp(:,:,:,:),dist(:,:,:,:),mask(:,:,:),xa(:),yb(:),zc(:),ineigh(:,:,:,:),jneigh(:,:,:,:),kneigh(:,:,:,:),xip(:,:,:),yip(:,:,:),zip(:,:,:),matinv(:,:,:,:))
89, CUDA shared memory used for qip
Accelerator kernel generated
Generating Tesla code
92, !$acc loop gang collapse(3) ! blockidx%x
93, ! blockidx%x collapsed
94, ! blockidx%x collapsed
112, !$acc loop vector(128) ! threadidx%x
133, !$acc loop vector(128) ! threadidx%x
141, !$acc loop vector(128) ! threadidx%x
144, !$acc loop vector(128) ! threadidx%x
154, !$acc loop vector(128) ! threadidx%x
163, !$acc loop vector(128) ! threadidx%x
169, !$acc loop vector(128) ! threadidx%x
181, !$acc loop vector(128) ! threadidx%x
184, !$acc loop vector(128) ! threadidx%x
92, Loop not vectorized/parallelized: too deeply nested
93, Loop not vectorized/parallelized: too deeply nested
107, Scalar last value needed after loop for m,n,kc at line 113
Scalar last value needed after loop for gradflag at line 138
Loop not vectorized/parallelized: potential early exits
112, Loop is parallelizable
Loop unrolled 5 times (completely unrolled)
121, Loop unrolled 5 times (completely unrolled)
125, Loop unrolled 5 times (completely unrolled)
132, Loop is parallelizable
Loop not vectorized: may not be beneficial
Unrolled inner loop 4 times
Generated 3 prefetches in scalar loop
133, Loop is parallelizable
Loop unrolled 5 times (completely unrolled)
139, Loop is parallelizable
140, Loop carried dependence of wp prevents parallelization
Loop carried backward dependence of wp prevents vectorization
Generated vector sse code for the loop
141, Loop is parallelizable
Loop unrolled 3 times (completely unrolled)
144, Loop is parallelizable
Loop unrolled 2 times (completely unrolled)
152, Loop is parallelizable
153, Loop carried dependence of wp prevents parallelization
Loop carried backward dependence of wp prevents vectorization
Generated vector sse code for the loop
154, Loop is parallelizable
Loop unrolled 5 times (completely unrolled)
163, Loop is parallelizable
Loop unrolled 5 times (completely unrolled)
169, Loop is parallelizable
Loop unrolled 5 times (completely unrolled)
181, Loop is parallelizable
Loop unrolled 3 times (completely unrolled)
184, Loop is parallelizable
Loop unrolled 2 times (completely unrolled)
```

The corresponding source code is :

```
#ifdef GM_GPU
89 !$ACC PARALLEL LOOP COLLAPSE(3)
90 !$ACC& PRIVATE(gradflag,my_iMask,kk,j,i,k,m,n,kc,l,small2,D1,D2,d3,delta,QIP,PHI,WP)
91 #endif
92 DO kk = 1, Nz
93 DO j = 1, Ny
94 DO i = 1, Nx
95 !
96 gradflag = 0
97 !
98 IF (mask(i,j,kk) == -1) THEN
99 my_iMask = my_iMask + 1
100 !
101 D1 = Da * Xa(i)
102 D2 = Db * Yb(j)
103 D3 = Dc * Zc(kk)
104 delta = MIN (D1, D2, D3)
105 small2 = factor * delta
106
107 DO k = 1, ibcDim ! Loop over all neighbours of IP
108 m = INeigh(k,i,j,kk)
109 n = JNeigh(k,i,j,kk)
110 kc = KNeigh(k,i,j,kk)
111 IF ( (dist(k,i,j,kk) <= small2) .AND. (mask(m,n,kc) == 0) ) THEN
112 DO l = 1, 5
113 QIP(l) = Q(m,l,n,kc)
114 END DO
115 GOTO 30
116 END IF
117
118 !************* Fill PHI ******************
119 IF (mask(m,n,kc) /= 0) THEN ! If the neighbour of IP is a solid point
120 gradflag = 1
121 DO l = 1, 5
122 PHI(l,k) = zero
123 END DO
124 ELSE
125 DO l = 1, 5
126 PHI(l,k) = Q(m,l,n,kc)
127 END DO
128 END IF
129 END DO ! DO k = 1, ibcDim
130
131 !********** Finding out Weighting Parameters' values ****************
132 DO k = 1, ibcDim
133 DO l = 1, 5
134 WP(l,k) = zero
135 END DO
136 END DO
137
138 IF (gradflag == 1) THEN
139 DO aa = 1, ibcDim
140 DO k = 1, ibcDim
141 DO l = 1, 3
142 WP(l,aa) = WP(l,aa) + MATINV(aa,k,my_iMask,1) * PHI(l,k)
143 END DO
144 DO l = 4, 5
145 WP(l,aa) = WP(l,aa) + MATINV(aa,k,my_iMask,2) * PHI(l,k)
146 END DO
147 END DO
148 END DO
149 !
150 ELSE
151 !
152 DO aa = 1, ibcDim
153 DO k = 1, ibcDim
154 DO l = 1, 5
155 WP(l,aa) = WP(l,aa) + MATINV(aa,k,my_iMask,1) * PHI(l,k)
156 END DO
157 END DO
158 END DO
159 END IF
160
161 !*********** Calculating Q at IP ********************
162 IF (ibcDim == 4) THEN
163 DO l = 1, 5
164 QIP(l) = WP(l,1) * XIP(i,j,kk) * YIP(i,j,kk) + WP(l,2) * XIP(i,j,kk) + WP(l,3) * YIP(i,j,kk) + WP(l,4)
165 END DO
166 !
167 ELSE
168 !
169 DO l = 1, 5
170 QIP(l) = WP(l,1) * XIP(i,j,kk) * YIP(i,j,kk) * ZIP(i,j,kk) + WP(l,2) * XIP(i,j,kk) * YIP(i,j,kk)
171 & + WP(l,3) * XIP(i,j,kk) * ZIP(i,j,kk) + WP(l,4) * YIP(i,j,kk) * ZIP(i,j,kk)
172 & + WP(l,5) * XIP(i,j,kk) + WP(l,6) * YIP(i,j,kk)
173 & + WP(l,7) * ZIP(i,j,kk) + WP(l,8)
174 END DO
175 !
176 END IF
177
178 !********* Putting appropriate values of Q at the ghost point ************
179 30 CONTINUE
180 !
181 DO l = 1, 3
182 Q(i,l,j,kk) = - QIP(l)
183 END DO
184 DO l = 4, 5
185 Q(i,l,j,kk) = QIP(l)
186 END DO
187 END IF ! IF (mask(i,j,kk) == -1) THEN
188 !
189 END DO
190 END DO
191 END DO
192 #ifdef GM_GPU
193 !$ACC END PARALLEL LOOP
194 #endif
```

First, I do not understand the message “Unexpected flow graph” in my case; I have found only a few references to it.

Next, I do not understand why the messages “scalar last value needed after loop” appear, because the scalar variables m, n, kc, and gradflag should be private — they are all listed in the PRIVATE clause of the PARALLEL LOOP directive.

Moreover, regarding the message about QIP (“CUDA shared memory used for qip”): this array is small (it is declared with dimension 5, see below) and it is also listed in the PRIVATE clause, so I expected it to be private too.

```
17 REAL(rp), DIMENSION(5) :: QIP
```

Thanks for any/all the advice you can give.