Output incorrect when using accelerator

I’m using VS2010 and PGI Visual Fortran 10.8

Current working old code is

      SUBROUTINE NLMMT(TH,IFN,JFN,Z,M,N,DZ,DM,DN,HZ,HM,HN,RX,DT,FM)
C
      PARAMETER   (GX=1.0E-5, GG=9.81)
C
      INTEGER   IFN, JFN
      REAL      TH, Z(IFN,JFN,2), M(IFN,JFN,2), N(IFN,JFN,2)
      REAL      DZ(IFN,JFN,2), DM(IFN,JFN,2), DN(IFN,JFN,2)
      REAL      HZ(IFN,JFN), HM(IFN,JFN), HN(IFN,JFN), RX, DT, FM
	REAL      DM1, DM2, DN1, DN2, FN
C
      DO 10 K=1,2
        DO 10 J=1,JFN
          DO 10 I=1,IFN
            DM(I,J,K)=0.0
            DN(I,J,K)=0.0
   10 CONTINUE   
C
      DO J=1, JFN
         DO I=1, IFN	 
	      IF (I .LT. IFN) THEN
               DM1 = 0.25*(Z(I,J,1)+Z(I,J,2)+Z(I+1,J,1)+Z(I+1,J,2))
     &             + 0.5*(HZ(I,J)+HZ(I+1,J))
               DM2 = 0.5*(Z(I,J,2)+Z(I+1,J,2)+HZ(I,J)+HZ(I+1,J))
            ELSE
               DM1 = Z(I,J,2) + HZ(I,J)
               DM2 = Z(I,J,2) + HZ(I,J)
            END IF
            IF (DM1 .GE. GX) DM(I,J,1) = DM1
            IF (DM2 .GE. GX) DM(I,J,2) = DM2
C
            IF (J .LT. JFN) THEN
               DN1 = 0.25*(Z(I,J,1)+Z(I,J,2)+Z(I,J+1,1)+Z(I,J+1,2))
     &             + 0.5*(HZ(I,J)+HZ(I,J+1))
               DN2 = 0.5*(Z(I,J,2)+Z(I,J+1,2)+HZ(I,J)+HZ(I,J+1))
            ELSE
               DN1 = Z(I,J,2)+HZ(I,J)
               DN2 = Z(I,J,2)+HZ(I,J)
            END IF
            IF (DN1 .GE. GX) DN(I,J,1) = DN1
            IF (DN2 .GE. GX) DN(I,J,2) = DN2
         END DO
      END DO  

      FN=0.5*DT*GG*FM**2
C
C   ------- X-DIRECTION -------
C
      DO J=1, JFN
         DO I=1, IFN
            IF (HZ(I,J).GT.TH .AND. HM(I,J).GT.TH) THEN
               CALL XMMT(GG, I, J, IFN, JFN, HZ, Z, DZ, DM,
     &                                      M, N, RX, FN)
            END IF
C
C   ------- Y-DIRECTION -------
C
            IF (HZ(I,J).GT.TH .AND. HN(I,J).GT.TH) THEN
               CALL YMMT(GG, I, J, IFN, JFN, HZ, Z, DZ, DN,
     &                                      M, N, RX, FN)
            END IF
         END DO
      END DO

      RETURN
      END

This is the code after modification, but it doesn’t work correctly. Although some local variables are changed to arrays, it works correctly without !$acc region.

      SUBROUTINE NLMMT(TH,IFN,JFN,Z,M,N,DZ,DM,DN,HZ,HM,HN,RX,DT,FM)
C
      PARAMETER   (GX=1.0E-5, GG=9.81)
C
      INTEGER   IFN, JFN
      REAL      TH, Z(IFN,JFN,2), M(IFN,JFN,2), N(IFN,JFN,2)
      REAL      DZ(IFN,JFN,2), DM(IFN,JFN,2), DN(IFN,JFN,2)
      REAL      HZ(IFN,JFN), HM(IFN,JFN), HN(IFN,JFN), RX, DT, FM
	REAL      DM1(IFN,JFN), DM2(IFN,JFN), DN1(IFN,JFN), DN2(IFN,JFN), FN
C
!$acc region
      DO 10 K=1,2
        DO 10 J=1,JFN
          DO 10 I=1,IFN
            DM(I,J,K)=0.0
            DN(I,J,K)=0.0
   10 CONTINUE   
C
      DO J=1, JFN
         DO I=1, IFN	 
	      IF (I .LT. IFN) THEN
               DM1(I,J) = 0.25*(Z(I,J,1)+Z(I,J,2)+Z(I+1,J,1)+Z(I+1,J,2))
     &             + 0.5*(HZ(I,J)+HZ(I+1,J))
               DM2(I,J) = 0.5*(Z(I,J,2)+Z(I+1,J,2)+HZ(I,J)+HZ(I+1,J))
            ELSE
               DM1(I,J) = Z(I,J,2) + HZ(I,J)
               DM2(I,J) = Z(I,J,2) + HZ(I,J)
            END IF
C
            IF (J .LT. JFN) THEN
               DN1(I,J) = 0.25*(Z(I,J,1)+Z(I,J,2)+Z(I,J+1,1)+Z(I,J+1,2))
     &             + 0.5*(HZ(I,J)+HZ(I,J+1))
               DN2(I,J) = 0.5*(Z(I,J,2)+Z(I,J+1,2)+HZ(I,J)+HZ(I,J+1))
            ELSE
               DN1(I,J) = Z(I,J,2)+HZ(I,J)
               DN2(I,J) = Z(I,J,2)+HZ(I,J)
            END IF
         END DO
      END DO
      DO J=1, JFN
         DO I=1, IFN	 
	  		IF (DM1(I,J) .GE. GX) DM(I,J,1) = DM1(I,J)
            IF (DM2(I,J) .GE. GX) DM(I,J,2) = DM2(I,J)
            IF (DN1(I,J) .GE. GX) DN(I,J,1) = DN1(I,J)
            IF (DN2(I,J) .GE. GX) DN(I,J,2) = DN2(I,J)
		END DO
	  END DO
!$acc end region

      FN=0.5*DT*GG*FM**2
C
C   ------- X-DIRECTION -------
C
      DO J=1, JFN
         DO I=1, IFN
            IF (HZ(I,J).GT.TH .AND. HM(I,J).GT.TH) THEN
               CALL XMMT(GG, I, J, IFN, JFN, HZ, Z, DZ, DM,
     &                                      M, N, RX, FN)
            END IF
C
C   ------- Y-DIRECTION -------
C
            IF (HZ(I,J).GT.TH .AND. HN(I,J).GT.TH) THEN
               CALL YMMT(GG, I, J, IFN, JFN, HZ, Z, DZ, DN,
     &                                      M, N, RX, FN)
            END IF
         END DO
      END DO

      RETURN
      END

The size of arrays M,N,Z are all 2293416x2293416x2
The main program call this function on only parts of the M,N,Z as around IFN=721 and JFN=841 per call

Hi Eno_Khem,

Can you post a reproduce able example which shows the problem?

The code compiles fine and does create the several device kernels. Though, I did need add “-Mextend” since one line is longer than 72 columns in length.

  • Mat

well, another example is…

      SUBROUTINE INTERQT (IFN,JFN,KK,NT,M,N,M_T,N_T)
C
      INTEGER IFN,JFN,KK,NT
      REAL M(IFN,JFN,2),N(IFN,JFN,2)
      REAL M_T(IFN,JFN,2),N_T(IFN,JFN,2)
C
!$acc region
      DO 10 J=1,JFN
		DO 10 I=1,IFN
			M_T(I,J,1)=M(I,J,1)
			N_T(I,J,1)=N(I,J,1)

			M_T(I,J,2)=M(I,J,1)+(M(I,J,2)-M(I,J,1))*KK/NT
			N_T(I,J,2)=N(I,J,1)+(N(I,J,2)-N(I,J,1))*KK/NT
   10 CONTINUE
!$acc end region
      RETURN
      END

and

      SUBROUTINE INTERQT (IFN,JFN,KK,NT,M,N,M_T,N_T)
C
      INTEGER IFN,JFN,KK,NT
      REAL M(IFN,JFN,2),N(IFN,JFN,2)
      REAL M_T(IFN,JFN,2),N_T(IFN,JFN,2)
C
!$acc region
      DO 10 J=1,JFN
		DO 10 I=1,IFN
			M_T(I,J,1)=M(I,J,1)
			N_T(I,J,1)=N(I,J,1)
   10 CONTINUE
	  DO 20 J=1,JFN
		DO 20 I=1,IFN
			M_T(I,J,2)=M(I,J,1)+(M(I,J,2)-M(I,J,1))*KK/NT
			N_T(I,J,2)=N(I,J,1)+(N(I,J,2)-N(I,J,1))*KK/NT
   20 CONTINUE
!$acc end region
      RETURN
      END

I think these 2 should have same output but they’re not
the 1st subroutine’s output is wrong (ONLY when using accelerator)

1st subroutine’s compiler message

interqt:
   2376, Generating copyout(n_t(1:ifn,1:jfn,1:2))
         Generating copyin(n(1:ifn,1:jfn,1:2))
         Generating copyout(m_t(1:ifn,1:jfn,1:2))
         Generating copyin(m(1:ifn,1:jfn,1:2))
         Generating compute capability 1.0 binary
         Generating compute capability 1.3 binary
   2377, Loop is parallelizable
   2378, Loop is parallelizable
         Accelerator kernel generated
       2377, !$acc do parallel, vector(16)
       2378, !$acc do parallel, vector(16)
             Cached references to size [16x16x2] block of 'n'
             Cached references to size [16x16x2] block of 'm'
             CC 1.0 : 20 registers; 4120 shared, 92 constant, 0 local memory bytes; 33 occupancy
             CC 1.3 : 20 registers; 4120 shared, 92 constant, 0 local memory bytes; 75 occupancy

2nd subroutine’s compiler message

interqt:
   2376, Generating copyin(m(1:ifn,1:jfn,1:2))
         Generating copyout(m_t(1:ifn,1:jfn,1:2))
         Generating copyin(n(1:ifn,1:jfn,1:2))
         Generating copyout(n_t(1:ifn,1:jfn,1:2))
         Generating compute capability 1.0 binary
         Generating compute capability 1.3 binary
   2377, Loop is parallelizable
   2378, Loop is parallelizable
         Accelerator kernel generated
       2377, !$acc do parallel, vector(16)
       2378, !$acc do parallel, vector(16)
             CC 1.0 : 6 registers; 24 shared, 80 constant, 0 local memory bytes; 100 occupancy
             CC 1.3 : 6 registers; 24 shared, 80 constant, 0 local memory bytes; 100 occupancy
   2382, Loop is parallelizable
   2383, Loop is parallelizable
         Accelerator kernel generated
       2382, !$acc do parallel, vector(16)
       2383, !$acc do parallel, vector(16)
             CC 1.0 : 18 registers; 24 shared, 124 constant, 0 local memory bytes; 33 occupancy
             CC 1.3 : 18 registers; 24 shared, 124 constant, 0 local memory bytes; 75 occupancy

well I found that I have to set private clase for variables

!$acc region local(I,J,K)
!$acc do private(J)
      DO 10 K=1,2
!$acc do private(I)
        DO 10 J=1,JFN
          DO 10 I=1,IFN
            DM(I,J,K)=0.0
            DN(I,J,K)=0.0
   10 CONTINUE   
C
!$acc do private(I)
      DO J=1, JFN
!$acc do private(DM1,DM2,DN1,DN2)
         DO I=1, IFN
             ...