Hello,
I am uncertain how to remove the loop dependencies from the following code.
Any help you can provide would be greatly appreciated.
PROGRAM TESTPGI
!**** MODULE Usage
USE CONSTANTSPGI
REAL , PARAMETER :: FIT=0.90
INTRINSIC SQRT,EXP ! In GPU processing section
!**** Local Variable Declarations
INTEGER, PARAMETER :: NB = 64
INTEGER :: I, J, K, KK, IX, IY, IWL, OUTERLOOP, OUTERLOOPMAX, NX, NY, NWL, END_MEMBER(0:255,2)
INTEGER , ALLOCATABLE :: IBK0(:)
INTEGER (KIND=KI4), ALLOCATABLE :: IEND_MEMBER(:,:)
REAL, ALLOCATABLE :: P_RAD(:,:,:), M_RAD(:,:), DOTPR(:,:)
REAL :: EXECTIME(5), BL, DOTPR_MATL, DOTPR0, DOTPR_MIN(0:255), DOTPR_MAX(0:255)
DATA ((END_MEMBER(I,J),J=1,2),I=0,NB) / 0,0, &
1,0, 2,0, 3,0, 4,0, 5,0, 6,0, 7,0, 8,0, 9,0, 10,0, 11,0, 12,0, 13,0, 14,0, 15,0, 16,0, &
17,0, 18,0, 19,0, 20,0, 21,0, 22,0, 23,0, 24,0, 25,0, 26,0, 27,0, 28,0, 29,0, 30,0, 31,0, 32,0, 33,0, &
34,0, 35,0, 36,0, 37,0, 38,0, 39,0, 40,0, 41,0, 42,0, 43,0, 44,0, 45,0, 46,0, 47,0, 48,0, 49,0, 50,0, &
51,0, 52,0, 53,0, 54,0, 55,0, 56,0, 57,0, 58,0, 59,0, 60,0, 61,0, 62,0, 63,0, 64,0/
! 65,0, 66,0, 67,0, 68,0, 69,0, 70,0, 71,0, 72,0, 73,0, &
! 74,0, 75,0, 76,0, 77,0, 78,0, 79,0, 80,0, 81,0/
NX = 753
!NY = 1924
NY = 1400
NWL = 224
OUTERLOOPMAX = NX * NY
ALLOCATE(M_RAD(NWL,NB),STAT=IOS)
ALLOCATE(P_RAD(NX,NY,NWL),STAT=IOS)
ALLOCATE(IEND_MEMBER(NX,NY),DOTPR(NX,NY),STAT=IOS)
ALLOCATE(IBK0(OUTERLOOPMAX),STAT=IOS)
DO I = 1,NWL
DO J = 1,NX
DO K = 1,NY
P_RAD(J,K,I) = .005 ** 2
END DO
END DO
END DO
DO I = 1,NWL
DO J = 1,NB
M_RAD(I,J) = .003 ** 2
END DO
END DO
DO IBK = 1,NB
DOTPR_MIN(IBK) = 1.0
DOTPR_MAX(IBK) = -1.0
END DO
!$acc region
!$acc do independent
DO OUTERLOOP = 1,OUTERLOOPMAX
KK = ( OUTERLOOP - 1 )/NX
IY = KK+1
IX = OUTERLOOP - KK * NX
DOTPR(IX,IY) = 0
BL = 0
DO IWL = 1, NWL
IF ( P_RAD(IX,IY,IWL) > 0.0 ) BL=BL + P_RAD(IX,IY,IWL)**2
END DO
IF ( BL < EPSMIN4 ) THEN
IEND_MEMBER(IX,IY)=255
IBK0(OUTERLOOP)=255
BL = 1.0
ELSE
IEND_MEMBER(IX,IY)=254
IBK0(OUTERLOOP)=254
END IF
BL=SQRT(BL)
DO IWL = 1,NWL
P_RAD(IX,IY,IWL)=P_RAD(IX,IY,IWL)/BL
END DO
DOTPR_MATL=0.0
DO IBK = 1,NB
IEM=END_MEMBER(IBK,1)
DOTPR0=0.0
DO IWL = 1,NWL
DOTPR0=DOTPR0+P_RAD(IX,IY,IWL)*M_RAD(IWL,IBK)
END DO
IF (DOTPR0>=FIT.AND.DOTPR0>DOTPR_MATL &
.AND.IEM/=39.AND.IEM/=49.AND.IEM/=17.AND.IEM/=12.AND.IEM/=13.AND.IEM/=16.AND.IEM/=18) THEN
DOTPR_MATL=DOTPR0
IEND_MEMBER(IX,IY)=END_MEMBER(IBK,1)
DOTPR(IX,IY)=DOTPR0
IBK0(OUTERLOOP)=IBK
END IF
END DO
END DO
!$acc end region
IY = 0
IK = 0
DO OUTERLOOP = 1,OUTERLOOPMAX
KK = ( OUTERLOOP - 1 )/NX
IY = KK+1
IX = OUTERLOOP - KK * NX
IF (IEND_MEMBER(IX,IY)==0) END_MEMBER(0,2)=END_MEMBER(0,2)+1
DO IBK = 1,NBCKGND
IF (IEND_MEMBER(IX,IY)==END_MEMBER(IBK,1)) END_MEMBER(IBK,2)=END_MEMBER(IBK,2)+1
END DO
IF (DOTPR(IX,IY)<DOTPR_MIN(IBK0(OUTERLOOP))) DOTPR_MIN(IBK0(OUTERLOOP))=DOTPR(IX,IY)
IF (DOTPR(IX,IY)>DOTPR_MAX(IBK0(OUTERLOOP))) DOTPR_MAX(IBK0(OUTERLOOP))=DOTPR(IX,IY)
END DO
DEALLOCATE(P_RAD,STAT=IOS)
END PROGRAM TESTPGI
testpgi:
57, Generating copyout(ibk0(1:1054200))
Generating copy(p_rad(:,:,1:224))
Generating copyin(m_rad(1:224,1:64))
Generating copyout(iend_member(:,:))
Generating copyout(dotpr(:,:))
Generating copyin(end_member(1:64,1))
Generating compute capability 1.0 binary
Generating compute capability 1.3 binary
59, Loop is parallelizable
Accelerator kernel generated
59, !$acc do parallel, vector(256)
Using register for ‘ibk0’
CC 1.0 : 21 registers; 20 shared, 200 constant, 0 local memory bytes; 33 occupancy
CC 1.3 : 21 registers; 20 shared, 200 constant, 0 local memory bytes; 50 occupancy
67, Loop is parallelizable
81, Loop is parallelizable
86, Loop carried scalar dependence for ‘dotpr_matl’ at line 93
Complex loop carried dependence of ‘iend_member’ prevents parallelization
Loop carried reuse of ‘iend_member’ prevents parallelization
Complex loop carried dependence of ‘dotpr’ prevents parallelization
Loop carried reuse of ‘dotpr’ prevents parallelization
Loop carried reuse of ‘ibk0’ prevents parallelization
89, Loop is parallelizable