Removing Loop Carried Dependencies

Hello,

I am uncertain how to remove the loop dependencies from the following code.

Any help you can provide would be greatly appreciated.

PROGRAM TESTPGI

!**** Purpose
!
! For every (IX,IY) element of P_RAD the vector along the third dimension is
! scaled by the square root of the sum of squares of its positive entries.
! Its dot product with each column of M_RAD is then formed, and the column
! with the largest dot product that is >= FIT (and whose END_MEMBER id is not
! in the exclusion list) is recorded in IEND_MEMBER, DOTPR and IBK0.
! A host-side pass afterwards counts how often each END_MEMBER id was chosen
! and tracks the minimum/maximum dot product per chosen column index.

!**** MODULE Usage

USE CONSTANTSPGI   ! NOTE(review): expected to supply KI4 and EPSMIN4 - confirm

IMPLICIT NONE

REAL , PARAMETER :: FIT=0.90
INTRINSIC SQRT,EXP ! In GPU processing section

!**** Local Variable Declarations

INTEGER, PARAMETER :: NB = 64
INTEGER :: I, J, IBK, IEM, IOS, KK, IX, IY, IWL, OUTERLOOP, OUTERLOOPMAX, NX, NY, NWL, END_MEMBER(0:255,2)
INTEGER , ALLOCATABLE :: IBK0(:)
INTEGER (KIND=KI4), ALLOCATABLE :: IEND_MEMBER(:,:)
REAL, ALLOCATABLE :: P_RAD(:,:,:), M_RAD(:,:), DOTPR(:,:)
REAL :: BL, DOTPR_MATL, DOTPR0, DOTPR_MIN(0:255), DOTPR_MAX(0:255)
! Column 1 holds the id of each entry, column 2 its hit counter (starts at 0).
! Only rows 0..NB are initialized here.
DATA ((END_MEMBER(I,J),J=1,2),I=0,NB) / 0,0, &
1,0, 2,0, 3,0, 4,0, 5,0, 6,0, 7,0, 8,0, 9,0, 10,0, 11,0, 12,0, 13,0, 14,0, 15,0, 16,0, &
17,0, 18,0, 19,0, 20,0, 21,0, 22,0, 23,0, 24,0, 25,0, 26,0, 27,0, 28,0, 29,0, 30,0, 31,0, 32,0, 33,0, &
34,0, 35,0, 36,0, 37,0, 38,0, 39,0, 40,0, 41,0, 42,0, 43,0, 44,0, 45,0, 46,0, 47,0, 48,0, 49,0, 50,0, &
51,0, 52,0, 53,0, 54,0, 55,0, 56,0, 57,0, 58,0, 59,0, 60,0, 61,0, 62,0, 63,0, 64,0/
! 65,0, 66,0, 67,0, 68,0, 69,0, 70,0, 71,0, 72,0, 73,0, &
! 74,0, 75,0, 76,0, 77,0, 78,0, 79,0, 80,0, 81,0/

NX = 753
!NY = 1924
NY = 1400
NWL = 224

OUTERLOOPMAX = NX * NY

! Check every allocation: a failure would otherwise surface later as an
! access to an unallocated array.
ALLOCATE(M_RAD(NWL,NB),STAT=IOS)
IF (IOS /= 0) STOP 'TESTPGI: allocation of M_RAD failed'
ALLOCATE(P_RAD(NX,NY,NWL),STAT=IOS)
IF (IOS /= 0) STOP 'TESTPGI: allocation of P_RAD failed'
ALLOCATE(IEND_MEMBER(NX,NY),DOTPR(NX,NY),STAT=IOS)
IF (IOS /= 0) STOP 'TESTPGI: allocation of IEND_MEMBER/DOTPR failed'
ALLOCATE(IBK0(OUTERLOOPMAX),STAT=IOS)
IF (IOS /= 0) STOP 'TESTPGI: allocation of IBK0 failed'

! Constant test data; whole-array assignment replaces the element loops.
P_RAD = .005 ** 2
M_RAD = .003 ** 2

! Initialize the full 0:255 range: IBK0 can hold 254 or 255 in addition to
! 1..NB, and those entries are read in the statistics pass below.
DOTPR_MIN = 1.0
DOTPR_MAX = -1.0

! Each OUTERLOOP iteration touches only its own (IX,IY) element and its own
! IBK0(OUTERLOOP), so the outer loop is declared independent. The IBK loop
! inside carries a running maximum (DOTPR_MATL) and therefore runs
! sequentially within each outer iteration.
!$acc region
!$acc do independent
DO OUTERLOOP = 1,OUTERLOOPMAX
   ! Recover the 2-D index from the flattened loop counter (IX varies fastest).
   KK = ( OUTERLOOP - 1 )/NX
   IY = KK+1
   IX = OUTERLOOP - KK * NX

   DOTPR(IX,IY) = 0

   ! Sum of squares over the positive entries only.
   BL = 0
   DO IWL = 1, NWL
      IF ( P_RAD(IX,IY,IWL) > 0.0 ) BL=BL + P_RAD(IX,IY,IWL)**2
   END DO

   ! 255 marks a sum of squares below EPSMIN4 (BL is forced to 1.0 so the
   ! division below is safe); 254 is the default until a match is found.
   IF ( BL < EPSMIN4 ) THEN
      IEND_MEMBER(IX,IY)=255
      IBK0(OUTERLOOP)=255
      BL = 1.0
   ELSE
      IEND_MEMBER(IX,IY)=254
      IBK0(OUTERLOOP)=254
   END IF
   BL=SQRT(BL)

   DO IWL = 1,NWL
      P_RAD(IX,IY,IWL)=P_RAD(IX,IY,IWL)/BL
   END DO

   ! Keep the column with the largest qualifying dot product.
   DOTPR_MATL=0.0
   DO IBK = 1,NB
      IEM=END_MEMBER(IBK,1)
      DOTPR0=0.0
      DO IWL = 1,NWL
         DOTPR0=DOTPR0+P_RAD(IX,IY,IWL)*M_RAD(IWL,IBK)
      END DO

      IF (DOTPR0>=FIT.AND.DOTPR0>DOTPR_MATL &
         .AND.IEM/=39.AND.IEM/=49.AND.IEM/=17.AND.IEM/=12.AND.IEM/=13.AND.IEM/=16.AND.IEM/=18) THEN
         DOTPR_MATL=DOTPR0
         IEND_MEMBER(IX,IY)=END_MEMBER(IBK,1)
         DOTPR(IX,IY)=DOTPR0
         IBK0(OUTERLOOP)=IBK
      END IF
   END DO
END DO
!$acc end region

! Host-side statistics: hit counters per id and min/max dot product per
! selected column index.
DO OUTERLOOP = 1,OUTERLOOPMAX
   KK = ( OUTERLOOP - 1 )/NX
   IY = KK+1
   IX = OUTERLOOP - KK * NX
   IF (IEND_MEMBER(IX,IY)==0) END_MEMBER(0,2)=END_MEMBER(0,2)+1
   ! Loop bound is NB: the original bound (NBCKGND) was never defined.
   DO IBK = 1,NB
      IF (IEND_MEMBER(IX,IY)==END_MEMBER(IBK,1)) END_MEMBER(IBK,2)=END_MEMBER(IBK,2)+1
   END DO
   IF (DOTPR(IX,IY)<DOTPR_MIN(IBK0(OUTERLOOP))) DOTPR_MIN(IBK0(OUTERLOOP))=DOTPR(IX,IY)
   IF (DOTPR(IX,IY)>DOTPR_MAX(IBK0(OUTERLOOP))) DOTPR_MAX(IBK0(OUTERLOOP))=DOTPR(IX,IY)
END DO

DEALLOCATE(P_RAD,M_RAD,IEND_MEMBER,DOTPR,IBK0,STAT=IOS)

END PROGRAM TESTPGI

testpgi:
57, Generating copyout(ibk0(1:1054200))
Generating copy(p_rad(:,:,1:224))
Generating copyin(m_rad(1:224,1:64))
Generating copyout(iend_member(:,:))
Generating copyout(dotpr(:,:))
Generating copyin(end_member(1:64,1))
Generating compute capability 1.0 binary
Generating compute capability 1.3 binary
59, Loop is parallelizable
Accelerator kernel generated
59, !$acc do parallel, vector(256)
Using register for ‘ibk0’
CC 1.0 : 21 registers; 20 shared, 200 constant, 0 local memory bytes; 33 occupancy
CC 1.3 : 21 registers; 20 shared, 200 constant, 0 local memory bytes; 50 occupancy
67, Loop is parallelizable
81, Loop is parallelizable
86, Loop carried scalar dependence for ‘dotpr_matl’ at line 93
Complex loop carried dependence of ‘iend_member’ prevents parallelization
Loop carried reuse of ‘iend_member’ prevents parallelization
Complex loop carried dependence of ‘dotpr’ prevents parallelization
Loop carried reuse of ‘dotpr’ prevents parallelization
Loop carried reuse of ‘ibk0’ prevents parallelization
89, Loop is parallelizable

Hi Pebbles,

I am uncertain how to remove the loop dependencies from the following code.

You don’t need to remove these, since the iterations of the outer parallel loop are independent. These dependences just prevent the inner loops from being accelerated, which you wouldn’t want here anyway.

  • Mat