Hi,
I have some questions about the following routine, to which I have added OpenACC directives.
subroutine hecmw_precond_SSOR_33_apply(ZP)
   !use hecmw_tuning_fx
   implicit none
   real(kind=kreal), intent(inout) :: ZP(:)
   integer(kind=kint) :: ic, i, iold, j, isL, ieL, isU, ieU, k
   real(kind=kreal) :: SW1, SW2, SW3, X1, X2, X3
   !call start_collection("loopInPrecond33")
!OCL CACHE_SECTOR_SIZE(sectorCacheSize0,sectorCacheSize1)
!OCL CACHE_SUBSECTOR_ASSIGN(ZP)
!$acc parallel loop private(SW1,SW2,SW3,X1,X2,X3,ic,i,iold,isL,ieL,isU,ieU,j,k) seq
   !C-- FORWARD
   do ic=1,NColor
!$acc loop worker independent
     do i=COLORindex(ic-1)+1, COLORindex(ic)
         iold = perm(i)
         SW1= ZP(3*iold-2)
         SW2= ZP(3*iold-1)
         SW3= ZP(3*iold )
         isL= indexL(i-1)+1
         ieL= indexL(i)
         !$acc loop vector independent
         do j= isL, ieL
           !k= perm(itemL(j))
           k= itemL(j)
           X1= ZP(3*k-2)
           X2= ZP(3*k-1)
           X3= ZP(3*k )
           SW1= SW1 - AL(9*j-8)*X1 - AL(9*j-7)*X2 - AL(9*j-6)*X3
           SW2= SW2 - AL(9*j-5)*X1 - AL(9*j-4)*X2 - AL(9*j-3)*X3
           SW3= SW3 - AL(9*j-2)*X1 - AL(9*j-1)*X2 - AL(9*j )*X3
         enddo ! j
         if (NContact.ne.0) then
           isL= indexCL(i-1)+1
           ieL= indexCL(i)
          !$acc loop vector independent
           do j= isL, ieL
             !k= perm(itemCL(j))
             k= itemCL(j)
             X1= ZP(3*k-2)
             X2= ZP(3*k-1)
             X3= ZP(3*k )
             SW1= SW1 - CAL(9*j-8)*X1 - CAL(9*j-7)*X2 - CAL(9*j-6)*X3
             SW2= SW2 - CAL(9*j-5)*X1 - CAL(9*j-4)*X2 - CAL(9*j-3)*X3
             SW3= SW3 - CAL(9*j-2)*X1 - CAL(9*j-1)*X2 - CAL(9*j )*X3
           enddo ! j
         endif
         X1= SW1
         X2= SW2
         X3= SW3
         X2= X2 - ALU(9*i-5)*X1
         X3= X3 - ALU(9*i-2)*X1 - ALU(9*i-1)*X2
         X3= ALU(9*i )* X3
         X2= ALU(9*i-4)*( X2 - ALU(9*i-3)*X3 )
         X1= ALU(9*i-8)*( X1 - ALU(9*i-6)*X3 - ALU(9*i-7)*X2)
         ZP(3*iold-2)= X1
         ZP(3*iold-1)= X2
         ZP(3*iold )= X3
     enddo ! i
   enddo ! ic
!$acc end parallel
   !C-- BACKWARD
!$acc parallel loop private(SW1,SW2,SW3,X1,X2,X3,ic,i,iold,isL,ieL,isU,ieU,j,k) seq
   do ic=NColor, 1, -1
!$acc loop worker independent
     do i=COLORindex(ic), COLORindex(ic-1)+1, -1
         SW1= 0.d0
         SW2= 0.d0
         SW3= 0.d0
         isU= indexU(i-1) + 1
         ieU= indexU(i)
         !$acc loop vector independent
         do j= ieU, isU, -1
           !k= perm(itemU(j))
           k= itemU(j)
           X1= ZP(3*k-2)
           X2= ZP(3*k-1)
           X3= ZP(3*k )
           SW1= SW1 + AU(9*j-8)*X1 + AU(9*j-7)*X2 + AU(9*j-6)*X3
           SW2= SW2 + AU(9*j-5)*X1 + AU(9*j-4)*X2 + AU(9*j-3)*X3
           SW3= SW3 + AU(9*j-2)*X1 + AU(9*j-1)*X2 + AU(9*j )*X3
         enddo ! j
         if (NContact.gt.0) then
           isU= indexCU(i-1) + 1
           ieU= indexCU(i)
           !$acc loop vector independent
           do j= ieU, isU, -1
             !k= perm(itemCU(j))
             k= itemCU(j)
             X1= ZP(3*k-2)
             X2= ZP(3*k-1)
             X3= ZP(3*k )
             SW1= SW1 + CAU(9*j-8)*X1 + CAU(9*j-7)*X2 + CAU(9*j-6)*X3
             SW2= SW2 + CAU(9*j-5)*X1 + CAU(9*j-4)*X2 + CAU(9*j-3)*X3
             SW3= SW3 + CAU(9*j-2)*X1 + CAU(9*j-1)*X2 + CAU(9*j )*X3
           enddo ! j
         endif
         X1= SW1
         X2= SW2
         X3= SW3
         X2= X2 - ALU(9*i-5)*X1
         X3= X3 - ALU(9*i-2)*X1 - ALU(9*i-1)*X2
         X3= ALU(9*i )* X3
         X2= ALU(9*i-4)*( X2 - ALU(9*i-3)*X3 )
         X1= ALU(9*i-8)*( X1 - ALU(9*i-6)*X3 - ALU(9*i-7)*X2)
         iold = perm(i)
         ZP(3*iold-2)= ZP(3*iold-2) - X1
         ZP(3*iold-1)= ZP(3*iold-1) - X2
         ZP(3*iold )= ZP(3*iold ) - X3
     enddo ! i
   enddo ! ic
!$acc end parallel
!OCL END_CACHE_SUBSECTOR
!OCL END_CACHE_SECTOR_SIZE
   !call stop_collection("loopInPrecond33")
 end subroutine hecmw_precond_SSOR_33_apply
This code leads to the following output from the compiler:
hecmw_precond_ssor_33_apply:
   242, Accelerator kernel generated
        Generating Tesla code
       247, !$acc loop seq
       251, !$acc loop worker(4) ! threadidx%y
       259, !$acc loop vector(32) ! threadidx%x
            Vector barrier inserted due to potential dependence out of a vector loop
            Vector barrier inserted to share data across vector lanes
       265, Generating implicit reduction(+:sw1)
       266, Generating implicit reduction(+:sw2)
       267, Generating implicit reduction(+:sw3)
       274, !$acc loop vector(32) ! threadidx%x
            Vector barrier inserted due to potential dependence out of a vector loop
            Vector barrier inserted to share data across vector lanes
       280, Generating implicit reduction(+:sw1)
       281, Generating implicit reduction(+:sw2)
       282, Generating implicit reduction(+:sw3)
       284, Vector barrier inserted to share data across vector lanes
       378, Vector barrier inserted due to potential dependence into a vector loop
   242, Generating implicit copyin(cal(:),itemcl(:))
        Generating implicit copy(zp(:))
        Generating implicit copyin(indexcl(:),perm(:),iteml(:),alu(:),indexl(:),colorindex(0:ncolor),al(:))
   247, Loop carried dependence due to exposed use of zp(:) prevents parallelization
   251, Loop is parallelizable
   259, Loop is parallelizable
   274, Loop is parallelizable
   307, Accelerator kernel generated
        Generating Tesla code
       301, Vector barrier inserted due to potential dependence into a vector loop
       310, !$acc loop seq
       314, !$acc loop worker(4) ! threadidx%y
       321, !$acc loop vector(32) ! threadidx%x
            Vector barrier inserted due to potential dependence out of a vector loop
            Vector barrier inserted to share data across vector lanes
       327, Generating implicit reduction(+:sw1)
       328, Generating implicit reduction(+:sw2)
       329, Generating implicit reduction(+:sw3)
       336, !$acc loop vector(32) ! threadidx%x
            Vector barrier inserted due to potential dependence out of a vector loop
            Vector barrier inserted to share data across vector lanes
       342, Generating implicit reduction(+:sw1)
       343, Generating implicit reduction(+:sw2)
       344, Generating implicit reduction(+:sw3)
       346, Vector barrier inserted to share data across vector lanes
   307, Generating implicit copyin(cau(:),alu(:))
        Generating implicit copy(zp(:))
        Generating implicit copyin(itemcu(:),indexcu(:),au(:),indexu(:),colorindex(0:ncolor),perm(:),itemu(:))
   310, Loop carried dependence due to exposed use of zp(:) prevents parallelization
   314, Loop is parallelizable
   321, Loop is parallelizable
   336, Loop is parallelizable
During a solver run this routine is called a few thousand times. Despite the parallel execution that the multicoloring makes possible, each call to this code still takes about 196 ms, which is much longer than running it serially on the CPU.
Could there be setup-time overhead in each call to the routine (e.g. the kernel being re-uploaded to the GPU for every call, or similar)?
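One thing I could do to rule out per-call device initialization cost is to pay the device setup once at program start via the OpenACC runtime API. A minimal sketch (the main program name is hypothetical; acc_init and acc_device_nvidia come from the standard openacc module):

   program solver_main                 ! hypothetical main program name
     use openacc
     implicit none
     ! Create the device context once, up front, so the first kernel
     ! launch inside the preconditioner does not absorb that cost.
     call acc_init(acc_device_nvidia)
     ! ... solver setup and iteration loop as before ...
   end program solver_main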
As of now, the data locations are not ideally set up, so data will be copied over the bus between calls (I tried to avoid this by testing the “managed” flag, but that ends in an error, as described in my other thread). Still, the routine itself should run efficiently when looking at its execution time in the profiler, since by the time each kernel executes all needed data should already be in GPU memory. Is that not correct to assume?
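For reference, the direction I plan to take for the data placement is along these lines; a minimal sketch, assuming the matrix arrays are module variables and that a one-time setup routine can be called before the solver iterations (the routine name is hypothetical):

   subroutine hecmw_precond_SSOR_33_setup_device()   ! hypothetical setup routine
     implicit none
     ! Make the read-only preconditioner data resident on the GPU once,
     ! so each apply call only needs to move ZP.
     !$acc enter data copyin(AL, AU, ALU, indexL, indexU, itemL, itemU) &
     !$acc&           copyin(perm, COLORindex)
     if (NContact.ne.0) then
       !$acc enter data copyin(CAL, CAU, indexCL, indexCU, itemCL, itemCU)
     endif
   end subroutine hecmw_precond_SSOR_33_setup_device

With that in place, the parallel constructs in the apply routine could carry present(...) clauses plus copy(ZP), and the implicit copyin lines in the compiler feedback above should disappear.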
Any advice here on how to improve on this? The outer loops have to be sequential, as there is a serial dependence at that level between the colors being handled.
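For completeness, one restructuring I have been wondering about is keeping the color loop on the host and launching one kernel per color, so that the nodes of a color are spread across gangs rather than only 4 workers, and the inner j loops run sequentially per thread (which should remove the implicit reductions and vector barriers). A sketch of the forward sweep only, under those assumptions (the present list presumes the device-resident data from the sketch above):

   !C-- FORWARD (sketch: color loop on the host, one kernel launch per color)
   do ic = 1, NColor
     !$acc parallel loop gang vector &
     !$acc&  private(SW1,SW2,SW3,iold,isL,ieL,j,k) &
     !$acc&  present(ZP, AL, ALU, indexL, itemL, perm, COLORindex)
     do i = COLORindex(ic-1)+1, COLORindex(ic)
       iold = perm(i)
       SW1 = ZP(3*iold-2)
       SW2 = ZP(3*iold-1)
       SW3 = ZP(3*iold  )
       isL = indexL(i-1)+1
       ieL = indexL(i)
       !$acc loop seq
       do j = isL, ieL       ! sequential within one thread: no reduction,
                             ! no vector barriers
         k   = itemL(j)
         SW1 = SW1 - AL(9*j-8)*ZP(3*k-2) - AL(9*j-7)*ZP(3*k-1) - AL(9*j-6)*ZP(3*k)
         SW2 = SW2 - AL(9*j-5)*ZP(3*k-2) - AL(9*j-4)*ZP(3*k-1) - AL(9*j-3)*ZP(3*k)
         SW3 = SW3 - AL(9*j-2)*ZP(3*k-2) - AL(9*j-1)*ZP(3*k-1) - AL(9*j  )*ZP(3*k)
       enddo
       ! ... contact part, diagonal solve with ALU, and write-back as in the original ...
     enddo
   enddo

Would the per-color kernel launch overhead (times NColor, twice per call) be acceptable compared to the current single-kernel version, or is there a better pattern for this?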
Thanks!
Regards,
Olav