Hi there,

I have three quite demanding loops (pid2, pid3 and pid4), each running up to ~2000 (counter1), and a loop inside a subroutine which runs from 1 to 7:

```
! Sweep over all ordered particle triples (pid2, pid3, pid4) with pairwise
! distinct indices.  The point (x1, y1, z1) and the squared cutoff rc2 are set
! outside this loop nest and are only read here.  For every triple whose 2-3,
! 2-4 and 3-4 squared separations are all below rc2, ptt is called; counter4
! counts those calls and erg accumulates DDDD_4T_temp(7).
!
! Parallel mapping:
!   * gang   : the collapsed (pid2, pid3) iteration space.
!   * worker : the pid4 loop.  ptt is described as "!$acc routine vector", and
!              a vector routine must not be called from inside a vector loop,
!              so worker is the finest level allowed at this call site.
!   * vector : the "do i = 1, 7" loop inside ptt.
! NOTE(review): with only 7 iterations in ptt most vector lanes stay idle.
! Consider vector_length(32) here, or making ptt "routine seq" and turning the
! pid4 loop into a vector loop -- confirm against the profiler.
!$acc parallel loop gang collapse(2) reduction(+:erg,counter4)
do pid2 = 1, counter1
  do pid3 = 1, counter1
    ! Reject the degenerate pair before doing any distance work.
    if (pid3 == pid2) cycle

    ! Terms depending on pid2 only.  They are recomputed per (pid2, pid3) pair
    ! because collapse(2) requires the two loops to be tightly nested.
    x2 = part_pos_rc1(1,pid2)
    y2 = part_pos_rc1(2,pid2)
    z2 = part_pos_rc1(3,pid2)
    r12x = x2 - x1
    r12y = y2 - y1
    r12z = z2 - z1
    r12_2 = r12x*r12x + r12y*r12y + r12z*r12z
    r12 = sqrt(r12_2)

    x3 = part_pos_rc1(1,pid3)
    y3 = part_pos_rc1(2,pid3)
    z3 = part_pos_rc1(3,pid3)
    r23x = x3 - x2
    r23y = y3 - y2
    r23z = z3 - z2
    r23_2 = r23x*r23x + r23y*r23y + r23z*r23z

    if (r23_2 .lt. rc2) then
      r23 = sqrt(r23_2)

      ! erg and counter4 are updated inside this loop, so the reduction has to
      ! be repeated here and not only on the enclosing gang loop.
      ! All four *_temp arrays are filled by ptt; arrays are shared by default,
      ! so each one needs its own copy per worker to avoid a data race.
      !$acc loop worker reduction(+:erg,counter4) &
      !$acc private(CCCC_4T_temp, DDDD_4T_temp, DDDD_5T_temp, DDDD_4u5T_temp)
      do pid4 = 1, counter1
        if (pid4 == pid3 .or. pid4 == pid2) cycle

        r24x = part_pos_rc1(1,pid4) - x2
        r24y = part_pos_rc1(2,pid4) - y2
        r24z = part_pos_rc1(3,pid4) - z2
        r24_2 = r24x*r24x + r24y*r24y + r24z*r24z
        if (r24_2 .ge. rc2) cycle

        r34x = part_pos_rc1(1,pid4) - x3
        r34y = part_pos_rc1(2,pid4) - y3
        r34z = part_pos_rc1(3,pid4) - z3
        r34_2 = r34x*r34x + r34y*r34y + r34z*r34z

        if (r34_2 .lt. rc2) then
          r34 = sqrt(r34_2)
          r41x = x1 - part_pos_rc1(1,pid4)
          r41y = y1 - part_pos_rc1(2,pid4)
          r41z = z1 - part_pos_rc1(3,pid4)
          r41_2 = r41x*r41x + r41y*r41y + r41z*r41z
          r41 = sqrt(r41_2)

          ! Calculates CCCC_4T_temp(1:7), ..., DDDD_4u5T_temp(1:7).  The
          ! subroutine carries "!$acc routine vector" and has "!$acc loop
          ! vector" before its own loop (do i = 1, 7 ...).
          call ptt (r12, r23, r34, r41, &
                    r12_2, r23_2, r34_2, r41_2, &
                    r12x, r23x, r34x, r41x, &
                    r12y, r23y, r34y, r41y, &
                    r12z, r23z, r34z, r41z, &
                    CCCC_4T_temp(1:7), &
                    DDDD_4T_temp(1:7), DDDD_5T_temp(1:7), DDDD_4u5T_temp(1:7) )

          counter4 = counter4 + 1

          ! erg is only used to test this parallelization ...
          erg = erg + DDDD_4T_temp(7)

          ! ... the quantities actually wanted are these array sums:
          ! CCCC_4T(:) = CCCC_4T(:) + CCCC_4T_temp(:)
          ! DDDD_4T(:) = DDDD_4T(:) + DDDD_4T_temp(:)
          ! DDDD_5T(:) = DDDD_5T(:) + DDDD_5T_temp(:)
          ! DDDD_4u5T(:) = DDDD_4u5T(:) + DDDD_4u5T_temp(:)
          ! NOTE(review): enabling them needs a reduction (or atomic update)
          ! per array element; whole-array reductions depend on the OpenACC
          ! version the compiler supports -- confirm before uncommenting.
        end if
      end do ! pid4
    end if
  end do ! pid3
end do ! pid2
!$acc end parallel loop
```

Furthermore, here is some information from the compiler:

launch CUDA kernel:

line=591 device=0 threadid=1 num_gangs=65535 num_workers=1 vector_length=32 grid=65535 block=32 shared memory=2048

launch CUDA kernel:

line=591 device=0 threadid=1 num_gangs=4 num_workers=1 vector_length=256 grid=4 block=256 shared memory=2048

Here is also the output of the visual profiler:

The main problem I have is a very low speed-up. Do you have any ideas for improving the efficiency?

Thank you very much in advance!