# Fortran OMP Parallel Merge performance

Hi, I am a simulation engineer who has ended up having to write a parallel merge function.

I started with this base code
http://rosettacode.org/wiki/Sorting_algorithms/Merge_sort

I followed the work of 2 papers
which explains about using sections and using a binary sort as well

This one covers how to make the merges happen in parallel and how to make the algorithm cache aware, which I think I failed to do.

In theory, following the papers, it should be possible to create a program whose running time gets very near to O((N/p) · log N) for large values of N.

When I run the program built with Intel compiler 11.038, I get about a 2x speed-up on a Windows 7, 6-core computer.

When I compile this with PGI compiler 13.4, I get about a 2x slowdown on a Windows 7, 16-core computer.

this is even when dealing with a n of 10^8 = 100,000,000.

I find that the cache aware version on the Intel version of the merger is required for speed up.

Using the non-cache-aware version of the merger (parallel or serial), I see about a 1.5x time increase on PGI.

I want to understand how I can improve this and at least get the same benefit I see with the Intel compiler, if not better (which, from the papers, I presume should be possible for large N).

``````!Works with: Fortran version 90 and later

SUBROUTINE Merger_serial(A,NA,B,NB,C,NC)

! Merge the sorted runs A(1:NA) and B(1:NB) into C(1:NC).
! Written for the overlay trick used by the merge sort: B may alias
! C(NA+1:NC), so once A is exhausted the remaining tail of B already
! sits in its final place inside C and is deliberately NOT copied.

IMPLICIT NONE

integer, intent(in) :: NA,NB,NC         ! Normal usage: NA+NB = NC
integer, intent(in out) :: A(NA)        ! B overlays C(NA+1:NC)
integer, intent(in)     :: B(NB)
integer, intent(in out) :: C(NC)

integer :: ia, ib, ic

ia = 1
ib = 1
ic = 1
DO WHILE (ia <= NA)               ! done as soon as A is drained
   IF (ib > NB) THEN              ! B drained: flush the rest of A
      C(ic) = A(ia)
      ia = ia + 1
   ELSE IF (A(ia) <= B(ib)) THEN  ! stable: ties taken from A first
      C(ic) = A(ia)
      ia = ia + 1
   ELSE
      C(ic) = B(ib)
      ib = ib + 1
   END IF
   ic = ic + 1
END DO
RETURN

END SUBROUTINE Merger_serial

SUBROUTINE  Merger_Parrallel(A,NA,B,NB,C,NC,p)
!\$ use omp_lib

! Parallel merge of sorted runs A(1:NA) and B(1:NB) into C(1:NC).
! Each thread binary-searches along a "merge path" diagonal (the
! merge-made-simple technique the post refers to) for its starting
! coordinates (Mid into A, j into B), then merges its slice of C
! sequentially.
!
! NOTE(review): as pasted this routine cannot work:
!   * k_start and k_end are read below but never assigned -- the lines
!     that partition C among the p threads appear to have been lost.
!   * The OMP DO directive is not immediately followed by a DO loop,
!     and the END DO just above the END-DO directive has no visible
!     matching DO -- an outer worksharing loop (likely over thread
!     chunks) seems to be missing from the paste.

IMPLICIT NONE

INTEGER, INTENT(in) :: NA,NB,NC,p        ! Normal usage: NA+NB = NC
INTEGER, INTENT(in out) :: A(NA)        ! B overlays C(NA+1:NC)
INTEGER, INTENT(in out) :: B(NB) !needs in out other wise is only a pionter and gets over written
INTEGER, INTENT(in out) :: C(NC)
INTEGER :: k, k_start, k_end, j, low, high, mid, threadID ,mid_past,j_past

!search a long the diagonal see merge made simple

!\$OMP PARALLEL PRIVATE(k, k_start, k_end, j, low, high, mid, threadID, mid_past, j_past)

!\$OMP DO

! Locate this thread's split point: after the search, Mid elements of A
! and j elements of B precede position k_start of C.
IF(k_start == 0)THEN
Mid = 0
j = 0
ELSE
! Clamp the binary-search window so Mid stays inside A and j inside B.
Low = MAX((k_start-NB),0)
High = MIN(k_start,NA)
Mid = Low + ((High-Low)*0.5)  ! real midpoint truncated to integer
j = (k_start - Mid) + 1  !diagonal
COMPARE: DO WHILE (Low<High)
IF(MID == 0) THEN
IF(a(Mid+1) > b(j-1)) THEN
EXIT COMPARE !the turning spot
ELSE
Low = Mid+1; ! search UP
END IF
ELSE IF(j > NB ) THEN
EXIT COMPARE
ELSE IF (a(Mid) > b(j)) THEN
High = Mid; !search Down
ELSE
IF(a(Mid+1) > b(j-1)) THEN
EXIT COMPARE !the turning spot
ELSE
Low = Mid+1; ! search UP
END IF
END IF
Mid = Low + ((High - Low)*0.5) ! next search
j = (k_start - Mid) + 1  !diagonal
IF(Mid == 0 .or. j == 0 ) EXIT COMPARE
!
END DO COMPARE
J = J -1
END IF
! Sequential merge of this thread's slice C(k_start+1:k_end).
DO k= k_start+1, k_end !k is starting piont
IF (mid == na) THEN
j = j+1
C(k) = b(j)
ELSEIF (j == nb) THEN
Mid = Mid+1
C(k) = a(Mid)
ELSEIF (a(Mid+1) > b(j+1)) THEN
j = j+1
C(k) = b(j)
ELSE
Mid = Mid+1
C(k) = a(Mid)
END IF
END DO
mid_past = mid
j_past = j
END DO
!\$OMP END DO
!\$OMP END PARALLEL
END SUBROUTINE
!(T,		NA,		T(NA+1),NB,		A,		N,	Threads)
! NOTE(review): the SUBROUTINE statement and most declarations for this
! routine are missing from the paste (the comment above looks like its
! intended argument list).  NB_load, NA, NB, NC, Segments and Threads
! are referenced below but never declared or assigned in what is shown,
! and IMPLICIT NONE is commented out -- this block cannot compile as-is.
! Intent (from the comments): a cache-aware variant of the parallel
! merge that processes C in cache-sized segments so the diagonal
! searches stay short.
!\$ use omp_lib

!IMPLICIT NONE

INTEGER, INTENT(in out) :: B_load(NB_load) !needs in out other wise is only a pionter and gets over written
INTEGER:: A(2000), B(2000), C(2000), NA, NB, NC
INTEGER :: k, k_start, k_end, j, low, high, mid, threadID, p
INTEGER :: CACHE_SIZE, Elementsize, Segments, count, L, K_run, Mid_past, j_past
CACHE_SIZE = 32000 !32K
Elementsize = 32/8 !32 bits in standard integer
l = 2000 !CACHE_SIZE / 3
p = Threads !over writen if different
Mid_past = 0
j_past = 0
DO count = 1, Segments !cache aware to prevent long searches
! a = 0
!b = 0
!c = 0
! NOTE(review): both branches of this segment-loading IF are empty --
! the copy-in code appears to have been lost in the paste.
IF(count == 1) THEN !load in values

ELSE

END IF

p =  MIN(NC, Threads) !cant have more threads than stuff to store
!search a long the diagonal see merge made simple

!\$OMP PARALLEL
!\$OMP DO PRIVATE(k, k_start, k_end,  low, high,threadID,Mid,j)
! NOTE(review): as in Merger_Parrallel, k_start, k_end and threadID are
! read before being set, and the END DO near the bottom has no visible
! matching DO -- the worksharing loop header seems to be missing.
IF(k_start == 0)THEN
Mid = 0
j = 0
ELSE
! Diagonal binary search for this thread's split point in A and B.
Low = MAX((k_start-NB),0)
High = MIN(k_start,NA)
Mid = Low + ((High-Low)*0.5)
j = (k_start - Mid) + 1  !diagonal
COMPARE: DO WHILE (Low<High)
IF(MID == 0) THEN
IF(a(Mid+1) > b(j-1)) THEN
EXIT COMPARE !the turning spot
ELSE
Low = Mid+1; ! search UP
END IF
ELSE IF(j > NB ) THEN
IF(a(Mid+1) > b(j-1)) THEN
EXIT COMPARE
ELSE
Low = Mid+1; ! search UP
END IF
!EXIT COMPARE
ELSE IF (a(Mid) > b(j)) THEN
High = Mid; !search Down
ELSE
IF(a(Mid+1) > b(j-1)) THEN
EXIT COMPARE !the turning spot
ELSE
Low = Mid+1; ! search UP
END IF
END IF
Mid = Low + ((High - Low)*0.5) ! next search
j = (k_start - Mid) + 1  !diagonal
IF(Mid == 0 .or. j == 0 ) EXIT COMPARE
!
END DO COMPARE
J = J -1
END IF
k_start =  k_start + 1
k =  k_start
! Sequential merge of this thread's slice of C.
DO  K = k_start,  k_end !k is starting piont
IF (mid == na) THEN !
j = j+1
C(k) = b(j)
ELSEIF (j == nb) THEN
Mid = Mid+1
C(k) = a(Mid)
ELSEIF (a(Mid+1) > b(j+1)) THEN
j = j+1
C(k) = b(j)
ELSE
Mid = Mid+1
C(k) = a(Mid)
END IF
END DO
!may have more to do

IF(threadID == p-1) THEN !because last private dosnt work
Mid_past = Mid
j_past = j
END IF

END DO
!\$OMP END DO

!\$OMP END PARALLEL
END DO
END SUBROUTINE

SUBROUTINE BinarySort(a, n)

! Binary insertion sort of a(1:N) in ascending order.
! For each element a(item), a binary search locates its insertion
! position among the already-sorted prefix a(1:item-1); the elements
! above that position are shifted up one slot and the element dropped in.

IMPLICIT NONE

integer, intent(in) :: N !size of array
integer, dimension(N), intent(in out) :: a !array to be sorted

INTEGER :: item, lo, hi, probe, held

DO item = 2, N
   lo = 1
   hi = item                       ! search window covers the sorted prefix
   probe = lo + (hi - lo) / 2
   search: DO WHILE (lo < hi)
      IF (a(item) > a(probe)) THEN
         lo = probe + 1            ! insertion point lies to the right
      ELSE IF (a(item) < a(probe)) THEN
         hi = probe                ! insertion point lies to the left
      ELSE
         EXIT search               ! equal key: insert right here
      END IF
      probe = lo + (hi - lo) / 2
   END DO search

   IF (probe < item) THEN
      held = a(item)
      a(probe+1:item) = a(probe:item-1)   ! shift block up one slot
      a(probe) = held
   END IF
END DO

END SUBROUTINE BinarySort

RECURSIVE SUBROUTINE MergeSort_serial(A,N,T)

! Top-down merge sort of A(1:N) using scratch array T(1:N).
! For N <= 64 it falls through to binary insertion sort; otherwise it
! sorts each half recursively, then applies two shortcuts before the
! general merge:
!   * A(N) < A(1): every element of the second half is below the whole
!     first half, so the halves are swapped (rotated) via T -- no merge.
!   * A(NA) <= A(NA+1): the halves are already in order -- nothing to do.
! Only the first half is copied into T before merging: Merger_serial
! relies on the second half already sitting in place in A(NA+1:N).

IMPLICIT NONE

integer, INTENT(in) :: N
integer, dimension(N), INTENT(in out) :: A
integer, dimension(N), INTENT (out) :: T

integer :: NA,NB

IF (N <= 64) THEN !64 is considered small I think it depends on cache size
CALL BinarySort (a, n)
RETURN
END IF

NA=(N+1)/2
NB=N-NA

CALL MergeSort_serial(A,NA,T)
! Sequence association: passing element A(NA+1) hands the callee the
! tail of the array starting at NA+1 (same for T(NA+1)).
CALL MergeSort_serial(A(NA+1),NB,T(NA+1))

IF (A(N) < A(1)) THEN !arrays joined wrong way round eg. 56781234
T(1:NA)=A(1:NA)
A(1:NB)=A(NA+1:N)
A(NB+1:N)=T(1:NA)
ELSE IF (A(NA) > A(NA+1)) THEN
T(1:NA)=A(1:NA)
CALL Merger_serial(T,NA,A(NA+1),NB,A,N) !apperntly if you pass A(NA+1) into an array which is meant to be N long it pass the array and the start piont no need to do A(NA+1:n)
END IF
RETURN

END SUBROUTINE MergeSort_serial

!\$ use omp_lib
! NOTE(review): the SUBROUTINE statement for MergeSort_Parrallel and the
! declarations of N, NA, NB and Threads are missing from the paste (the
! END SUBROUTINE comment below names it MergeSort_Parrallel).  The OMP
! SECTIONS are empty -- the two recursive half-sorts they presumably
! contained were lost as well -- and the merge calls are commented out,
! so as shown this routine only sorts when it hits the N <= 64 or
! Threads == 1 fallbacks.

IMPLICIT NONE

INTEGER, dimension(N), INTENT(in out) :: A
INTEGER, dimension(N), INTENT (out) :: T

IF (N <= 64) THEN !64 is considered small I think it depends on cache size
CALL BinarySort (A, N)
RETURN
END IF

IF(Threads == 1) THEN ! switch over
CALL MergeSort_serial(A,N,T)
RETURN
END IF

NA=(N+1)/2
NB=N-NA
! Each SECTION was presumably meant to sort one half concurrently.
!\$OMP PARALLEL
!\$OMP SECTIONS
!\$OMP SECTION
!\$OMP SECTION
!\$OMP END SECTIONS
!\$OMP END PARALLEL

IF ( A(NA) > A(NA+1) ) THEN
T=A
! WRITE(6,*) " merge"
! call Merger_serial(T,NA,T(NA+1),NB,A,N)
!CALL Merger_Parrallel(T,NA,T(NA+1),NB,A,N,Threads) !apperntly if you pass an array A(NA+1) to another array it is the same as A(NA+1:)
END IF
RETURN

END SUBROUTINE MergeSort_Parrallel

!\$ use omp_lib
! NOTE(review): as with MergeSort_Parrallel above, the SUBROUTINE
! statement and the declarations of N, NA, NB and Threads are missing
! from the paste.  Additionally, the END-PARALLEL directive below has no
! matching PARALLEL directive in view, the SECTIONS are empty, and the
! merge calls are commented out -- this block cannot compile or sort
! large arrays as shown.

IMPLICIT NONE

INTEGER, dimension(N), INTENT(in out) :: A
INTEGER, dimension(N), INTENT (out) :: T

IF (N <= 64) THEN !64 is considered small I think it depends on cache size
CALL BinarySort (a, n)
RETURN
END IF

IF(Threads == 1) THEN ! switch over
CALL MergeSort_serial(A,N,T)
RETURN
ELSE
NA=(N+1)/2
NB=N-NA
! Each SECTION was presumably meant to sort one half concurrently.
!\$OMP SECTIONS
!\$OMP SECTION
!\$OMP SECTION
!\$OMP END SECTIONS
!\$OMP END PARALLEL
IF ( A(NA) > A(NA+1) ) THEN
T=A
!call Merger_serial(T,NA,T(NA+1),NB,A,N)
!CALL Merger_Parrallel(T,NA,T(NA+1),NB,A,N,Threads) !apperntly if you pass an array A(NA+1) to another array it is the same as A(NA+1:)
END IF
RETURN
END IF
END SUBROUTINE MergeSort_Parrallel_MAIN

program TestMergeSort
! Driver: validates BinarySort / MergeSort_serial against each other on
! a small fixed array, then times the serial (and, once re-enabled, the
! parallel) sort on random arrays of 10**3 .. 10**8 elements.
!
! Fixes relative to the posted version:
!   * the labelled GOTO timing loop is now a plain DO loop;
!   * the PAUSE statement (deleted from the Fortran standard since F95)
!     is replaced by the contained wait_for_enter() prompt;
!   * the misleading "between 0 and 10" comment on the random fill is
!     corrected (values are in [0, 999999]).

!\$ use omp_lib

IMPLICIT NONE

integer :: N = 10, maxium_threads, power, i, DEBUG
integer, Allocatable :: test(:), A(:),B(:),C(:), T(:)
REAL :: rnd, Starttime, endtime
LOGICAL:: parallel = .false.

Allocate(test(N), A(N),B(N),C(N), T(N))

!\$ call omp_set_nested(.true.)

!\$OMP PARALLEL
! NOTE(review): this always reports 1 thread; the run that printed
! "parallel activated using 4" presumably assigned
! OMP_GET_NUM_THREADS() here -- confirm against the original source.
maxium_threads =  1 !6 type 6 for testing
!\$OMP END PARALLEL
IF( maxium_threads > 1 ) WRITE(6,*) "parallel activated using " , maxium_threads

test= (/190692,67165,800084,797314,636830,588782,159065,320647,448176,657950/)

WRITE(6,*)'test :',test

a = test
CALL BinarySort(A,N)
WRITE(6,*)'Sorted array BinarySort :',A

b = test
CALL MergeSort_serial(b,N,T)
WRITE(6,*)'Sorted array, serial_hybrid :',b

c = test
! NOTE(review): no parallel sort is called here, so c is still the raw
! input; the comparison below will flag mismatches until the call is
! restored.
WRITE(6,*)'Sorted array, parrallel_hybrid :',c

DO I = 1, n
IF(c(i) .NE. b(i)) THEN
WRITE(6,*) "error found" , i
CALL wait_for_enter()
END IF
END DO

CALL wait_for_enter()

! Timing sweep over problem sizes 10**3 .. 10**8 (was a labelled GOTO loop).
DO power = 3, 8

n = 10** power
WRITE (6,*) "Timing_test for bodies", N , power
DEALLOCATE(test, A,B,C,T)
ALLOCATE(test(N), A(N),B(N),C(N), T(N))

DO I = 1, n
CALL RANDOM_NUMBER (HARVEST  = rnd)
test(i) = INT(rnd * 1000000) ! random value in [0, 999999]
END DO

WRITE(6,*) "Generate the test"

b = test
CALL CPU_TIME (Starttime)
!\$ Starttime = OMP_get_wtime()
CALL MergeSort_serial(b,N,T)
CALL CPU_TIME (Endtime)
!\$ Endtime = OMP_get_wtime()
WRITE(6,*)'Sorted array MergeSort_serial in', Endtime - Starttime

c = test
CALL CPU_TIME (Starttime)
!\$ Starttime = OMP_get_wtime()
! NOTE(review): the parallel sort call is missing here -- this interval
! times nothing, so the "parrallel_hybrid" figure is meaningless until
! the call is restored.
CALL CPU_TIME (Endtime)
!\$ Endtime = OMP_get_wtime()

WRITE(6,*)'Sorted array parrallel_hybrid in', Endtime - Starttime

DO I = 1, n
IF(c(i) .NE. b(i)) THEN
WRITE(6,*) "error found" , i
CALL wait_for_enter()
END IF
END DO

END DO

contains

! Portable stand-in for the deleted PAUSE statement: prompt and block
! until the user presses Enter (EOF is tolerated via iostat).
subroutine wait_for_enter()
character(len=1) :: ch
integer :: ios
write(*,'(a)') 'Press <Enter> to continue'
read(*,'(a)', iostat=ios) ch
end subroutine wait_for_enter

end program TestMergeSort
``````

Hi Marc,

Did you set OMP_NUM_THREADS or compile with “-mp=allcores”? I see about a 2x speed-up using 4 threads on a Sandy-Bridge system. Otherwise, we default to using a single thread while Intel defaults to using all available cores.

• Mat
``````PGI\$ pgf90 -fast -mp=allcores -o pgi_merge_omp_win.exe merge.F90
PGI\$ pgi_merge_omp_win.exe
parallel activated using             4
test :       190692        67165       800084       797314       636830
588782       159065       320647       448176       657950
Sorted array BinarySort :        67165       159065       190692       320647
448176       588782       636830       657950       797314       800084
Sorted array, serial_hybrid :        67165       159065       190692
320647       448176       588782       636830       657950       797314
800084
Sorted array, parrallel_hybrid :        67165       159065       190692
320647       448176       588782       636830       657950       797314
800084
FORTRAN PAUSE: enter <return> or <ctrl>d to continue>
Timing_test for bodies         1000            3
Generate the test
Sorted array MergeSort_serial in   8.7395310E-05
Sorted array parrallel_hybrid in   1.5195459E-04
Timing_test for bodies        10000            4
Generate the test
Sorted array MergeSort_serial in   9.5014274E-04
Sorted array parrallel_hybrid in   6.3563883E-04
Timing_test for bodies       100000            5
Generate the test
Sorted array MergeSort_serial in   1.1345372E-02
Sorted array parrallel_hybrid in   6.6102371E-03
Timing_test for bodies      1000000            6
Generate the test
Sorted array MergeSort_serial in   0.1325048
Sorted array parrallel_hybrid in   7.0271447E-02
Timing_test for bodies     10000000            7
Generate the test
Sorted array MergeSort_serial in    1.404848
Sorted array parrallel_hybrid in   0.7480912
Timing_test for bodies    100000000            8
Generate the test
Sorted array MergeSort_serial in    14.96873
Sorted array parrallel_hybrid in    8.036676
PGI\$
``````

Thanks, I think you lead me to a solution!

I did have OMP_NUM_THREADS set to 16, and I had also tried 8 threads (the 16-core machine is two E5-2687Ws; I thought that might avoid NUMA problems). After seeing that you got the speed-up with 4 threads, I tried that and got the 2x speed-up too — which I think means it is a false-sharing problem.