Fortran OMP Parrallel Merge preformance

Hi, I am simulation engineer who has ended up having to write a parallel merge functions

I started with this base code
http://rosettacode.org/wiki/Sorting_algorithms/Merge_sort

I followed the work of 2 papers
http://www1.chapman.edu/~radenski/research/papers/mergesort-pdpta11.pdf
which explains about using sections and using a binary sort as well


http://www.cc.gatech.edu/~ogreen3/docs/Merge_Path-_Parallel_Merging_Made_Simple.pdf
This one covers how one to make mergers happen parallel and how to make the algorithm cache aware which i think I failed to do


In theory of the papers it should be possible to create a program which would get very near to O N/p log n for large values of N

When I run the program on Intel compiler 11.038 i get about 2x times speed up on a window 7, 6 core computer.

When I compile this on PGI compiler on 13.4 I get about 2x slow on a window 7, 16 core computer.

this is even when dealing with a n of 10^8 = 100,000,000.

I find that the cache aware version on the Intel version of the merger is required for speed up.

using the not cache aware version parallel or serial merger, see about 1.5x time increase on pgi

I want to understand how i can improve this and at least get the same benefit I see on the Intel compiler if not better (which I this presume from the papers that this should be possible for large N’s)

!Works with: Fortran version 90 and later

SUBROUTINE Merger_serial(A,NA,B,NB,C,NC)
    
    IMPLICIT NONE
     
   integer, intent(in) :: NA,NB,NC         ! Normal usage: NA+NB = NC
   integer, intent(in out) :: A(NA)        ! B overlays C(NA+1:NC)
   integer, intent(in)     :: B(NB)
   integer, intent(in out) :: C(NC)
 
   integer :: I,J,K
 
   I = 1; J = 1; K = 1;
   DO WHILE(I <= NA .and. J <= NB)
      IF (A(I) <= B(J)) THEN
         C(K) = A(I)
         I = I+1
      ELSE
         C(K) = B(J)
         J = J+1
      END IF
      K = K + 1
   END DO
   DO WHILE (I <= NA) !parrall
      C(K) = A(I)
      I = I + 1
      K = K + 1
   END DO
   RETURN
 
END SUBROUTINE Merger_serial

SUBROUTINE  Merger_Parrallel(A,NA,B,NB,C,NC,p)
   !$ use omp_lib
   
   IMPLICIT NONE
   
   
   INTEGER, INTENT(in) :: NA,NB,NC,p        ! Normal usage: NA+NB = NC
   INTEGER, INTENT(in out) :: A(NA)        ! B overlays C(NA+1:NC)
   INTEGER, INTENT(in out) :: B(NB) !needs in out other wise is only a pionter and gets over written
   INTEGER, INTENT(in out) :: C(NC) 
   INTEGER :: k, k_start, k_end, j, low, high, mid, threadID ,mid_past,j_past 
   
   

    !search a long the diagonal see merge made simple   
    
    !$OMP PARALLEL PRIVATE(k, k_start, k_end, j, low, high, mid, threadID, mid_past, j_past)
    
    !$OMP DO
    DO threadID = 0, p-1
    
    k_start = (nc* threadID)/p 
    k_end = (nc*(threadID+1))/p 
    IF(k_start == 0)THEN
        Mid = 0
        j = 0
    ELSE
        Low = MAX((k_start-NB),0) 
	    High = MIN(k_start,NA)
	    Mid = Low + ((High-Low)*0.5)
	    j = (k_start - Mid) + 1  !diagonal
	    COMPARE: DO WHILE (Low<High)
            IF(MID == 0) THEN 
			    IF(a(Mid+1) > b(j-1)) THEN 
				    EXIT COMPARE !the turning spot  
			    ELSE
				    Low = Mid+1; ! search UP
			    END IF
			ELSE IF(j > NB ) THEN
			    EXIT COMPARE 
		    ELSE IF (a(Mid) > b(j)) THEN
		        High = Mid; !search Down
	        ELSE 
	            IF(a(Mid+1) > b(j-1)) THEN 
			        EXIT COMPARE !the turning spot  
			    ELSE
			        Low = Mid+1; ! search UP
			    END IF
		    END IF 
		    Mid = Low + ((High - Low)*0.5) ! next search
		    j = (k_start - Mid) + 1  !diagonal
		    IF(Mid == 0 .or. j == 0 ) EXIT COMPARE 
		    !
	    END DO COMPARE
	    J = J -1 
	END IF	
    DO k= k_start+1, k_end !k is starting piont
        IF (mid == na) THEN
            j = j+1 
		    C(k) = b(j)
        ELSEIF (j == nb) THEN
            Mid = Mid+1 
		    C(k) = a(Mid)
		ELSEIF (a(Mid+1) > b(j+1)) THEN
		    j = j+1 
		    C(k) = b(j) 
		ELSE 
		    Mid = Mid+1 
		    C(k) = a(Mid)
		END IF
	END DO	
	mid_past = mid
	j_past = j
	END DO
	!$OMP END DO 
    !$OMP END PARALLEL 
END SUBROUTINE  
										!(T,		NA,		T(NA+1),NB,		A,		N,	Threads)
SUBROUTINE  Merger_Parrallel_cache_aware(A_load,NA_load,B_load,NB_load,C_load,NC_load,	Threads)
   !$ use omp_lib
   
   !IMPLICIT NONE
   
   
   INTEGER, INTENT(in) :: NA_load,NB_load,NC_load,Threads       ! Normal usage: NA+NB = NC
   INTEGER, INTENT(in out) :: A_load(NA_load)        ! B overlays C(NA+1:NC)
   INTEGER, INTENT(in out) :: B_load(NB_load) !needs in out other wise is only a pionter and gets over written
   INTEGER, INTENT(in out) :: C_load(NC_load)
   INTEGER:: A(2000), B(2000), C(2000), NA, NB, NC
   INTEGER :: k, k_start, k_end, j, low, high, mid, threadID, p 
   INTEGER :: NA_loader_min,NA_loader_max,nb_loader_min, NB_loader_max, NC_loader_min
   INTEGER :: CACHE_SIZE, Elementsize, Segments, count, L, K_run, Mid_past, j_past
   CACHE_SIZE = 32000 !32K 
   Elementsize = 32/8 !32 bits in standard integer 
   l = 2000 !CACHE_SIZE / 3
   p = Threads !over writen if different
   Segments = INT(CEILING(REAL(NC_load)/ REAL(l) ))
    Mid_past = 0
	j_past = 0
	NC_loader_min= 0 
	DO count = 1, Segments !cache aware to prevent long searches
	   ! a = 0 
	    !b = 0
	    !c = 0
		IF(count == 1) THEN !load in values
			NA_loader_min = 1
			NA_loader_max = MIN(l, NA_load) 
			NA = NA_loader_max - NA_loader_min + 1
			A(1:NA) = A_load(NA_loader_min:NA_loader_max)

			nb_loader_min = 1
			nb_loader_max = MIN(l, nb_load) 
			NB = nb_loader_max - nb_loader_min + 1
			B(1:NB) = B_load(nb_loader_min:nb_loader_max)
		ELSE
			!NA_loader_min = MIN((mid + NA_loader_min), NA_load)
			NA_loader_min = mid_past + NA_loader_min
			NA_loader_max = MIN((l-1+NA_loader_min), NA_load) 
			!NA_loader_max = l-1+NA_loader_min
			NA = NA_loader_max - NA_loader_min + 1
			A(1:NA) = A_load(NA_loader_min:NA_loader_max)
			
			!NB_loader_min = MIN((j + NB_loader_min), NB_load)
			NB_loader_min = j_past + NB_loader_min
			NB_loader_max = MIN((l-1+NB_loader_min), NB_load) 
			!NB_loader_max = l-1+NB_loader_min
			NB = NB_loader_max - NB_loader_min + 1
			B(1:NB) = B_load(NB_loader_min:NB_loader_max)
		END IF 
        
        NC_loader_min = NC_loader_min+(mid_past + j_past)
		NC = Min((NC_load-NC_loader_min),l) 
		
        
        p =  MIN(NC, Threads) !cant have more threads than stuff to store
		!search a long the diagonal see merge made simple   
    
		!$OMP PARALLEL 
		!$OMP DO PRIVATE(k, k_start, k_end,  low, high,threadID,Mid,j)
		DO threadID = 0, p-1
			k_start = (NC* threadID)/p 
			k_end = (NC*(threadID+1))/p 
			IF(k_start == 0)THEN
				Mid = 0
				j = 0
			ELSE
				Low = MAX((k_start-NB),0) 
				High = MIN(k_start,NA)
				Mid = Low + ((High-Low)*0.5)
				j = (k_start - Mid) + 1  !diagonal
				COMPARE: DO WHILE (Low<High)
					IF(MID == 0) THEN 
					    IF(a(Mid+1) > b(j-1)) THEN 
							EXIT COMPARE !the turning spot  
						ELSE
							Low = Mid+1; ! search UP
						END IF
					ELSE IF(j > NB ) THEN
					    IF(a(Mid+1) > b(j-1)) THEN 
					        EXIT COMPARE 
					    ELSE
					        Low = Mid+1; ! search UP
					    END IF
						!EXIT COMPARE 
					ELSE IF (a(Mid) > b(j)) THEN
						High = Mid; !search Down
					ELSE 
						IF(a(Mid+1) > b(j-1)) THEN 
							EXIT COMPARE !the turning spot  
						ELSE
							Low = Mid+1; ! search UP
						END IF
					END IF 
					Mid = Low + ((High - Low)*0.5) ! next search
					j = (k_start - Mid) + 1  !diagonal
					IF(Mid == 0 .or. j == 0 ) EXIT COMPARE 
					!
				END DO COMPARE
				J = J -1 
			END IF	
			k_start =  k_start + 1
			k =  k_start
			DO  K = k_start,  k_end !k is starting piont
				IF (mid == na) THEN ! 
					j = j+1 
					C(k) = b(j)
				ELSEIF (j == nb) THEN
					Mid = Mid+1 
					C(k) = a(Mid)
				ELSEIF (a(Mid+1) > b(j+1)) THEN
					j = j+1 
					C(k) = b(j) 
				ELSE 
					Mid = Mid+1 
					C(k) = a(Mid)
				END IF
			END DO	
			!may have more to do
			
			C_load(NC_loader_min+k_start:NC_loader_min+k_end) = c(k_start:k_end)
			IF(threadID == p-1) THEN !because last private dosnt work 
			    Mid_past = Mid
			    j_past = j
			END IF
			
		END DO
		!$OMP END DO
		
		!$OMP END PARALLEL 
	END DO
END SUBROUTINE  


SUBROUTINE BinarySort(a, n) 

    IMPLICIT NONE
    
   integer, intent(in) :: N !size of array
   integer, dimension(N), intent(in out) :: a !array to be sorted
   
   INTEGER :: i, j, Low, High, Mid, Tmp
   

   DO i = 2, n !do every item a(1) is the starting piont and with be moved if need 
        !search the allready sorted
		Low = 1 
		High = i; !the new particle
		Mid = Low + ((High-Low)*0.5) !find the mid piont
    
		compare: DO WHILE (Low<High) !find the piont where it belongs
			IF (a(i) > a(Mid)) THEN 
				Low = Mid + 1; ! search right
			ELSE IF (a(i) < a(Mid)) THEN
				High = Mid; !search left
			ELSE
				EXIT compare !found spot exit 
			END IF 
			Mid = Low + ((High - Low)*0.5) ! next mid piont
		END DO compare
    
		IF (Mid < i) THEN ! does the particle 
			tmp =  a(i)
			DO j = i-1, mid, -1
				a(j+1) = a(j) !shift up
			END DO
			a(mid) = tmp
		END IF
	END DO
	
END SUBROUTINE BinarySort


RECURSIVE SUBROUTINE MergeSort_serial(A,N,T)
 
    IMPLICIT NONE
    
   integer, INTENT(in) :: N
   integer, dimension(N), INTENT(in out) :: A
   integer, dimension(N), INTENT (out) :: T
 
   integer :: NA,NB
 
   IF (N <= 64) THEN !64 is considered small I think it depends on cache size 
        CALL BinarySort (a, n) 
        RETURN
   END IF
   
   NA=(N+1)/2
   NB=N-NA
   
   CALL MergeSort_serial(A,NA,T)
   CALL MergeSort_serial(A(NA+1),NB,T(NA+1))
 
   IF (A(N) < A(1)) THEN !arrays joined wrong way round eg. 56781234
      T(1:NA)=A(1:NA)
      A(1:NB)=A(NA+1:N)
      A(NB+1:N)=T(1:NA)
   ELSE IF (A(NA) > A(NA+1)) THEN
      T(1:NA)=A(1:NA)
      CALL Merger_serial(T,NA,A(NA+1),NB,A,N) !apperntly if you pass A(NA+1) into an array which is meant to be N long it pass the array and the start piont no need to do A(NA+1:n)
   END IF
   RETURN
 
END SUBROUTINE MergeSort_serial



RECURSIVE SUBROUTINE MergeSort_Parrallel(A,N,T,Threads)

    !$ use omp_lib

    IMPLICIT NONE

   
   INTEGER, INTENT(in) :: N, Threads
   INTEGER, dimension(N), INTENT(in out) :: A
   INTEGER, dimension(N), INTENT (out) :: T
 
   INTEGER :: NA,NB, Threads_left, Threads_right, threadID ! , maxium_threads, threadID
   
   IF (N <= 64) THEN !64 is considered small I think it depends on cache size 
        CALL BinarySort (A, N) 
        RETURN
   END IF
   
   
   IF(Threads == 1) THEN ! switch over
        CALL MergeSort_serial(A,N,T)
        RETURN
   END IF
   
   
    NA=(N+1)/2
    NB=N-NA
    Threads_left = (Threads+1)/2
    Threads_right = Threads - Threads_left 
    !$OMP PARALLEL 
    !$OMP SECTIONS
    !$OMP SECTION
    CALL MergeSort_Parrallel(A,NA,T,Threads_left)
    !$OMP SECTION
    CALL MergeSort_Parrallel(A(NA+1),NB,T(NA+1),Threads_right)
    !$OMP END SECTIONS
    !$OMP END PARALLEL 

    IF ( A(NA) > A(NA+1) ) THEN
        T=A
        ! WRITE(6,*) " merge"
        ! call Merger_serial(T,NA,T(NA+1),NB,A,N)
        !CALL Merger_Parrallel(T,NA,T(NA+1),NB,A,N,Threads) !apperntly if you pass an array A(NA+1) to another array it is the same as A(NA+1:)
		CALL Merger_Parrallel_cache_aware(T,NA,T(NA+1),NB,A,N,Threads)
    END IF
    RETURN
 
END SUBROUTINE MergeSort_Parrallel
    
RECURSIVE SUBROUTINE MergeSort_Parrallel_MAIN(A,N,T,Threads)

   !$ use omp_lib
   
    IMPLICIT NONE

   
   INTEGER, INTENT(in) :: N, Threads
   INTEGER, dimension(N), INTENT(in out) :: A
   INTEGER, dimension(N), INTENT (out) :: T
 
   INTEGER :: NA,NB, Threads_left, Threads_right, maxium_threads, threadID

    IF (N <= 64) THEN !64 is considered small I think it depends on cache size 
        CALL BinarySort (a, n) 
        RETURN
   END IF
   
   IF(Threads == 1) THEN ! switch over
        CALL MergeSort_serial(A,N,T)
        RETURN
   ELSE
        NA=(N+1)/2
        NB=N-NA
        Threads_left = (Threads+1)/2
        Threads_right = Threads - Threads_left 
        !$OMP PARALLEL  ! PRIVATE(threadID)
        !$OMP SECTIONS
        !$OMP SECTION
        CALL MergeSort_Parrallel(A,NA,T,Threads_left)
        !$OMP SECTION
        CALL MergeSort_Parrallel(A(NA+1),NB,T(NA+1),Threads_right)
        !$OMP END SECTIONS
        !$OMP END PARALLEL 
        IF ( A(NA) > A(NA+1) ) THEN
            T=A
            !call Merger_serial(T,NA,T(NA+1),NB,A,N)
            !CALL Merger_Parrallel(T,NA,T(NA+1),NB,A,N,Threads) !apperntly if you pass an array A(NA+1) to another array it is the same as A(NA+1:)
			CALL Merger_Parrallel_cache_aware(T,NA,T(NA+1),NB,A,N,Threads)
        END IF
        RETURN
    END IF
END SUBROUTINE MergeSort_Parrallel_MAIN


 
program TestMergeSort

    !$ use omp_lib

    IMPLICIT NONE

    
    integer :: N = 10, maxium_threads, power, i, DEBUG
    integer, Allocatable :: test(:), A(:),B(:),C(:), T(:) 
    REAL :: rnd, Starttime, endtime
    !integer, dimension ((N+1)/2) :: T, TB
    LOGICAL:: parallel = .false.


    Allocate(test(N), A(N),B(N),C(N), T(N))
    
    
    !$ call omp_set_nested(.true.) 

    !$OMP PARALLEL
    maxium_threads =  1 !6 type 6 for testing
    !$ maxium_threads = OMP_get_num_threads()
    !$OMP END PARALLEL
    IF( maxium_threads > 1 ) WRITE(6,*) "parallel activated using " , maxium_threads
    
    !test= (/ 7, 6,  9, 8 , 7, 9, 6, 3, 2  ,2,9,2,6,5,2,1,8,8,4,6,6,1,0,7,2,2/)
    
     !test= (/25480,352516,666914,963055,838288,335355,915327,795863,832693,345042/)

!test= (/871183,       89918 ,     888283,      700978  ,    734552,      300175, &
!       &   49717,      908189 ,      97658,       40313   ,    85024  ,    558820,&
!      &   926451,       75640 ,     911789,       91822     , 637766,      852292,&
!      &   121077,      994574 ,     778250,       19665    ,  169863 ,     994743,&
!      &   753692,      217973 ,     659292,      539018   ,   480063  ,    839226,&
!       &   25420,      989382,      585770,      767067  ,    444816   ,   709505,&
!      &   794526,      438701,      208753,      731112 ,     665731    ,  722074,&
!      &   211470,      308251,      262988,      752295,      549794,     636723,&
!      &   631010,      494784,      470695,      122514     , 534896 ,    194157,&
!      &   967905,      589230 ,      17623,      314342    ,  553303  ,   676283,&
!       &   16883,      265655,      460880,      837450   ,   228953   ,  623515,&
!       &   45456,      523222,      809683,      331785  ,    891928,     702815,&
!      &   746032,      259072,      671299,      654974 ,     935445,     971257,&
!   &      500952,      429800 ,     317115,      603684 ,     168708 ,    926455,&
!&         535565,      537790,      564870,      369716,      488091  ,   192628,&
!    &   889262,      783895,      977774,      644044/)
    
	test= (/190692,67165,800084,797314,636830,588782,159065,320647,448176,657950/)
	
	    !test= (/ 1, 2, 3, 4, 5, 6, 7, 8 , 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26/)
    WRITE(6,*)'test :',test
    
    

    a = test
    CALL BinarySort(A,N)
    WRITE(6,*)'Sorted array BinarySort :',A

    b = test
    CALL MergeSort_serial(b,N,T)
    WRITE(6,*)'Sorted array, serial_hybrid :',b

    c = test
    CALL MergeSort_Parrallel_MAIN(c,N,T,maxium_threads)
    WRITE(6,*)'Sorted array, parrallel_hybrid :',c

    DO I = 1, n 
        IF(c(i) .NE. b(i)) THEN
            WRITE(6,*) "error found" , i 
            PAUSE 
        END IF
    END DO 


    PAUSE
    
    power = 2
    
    1 power = power+1
	
	IF(power == 9) THEN
		GOTO 2
	END IF
	    
    n = 10** power
     WRITE (6,*) "Timing_test for bodies", N , power 
     !WRITE (30,*) "Timing_test for bodies", N 
     DEALLOCATE(test, A,B,C,T)
     ALLOCATE(test(N), A(N),B(N),C(N), T(N))
    
    
    
    DO I = 1, n 
        CALL RANDOM_NUMBER (HARVEST  = rnd)  !random rumber W
	    
        test(i) = INT(rnd * 1000000) !random value between (spell?) 0 and 10
    END DO
    !WRITE(30,*) (test(i), i = 1, n)
    
    WRITE(6,*) "Generate the test"
    
!!    a = test
!!    !!$ Starttime = OMP_get_wtick()
!!    CALL CPU_TIME (Starttime)
!!    CALL BinarySort(A,N)
!!    CALL CPU_TIME (Endtime)
!!    !!$ Endtime = OMP_get_wtick()
!!    WRITE(6,*)'Sorted array BinarySort in', Endtime - Starttime
!!
!!    DEBUG = 0

    b = test
    CALL CPU_TIME (Starttime)
    !$ Starttime = OMP_get_wtime()
    CALL MergeSort_serial(b,N,T)
    CALL CPU_TIME (Endtime)
    !$ Endtime = OMP_get_wtime()
    WRITE(6,*)'Sorted array MergeSort_serial in', Endtime - Starttime

    c = test
    CALL CPU_TIME (Starttime)
    !$ Starttime = OMP_get_wtime()
    CALL MergeSort_Parrallel_MAIN(c,N,T,maxium_threads)
    CALL CPU_TIME (Endtime)
    !$ Endtime = OMP_get_wtime()
    
    WRITE(6,*)'Sorted array parrallel_hybrid in', Endtime - Starttime

    DO I = 1, n 
        IF(c(i) .NE. b(i)) THEN
            WRITE(6,*) "error found" , i 
            PAUSE 
        END IF
   END DO 
   !pause
    GOTO 1
	
	2 CONTINUE 
end program TestMergeSort

Hi Marc,

Did you set OMP_NUM_THREADS or compile with “-mp=allcores”? I see about a 2x speed-up using 4 threads on a Sandy-Bridge system. Otherwise, we default to using a single thread while Intel defaults to using all available cores.

  • Mat
PGI$ pgf90 -fast -mp=allcores -o pgi_merge_omp_win.exe merge.F90
PGI$ pgi_merge_omp_win.exe               
 parallel activated using             4
 test :       190692        67165       800084       797314       636830
       588782       159065       320647       448176       657950
 Sorted array BinarySort :        67165       159065       190692       320647
       448176       588782       636830       657950       797314       800084
 Sorted array, serial_hybrid :        67165       159065       190692
       320647       448176       588782       636830       657950       797314
       800084
 Sorted array, parrallel_hybrid :        67165       159065       190692
       320647       448176       588782       636830       657950       797314
       800084
FORTRAN PAUSE: enter <return> or <ctrl>d to continue>
 Timing_test for bodies         1000            3
 Generate the test
 Sorted array MergeSort_serial in   8.7395310E-05
 Sorted array parrallel_hybrid in   1.5195459E-04
 Timing_test for bodies        10000            4
 Generate the test
 Sorted array MergeSort_serial in   9.5014274E-04
 Sorted array parrallel_hybrid in   6.3563883E-04
 Timing_test for bodies       100000            5
 Generate the test
 Sorted array MergeSort_serial in   1.1345372E-02
 Sorted array parrallel_hybrid in   6.6102371E-03
 Timing_test for bodies      1000000            6
 Generate the test
 Sorted array MergeSort_serial in   0.1325048
 Sorted array parrallel_hybrid in   7.0271447E-02
 Timing_test for bodies     10000000            7
 Generate the test
 Sorted array MergeSort_serial in    1.404848
 Sorted array parrallel_hybrid in   0.7480912
 Timing_test for bodies    100000000            8
 Generate the test
 Sorted array MergeSort_serial in    14.96873
 Sorted array parrallel_hybrid in    8.036676
PGI$

Thanks, I think you lead me to a solution!

did have OMP_NUM_THREADS set to 16, I had tried to 8 threads (the 16 core machine is 2 E5-2687W’s I thought that might avoid NUMA problems). after seeing that you got the speedup with 4 threads, I tired that and got the 2 time speed up which I think that means it a false sharing problem.