How can I make the parallel work?

Dear all,

First I have to apologize for the long code. I tried to make a piece of code to deal with sparse matrix multiplication. The basic concept for this is to store the non-zero elements of the sparse matrix in a list in certain order, then this list is used to carry out the matrix-multiplication-ish calculation. For example, if I have a sparse matrix:

     1  -1  *  -3  *
    -2   5  *   *  *
     *   *  4   6  4
    -4   *  2   7  *
     *   8  *   *  -5

Thus I can generate several lists that mark the positions and the values of the non-zero elements (X is the array to store the value, DR stores the row index of the corresponding non-zero element):

X  = 1  -2  -4  -1  5  8  4  2  -3  6  7  4  -5
DR = 1   2   4   1  2  5  3  4   1  3  4  2   5 
DS = 1   4   7   9  12
DE = 4   7   9  12  14

where DE(i)-DS(i) gives the number of non-zero elements of the ith column.

Based on this thinking, I developed the following code:

      program xgemmtest
      implicit real*8 (A-H,O-Z)
      integer i,j,k,l
      integer n,num_pes
      parameter (n=6)
      Real*8 XA(n,n),XCa(n,n),XMA(n,n)
      Real*8 XVal(10000000)
      integer*4 IRIdxS(n),IRIdxE(n),IRowN(10000000)
      character DayTim*24
      real*8 Zero, One, Ten
      data Zero/0.0d0/, One/1.0d0/, Ten/10.0d0/
C Matrix initialization.
      do i=1,n
        do j=1,n
          XA(j,i)=Zero
          XCa(j,i)=Zero
          XMA(j,i)=Zero
        end do
      end do
C Sparse matrix generation
      itmp=0
      itmp2=0
      do i=1,n
         XA(i,i)=real(((i*(i-1))/2)+i)
        if(i.eq.(itmp2*68+1))then
          itmp2=itmp2+1
          do j=i+1,n
            itmp=itmp+2
            l=((j*(j-1))/2)+i
            XA(j,i)=real(l)
            XA(i,j)=XA(j,i)
          end do
        endif
      end do
C
      write(*,*)'non-zero elements number itmp=',itmp
      write(*,*)'dimension of matrices n*n=',n*n
C Dense matrix generation
      do i=1,n
        do j=1,n
          XCa(j,i)=One/(Ten**4)
        end do
      end do
C
         call FDate(DayTim)
        write(*,"5X,A") DayTim 
C
C Carry out the sparse matrix multiplication.
        call XSPMM('N',n,n,n,one,XCa,n,XA,IRIdxS,IRIdxE,XVal,IRowN,one,
     $            XMA,n)
        call XSPMM('N',n,n,n,one,XCa,n,XA,IRIdxS,IRIdxE,XVal,IRowN,one,
     $            XMA,n)
C
         call FDate(DayTim)
        write(*,"5X,A") DayTim
C
        write(*,*)'XMA=',XMA
C
      stop
      end
C
*Deck XSPMM
      Subroutine XSPMM(Mtyp,M,N,K,Alpha,A,LDA,B,DS,DE,X,DR,BetaI,C,
     $  LDC)
      Implicit Integer(A-Z)
C
C     Wrapper for sparse matrix multiplies (square) which handles both real and
C     OpenMP parallelism:
C
C     The matrix multiplication of:
C
C     C := Beta*C + Alpha*A*B
C
C     is carried out. Where the matrix B is a sparse matrix. In this subroutine, 
C     the non-zero elements of the sparse matrix is first to be stored in a list
C     so that all the non-zero elements can be arranged sequentially in the list.
C     Then this list will be used to carry out the matrix-matrix multiplication.
C     Therefore the zero-test process used previously in the DGEMM subroutine can 
C     be eliminated here.
C
C     There will be two types of sparse matrix: 'N' specifies normal square matrix,
C     and 'S' specifies symmetry matrix.
C
      Parameter (N3Min=1)
      Character*(*) Mtyp, XStr1*1
      Logical DoInit, TypeN, TypeS
      Real*8 Alpha,A(*),B(K,*),X(*),Beta,BetaI,C(*)
      Real*8 CAlpha(2),CBeta(2)
      Integer*4 DS(*),DE(*),DR(*),ICont,SP
      Integer omp_get_num_threads
#ifdef BLAS_I4
      Integer*4 XM, XK, XN, XLDA, XLDC
#else
      Integer XM, XK, XN, XLDA, XLDC
#endif
#ifdef _OPENMP_MM_
      Parameter (MinCoW=1)
#endif
      Save Zero, One
      Data Zero/0.0d0/, One/1.0d0/
C
      DoInit = K.eq.0
      If(DoInit) then
        If(BetaI.eq.Zero) then
          Do 10 I=1,N
            II=(I-1)*LDC
            Do 10 J=1,M
  10          C(II+J)=Zero
        elseif(BetaI.ne.One) then
          Do 20 I = 1, N
            II = (I-1)*LDC
            Do 20 J = 1, M
  20          C(II+J) = Beta*C(II+J)
          endIf
         Beta = One
      else
         Beta = BetaI
      Endif
C
      If(K.ne.0.and.M.ne.0.and.N.ne.0) then
        CAlpha(1) = Alpha
        CAlpha(2) = Zero
        CBeta(1) = Beta
        CBeta(2) = Zero
        XM = M
        XK = K
        XN = N
        XLDA = LDA
        XLDC = LDC
        XStr1 = Mtyp
        TypeN = Mtyp.eq.'N'
        TypeS = Mtyp.eq.'S'
        ICont = 0
C
        NP=1
C$OMP PARALLEL 
C$OMP SINGLE
C$      NP=omp_get_num_threads()
C$OMP END SINGLE
C$OMP END PARALLEL
C
C  Non-zero elements list generation for normal square matrix.
C
        IF (TypeN) Then
          DS(1)=1
          Do 30 J = 1, N
            If (B(J,1).NE.Zero) Then
              ICont = ICont + 1
              DR(ICont) = J
              X (ICont) = B(J,1)
            EndIf
  30      Continue
          DE(1) = ICont + 1
          Do 40 I = 2,K
            DS(I) = DE(I-1)
            Do 50 J = 1, N
              If(B(J,I).NE.Zero)Then
                ICont = ICont + 1
                DR(ICont) = J
                X (ICont) = B(J,I)
              End If
  50        Continue
            DE(I) = ICont + 1
  40      Continue
C
C  Using the list to carry out matrix multiplication.
C
          IF (ICont.LE.N) THEN
            XN = ICont
            ColPW = Max((ICont+NP-1)/NP,MinCoW)
            NWork = (ICont+ColPW-1)/ColPW
            If(NWork.eq.1) then
              Call SPGEMM('N',XM,XN,XK,Alpha,A,XLDA,DS,DE,X,DR,Beta,C,
     $           XLDC)
            Else
              IncC = ColPW*LDC
C$OMP Parallel Do Default(Shared) Schedule(Static,1) Private(IP,XN,SP)
              Do 60 IP = 0, (NWork-1)
                XN = Min(ICont-IP*ColPW,ColPW)
                SP = DS(1+IP*ColPW)
                Call SPGEMM('N',XM,XN,XK,Alpha,A,XLDA,DS(1+IP*ColPW),
     $             DE(1+IP*ColPW),X(SP),DR(SP),
     $             Beta,C(1+IP*IncC),XLDC)
  60          Continue
            EndIf
          ELSE
            ColPW = Max((N+NP-1)/NP,MinCoW)
            NWork = (N+ColPW-1)/ColPW
            If(NWork.eq.1) then
              Call SPGEMM('N',XM,XN,XK,Alpha,A,XLDA,DS,DE,X,DR,Beta,C,
     $           XLDC)
            Else
              IncC = ColPW*LDC
C$OMP Parallel Do Default(Shared) Schedule(Static,1) Private(IP,XN,SP)
              Do 70 IP = 0, (NWork-1)
                XN = Min(N-IP*ColPW,ColPW)
                SP = DS(1+IP*ColPW)
                Call SPGEMM('N',XM,XN,XK,Alpha,A,XLDA,DS(1+IP*ColPW),
     $             DE(1+IP*ColPW),X(SP),DR(SP),
     $             Beta,C(1+IP*IncC),XLDC)
  70          Continue
            EndIf
          ENDIF
C
C Non-zero elements list generation for symmetrical sparse matrix.
C
        ELSEIF (TypeS) THEN
          DS(1)=1
          Do 80 J = 1, N
            If(B(J,1).NE.Zero)Then
              ICont = ICont + 1
              DR(ICont) = J 
              X (ICont) = B(J,1)
            EndIf
  80      Continue
          DE(1) = ICont + 1
          Do 90 I = 2,K 
            DS(I) = DE(I-1)
            Do 100 J = I, N
              If(B(J,I).NE.Zero)Then
                ICont = ICont + 1
                DR(ICont) = J 
                X (ICont) = B(J,I)
              EndIf
 100        Continue
            DE(I) = ICont + 1
  90      Continue
C
C Using list to carry out the matrix multiplication.
C
          ColPW = Max((N+NP-1)/NP,MinCoW)
          NWork = (N+ColPW-1)/ColPW
          If(NWork.eq.1) then
            Call SPGEMM('S',XM,XN,XK,Alpha,A,XLDA,DS,DE,X,DR,Beta,C,
     $         XLDC)
          Else
C$OMP Parallel Do Default(Shared) Schedule(Static,1) Private(IP,XN,SP)
            Do 110 IP = 0, (NWork-1)
              XN = Min(N-IP*ColPW,ColPW)
              SP = DS(1+IP*ColPW)
              Call SPGEMM('S',XM,XN,XK,Alpha,A,XLDA,DS(1+IP*ColPW),
     $           DE(1+IP*ColPW),X(SP),DR(SP),Beta,C,XLDC)
 110        Continue
          EndIf
        ELSE
          Write(*,*)'Wrong matrix type! Should be N or S only'
          Stop
        ENDIF
      EndIf
      Return
      End
*Deck SPGEMM
      Subroutine SPGEMM ( MTYPE, M, N, K, ALPHA, A, LDA, DS, DE, X, DR,
     $                   BETA, C, LDC )
      Implicit Real*8(A-H,O-Z)
      CHARACTER*1        MTYPE
      INTEGER            M, N, K, LDA, LDC
      Real*8   ALPHA, BETA
      Real*8   A( LDA, * ), X( * ), C( LDC, * )
      INTEGER*4 DS( * ), DE( * ), DR( * )
      LOGICAL            NOTA, NOTB
      INTEGER            I, INFO, J, L, ISP, IEP, NR
      Real*8   TEMP
      Real*8   ONE         , ZERO
      PARAMETER        ( ONE = 1.0D+0, ZERO = 0.0D+0 )
C     ..
      NOTA = MTYPE.eq.'N'
      NOTB = MTYPE.eq.'S'
C
      INFO = 0
      IF( M  .LT.0               )THEN
         INFO = 1
      ELSE IF( N  .LT.0               )THEN
         INFO = 2
      ELSE IF( K  .LT.0               )THEN
         INFO = 3
      ELSE IF( LDA.LT.MAX( 1, M ) )THEN
         INFO = 4
      ELSE IF( LDC.LT.MAX( 1, M     ) )THEN
         INFO = 5
      END IF
      If(Info.ne.0) then
        write(*,*)'---SPGEMM, wrong Info:',Info,'---'
        stop
      endif
C
      If(M.eq.0.or.N.eq.0.or.
     $  ((Alpha.eq.Zero.or.K.eq.0).and.Beta.eq.One)) Return
C
C     And if  alpha.eq.zero.
C
      IF( ALPHA.EQ.ZERO )THEN
         IF( BETA.EQ.ZERO )THEN
            DO 20, J = 1, N
               DO 10, I = 1, M
                  C( I, J ) = ZERO
   10          CONTINUE
   20       CONTINUE
         ELSE
            DO 40, J = 1, N
               DO 30, I = 1, M
                  C( I, J ) = BETA*C( I, J )
   30          CONTINUE
   40       CONTINUE
         END IF
         RETURN
      END IF
C
C     Start the operations.
C
      IF( NOTA )THEN
C
C   Deal with normal square sparse matrix multiplication
C
        DO 90, J = 1, N
          IF( BETA.EQ.ZERO )THEN
             DO 50, I = 1, M
                C( I, J ) = ZERO
   50        CONTINUE
          ELSE IF( BETA.NE.ONE )THEN
             DO 60, I = 1, M
                C( I, J ) = BETA*C( I, J )
   60        CONTINUE
          ENDIF
          ISP = DS(J)
          IEP = DE(J)-1
          DO 80, L = ISP, IEP
            TEMP = X( L )
            NR   = DR( L )
            DO 70, I = 1, M
              C( I, J ) = C( I, J ) + TEMP*A( I, NR )
   70       CONTINUE
   80     CONTINUE
   90   CONTINUE
      ELSEIF( NOTB )THEN
C
C   Deal with symmetry square sparse matrix multiplication
C
        DO 160, J = 1, N
          IF( BETA.EQ.ZERO )THEN
             DO 100, I = 1, M
                C( I, J ) = ZERO
  100        CONTINUE
          ELSE IF( BETA.NE.ONE )THEN
             DO 110, I = 1, M
                C( I, J ) = BETA*C( I, J )
  110        CONTINUE
          ENDIF
          ISP = DS(J)
          IEP = DE(J)-1
          DO 130, L = ISP, IEP
            TEMP = X( L )
            NR   = DR( L )
            DO 120, I = 1, M
              C( I, J ) = C( I, J ) + TEMP*A( I, NR )
  120       CONTINUE
  130     CONTINUE
          ISP = DS(J)+1
          IEP = DE(J)-1
          DO 150, L = ISP, IEP
            TEMP = X( L )
            NR   = DR( L )
            DO 140, I = 1, M
              C( I, NR ) = C( I, NR ) + TEMP*A( I, J )
  140       CONTINUE
  150     CONTINUE
  160   CONTINUE
      ENDIF
      RETURN
      END

Compiled it with:

gau-cpp  -I -D_OPENMP_ -D_OPENMP_MM_ -DCA1_DGEMM -DCA2_DGEMM -DCAB_DGEMM -DGREAL=DREAL xgemm.F xgemm.f

pgf77 -i8 -mp  -O2 -tp p7-64 -time -fast -c xgemm.f

pgf77 -i8 -mp  -O2 -tp p7-64 -time -fast -o xgemm.exe xgemm.o

For serial job, this code works fine. And the output is:

     XMA=  8.2E-003  8.2E-003   8.2E-003  8.2E-003  8.2E-003  8.2E-003
           1.0E-003  1.0E-003   1.0E-003  1.0E-003  1.0E-003  1.0E-003
           2.0E-003  2.0E-003   2.0E-003  2.0E-003  2.0E-003  2.0E-003
           3.4E-003  3.4E-003   3.4E-003  3.4E-003  3.4E-003  3.4E-003
           5.2E-003  5.2E-003   5.2E-003  5.2E-003  5.2E-003  5.2E-003
           7.4E-003  7.4E-003   7.4E-003  7.4E-003  7.4E-003  7.4E-003

Once the number of threads changes to more than 1, e.g. 2 thread, the output becomes:

     XMA=  8.2E-003  8.2E-003   8.2E-003  8.2E-003  8.2E-003   8.2E-003
           1.0E-003  1.0E-003   1.0E-003  1.0E-003  1.0E-003   1.0E-003
           2.0E-003  2.0E-003   2.0E-003  2.0E-003  2.0E-003   2.0E-003
           0.0E+000  0.0E+000   0.0E+000  0.0E+000  0.0E+000   0.0E+000
           0.0E+000  0.0E+000   0.0E+000  0.0E+000  0.0E+000   0.0E+000
           0.0E+000  0.0E+000   0.0E+000  0.0E+000  0.0E+000   0.0E+000

As we can see, the first thread works fine, but it seems the second thread doesn’t work at all. Any one can help me on finding out the reason for this? Thank you very much!!

Sharp

Hi Sharp,

Using the ‘print *’ method for debugging, I noticed that your ‘TEMP’ value can be 0 when the using more than 1 thread. So I tinkered around a bit and found I could fix this if I changed the call to SPGEMM to pass in ‘X,DR’ instead of ‘X(SP),DR(SP)’. I didn’t take it farther than that, so will leave it to to figure out why.

  • Mat
% diff -u sharp1.F sharp2.F
--- sharp1.F    2009-03-02 12:15:11.000000000 -0800
+++ sharp2.F    2009-03-02 12:16:11.000000000 -0800
@@ -179,7 +179,7 @@
                 XN = Min(ICont-IP*ColPW,ColPW)
                 SP = DS(1+IP*ColPW)
                 Call SPGEMM('N',XM,XN,XK,Alpha,A,XLDA,DS(1+IP*ColPW),
-     $             DE(1+IP*ColPW),X(SP),DR(SP),
+     $             DE(1+IP*ColPW),X,DR,
      $             Beta,C(1+IP*IncC),XLDC)
   60          Continue
             EndIf
@@ -196,7 +196,7 @@
                 XN = Min(N-IP*ColPW,ColPW)
                 SP = DS(1+IP*ColPW)
                 Call SPGEMM('N',XM,XN,XK,Alpha,A,XLDA,DS(1+IP*ColPW),
-     $             DE(1+IP*ColPW),X(SP),DR(SP),
+     $             DE(1+IP*ColPW),X,DR,
      $             Beta,C(1+IP*IncC),XLDC)
   70          Continue
             EndIf

% pgf90 -mp -fast sharp1.F -D_OPENMP_ -D_OPENMP_MM_ -DCA1_DGEMM -DCA2_DGEMM -DCAB_DGEMM -DGREAL=DREAL -o sharp1.exe -V8.0-4
% setenv OMP_NUM_THREADS 8       
% sharp1.exe
 non-zero elements number itmp=           10
 dimension of matrices n*n=           36
     Mon Mar  2 12:21:17 2009
     Mon Mar  2 12:21:17 2009
 XMA=   8.2000000000000007E-003   8.2000000000000007E-003
   8.2000000000000007E-003   8.2000000000000007E-003   8.2000000000000007E-003
   8.2000000000000007E-003   5.1999999999999998E-003   5.1999999999999998E-003
   5.1999999999999998E-003   5.1999999999999998E-003   5.1999999999999998E-003
   5.1999999999999998E-003    0.000000000000000         0.000000000000000
    0.000000000000000         0.000000000000000         0.000000000000000
    0.000000000000000         0.000000000000000         0.000000000000000
    0.000000000000000         0.000000000000000         0.000000000000000
    0.000000000000000         0.000000000000000         0.000000000000000
    0.000000000000000         0.000000000000000         0.000000000000000
    0.000000000000000         0.000000000000000         0.000000000000000
    0.000000000000000         0.000000000000000         0.000000000000000
    0.000000000000000
FORTRAN STOP

% pgf90 -mp -fast sharp2.F -D_OPENMP_ -D_OPENMP_MM_ -DCA1_DGEMM -DCA2_DGEMM -DCAB_DGEMM -DGREAL=DREAL -o sharp2.exe -V8.0-4
% setenv OMP_NUM_THREADS 8
% sharp2.exe
 non-zero elements number itmp=           10
 dimension of matrices n*n=           36
     Mon Mar  2 12:20:48 2009
     Mon Mar  2 12:20:48 2009
 XMA=   8.2000000000000007E-003   8.2000000000000007E-003
   8.2000000000000007E-003   8.2000000000000007E-003   8.2000000000000007E-003
   8.2000000000000007E-003   1.0000000000000000E-003   1.0000000000000000E-003
   1.0000000000000000E-003   1.0000000000000000E-003   1.0000000000000000E-003
   1.0000000000000000E-003   2.0000000000000000E-003   2.0000000000000000E-003
   2.0000000000000000E-003   2.0000000000000000E-003   2.0000000000000000E-003
   2.0000000000000000E-003   3.4000000000000002E-003   3.4000000000000002E-003
   3.4000000000000002E-003   3.4000000000000002E-003   3.4000000000000002E-003
   3.4000000000000002E-003   5.1999999999999998E-003   5.1999999999999998E-003
   5.1999999999999998E-003   5.1999999999999998E-003   5.1999999999999998E-003
   5.1999999999999998E-003   7.4000000000000003E-003   7.4000000000000003E-003
   7.4000000000000003E-003   7.4000000000000003E-003   7.4000000000000003E-003
   7.4000000000000003E-003
FORTRAN STOP

Hi Matt, the reason I passed ‘X(SP),DR(SP)’ to the call to SPGEMM is to set a start point to each thread. So the piece of array X and DR started from SP can be fetched directly into the cache corresponding to the thread without a memory access. I thought this might speed up the parallel scale. I just can’t understand why this doesn’t work. I am really confused.

Also I found the following thing:

If I use the key word “S” instead of “N” when I call XSPMM:

        call XSPMM('S',n,n,n,one,XCa,n,XA,IRIdxS,IRIdxE,XVal,IRowN,one,
     $            XMA,n)
        call XSPMM('S',n,n,n,one,XCa,n,XA,IRIdxS,IRIdxE,XVal,IRowN,one,
     $            XMA,n)

And in the calling SPGEMM bit if I pass “X(SP),DR(SP)” instead of “X,DR”, I can have:
serial:

     XMA=  8.2E-003  8.2E-003   8.2E-003  8.2E-003  8.2E-003  8.2E-003
           1.0E-003  1.0E-003   1.0E-003  1.0E-003  1.0E-003  1.0E-003
           2.0E-003  2.0E-003   2.0E-003  2.0E-003  2.0E-003  2.0E-003
           3.4E-003  3.4E-003   3.4E-003  3.4E-003  3.4E-003  3.4E-003
           5.2E-003  5.2E-003   5.2E-003  5.2E-003  5.2E-003  5.2E-003
           7.4E-003  7.4E-003   7.4E-003  7.4E-003  7.4E-003  7.4E-003

2 cpus:

     XMA=  8.2E-003  8.2E-003   8.2E-003  8.2E-003  8.2E-003   8.2E-003
           1.0E-003  1.0E-003   1.0E-003  1.0E-003  1.0E-003   1.0E-003
           2.0E-003  2.0E-003   2.0E-003  2.0E-003  2.0E-003   2.0E-003
           1.4E-003  1.4E-003   1.4E-003  1.4E-003  1.4E-003   1.4E-003
           2.2E-003  2.2E-003   2.2E-003  2.2E-003  2.2E-003   2.2E-003
           3.2E-003  3.2E-003   3.2E-003  3.2E-003  3.2E-003   3.2E-003

As we can see, at least half of the result is right. However, when I pass “X,DR” to SPGEMM instead of “X(SP),DR(SP)”, I got:

     XMA=  1.0E-002  1.0E-002   1.0E-002  1.0E-002  1.0E-002   1.0E-002
           2.5E-003  4.0E-003   4.0E-003  4.0E-003  4.0E-003   4.0E-003
           6.2E-003  6.2E-003   6.2E-003  6.2E-003  6.2E-003   6.2E-003
           1.4E-003  1.4E-003   1.4E-003  1.4E-003  1.4E-003   1.4E-003
           2.2E-003  2.2E-003   2.2E-003  2.2E-003  2.2E-003   2.2E-003
           3.2E-003  3.2E-003   3.2E-003  3.2E-003  3.2E-003   3.2E-003

The result is completely wrong. I am quite confused about this output. If I parallel the code inside the subroutine SPGEMM, I think this won’t happen. But I would like to leave SPGEMM alone and parallel it outside. How can I make this happen? Thank you very much!!!