Dear all,
First I have to apologize for the long code. I tried to make a piece of code to deal with sparse matrix multiplication. The basic concept for this is to store the non-zero elements of the sparse matrix in a list in certain order, then this list is used to carry out the matrix-multiplication-ish calculation. For example, if I have a sparse matrix:
1 -1 * -3 *
-2 5 * * *
* * 4 6 4
-4 * 2 7 *
* 8 * * -5
Thus I can generate several lists that mark the positions and the values of the non-zero elements (X is the array to store the value, DR stores the row index of the corresponding non-zero element):
X = 1 -2 -4 -1 5 8 4 2 -3 6 7 4 -5
DR = 1 2 4 1 2 5 3 4 1 3 4 2 5
DS = 1 4 7 9 12
DE = 4 7 9 12 14
where DE(i)-DS(i) gives the number of non-zero elements of the ith column.
Based on this thinking, I developed the following code:
program xgemmtest
implicit real*8 (A-H,O-Z)
integer i,j,k,l
integer n,num_pes
parameter (n=6)
Real*8 XA(n,n),XCa(n,n),XMA(n,n)
Real*8 XVal(10000000)
integer*4 IRIdxS(n),IRIdxE(n),IRowN(10000000)
character DayTim*24
real*8 Zero, One, Ten
data Zero/0.0d0/, One/1.0d0/, Ten/10.0d0/
C Matrix initialization.
do i=1,n
do j=1,n
XA(j,i)=Zero
XCa(j,i)=Zero
XMA(j,i)=Zero
end do
end do
C Sparse matrix generation
itmp=0
itmp2=0
do i=1,n
XA(i,i)=real(((i*(i-1))/2)+i)
if(i.eq.(itmp2*68+1))then
itmp2=itmp2+1
do j=i+1,n
itmp=itmp+2
l=((j*(j-1))/2)+i
XA(j,i)=real(l)
XA(i,j)=XA(j,i)
end do
endif
end do
C
write(*,*)'non-zero elements number itmp=',itmp
write(*,*)'dimension of matrices n*n=',n*n
C Dense matrix generation
do i=1,n
do j=1,n
XCa(j,i)=One/(Ten**4)
end do
end do
C
call FDate(DayTim)
write(*,"5X,A") DayTim
C
C Carry out the sparse matrix multiplication.
call XSPMM('N',n,n,n,one,XCa,n,XA,IRIdxS,IRIdxE,XVal,IRowN,one,
$ XMA,n)
call XSPMM('N',n,n,n,one,XCa,n,XA,IRIdxS,IRIdxE,XVal,IRowN,one,
$ XMA,n)
C
call FDate(DayTim)
write(*,"5X,A") DayTim
C
write(*,*)'XMA=',XMA
C
stop
end
C
*Deck XSPMM
Subroutine XSPMM(Mtyp,M,N,K,Alpha,A,LDA,B,DS,DE,X,DR,BetaI,C,
$ LDC)
Implicit Integer(A-Z)
C
C Wrapper for sparse matrix multiplies (square) which handles both real and
C OpenMP parallelism:
C
C The matrix multiplication of:
C
C C := Beta*C + Alpha*A*B
C
C is carried out. Where the matrix B is a sparse matrix. In this subroutine,
C the non-zero elements of the sparse matrix is first to be stored in a list
C so that all the non-zero elements can be arranged sequentially in the list.
C Then this list will be used to carry out the matrix-matrix multiplication.
C Therefore the zero-test process used previously in the DGEMM subroutine can
C be eliminated here.
C
C There will be two types of sparse matrix: 'N' specifies normal square matrix,
C and 'S' specifies symmetry matrix.
C
Parameter (N3Min=1)
Character*(*) Mtyp, XStr1*1
Logical DoInit, TypeN, TypeS
Real*8 Alpha,A(*),B(K,*),X(*),Beta,BetaI,C(*)
Real*8 CAlpha(2),CBeta(2)
Integer*4 DS(*),DE(*),DR(*),ICont,SP
Integer omp_get_num_threads
#ifdef BLAS_I4
Integer*4 XM, XK, XN, XLDA, XLDC
#else
Integer XM, XK, XN, XLDA, XLDC
#endif
#ifdef _OPENMP_MM_
Parameter (MinCoW=1)
#endif
Save Zero, One
Data Zero/0.0d0/, One/1.0d0/
C
DoInit = K.eq.0
If(DoInit) then
If(BetaI.eq.Zero) then
Do 10 I=1,N
II=(I-1)*LDC
Do 10 J=1,M
10 C(II+J)=Zero
elseif(BetaI.ne.One) then
Do 20 I = 1, N
II = (I-1)*LDC
Do 20 J = 1, M
20 C(II+J) = Beta*C(II+J)
endIf
Beta = One
else
Beta = BetaI
Endif
C
If(K.ne.0.and.M.ne.0.and.N.ne.0) then
CAlpha(1) = Alpha
CAlpha(2) = Zero
CBeta(1) = Beta
CBeta(2) = Zero
XM = M
XK = K
XN = N
XLDA = LDA
XLDC = LDC
XStr1 = Mtyp
TypeN = Mtyp.eq.'N'
TypeS = Mtyp.eq.'S'
ICont = 0
C
NP=1
C$OMP PARALLEL
C$OMP SINGLE
C$ NP=omp_get_num_threads()
C$OMP END SINGLE
C$OMP END PARALLEL
C
C Non-zero elements list generation for normal square matrix.
C
IF (TypeN) Then
DS(1)=1
Do 30 J = 1, N
If (B(J,1).NE.Zero) Then
ICont = ICont + 1
DR(ICont) = J
X (ICont) = B(J,1)
EndIf
30 Continue
DE(1) = ICont + 1
Do 40 I = 2,K
DS(I) = DE(I-1)
Do 50 J = 1, N
If(B(J,I).NE.Zero)Then
ICont = ICont + 1
DR(ICont) = J
X (ICont) = B(J,I)
End If
50 Continue
DE(I) = ICont + 1
40 Continue
C
C Using the list to carry out matrix multiplication.
C
IF (ICont.LE.N) THEN
XN = ICont
ColPW = Max((ICont+NP-1)/NP,MinCoW)
NWork = (ICont+ColPW-1)/ColPW
If(NWork.eq.1) then
Call SPGEMM('N',XM,XN,XK,Alpha,A,XLDA,DS,DE,X,DR,Beta,C,
$ XLDC)
Else
IncC = ColPW*LDC
C$OMP Parallel Do Default(Shared) Schedule(Static,1) Private(IP,XN,SP)
Do 60 IP = 0, (NWork-1)
XN = Min(ICont-IP*ColPW,ColPW)
SP = DS(1+IP*ColPW)
Call SPGEMM('N',XM,XN,XK,Alpha,A,XLDA,DS(1+IP*ColPW),
$ DE(1+IP*ColPW),X(SP),DR(SP),
$ Beta,C(1+IP*IncC),XLDC)
60 Continue
EndIf
ELSE
ColPW = Max((N+NP-1)/NP,MinCoW)
NWork = (N+ColPW-1)/ColPW
If(NWork.eq.1) then
Call SPGEMM('N',XM,XN,XK,Alpha,A,XLDA,DS,DE,X,DR,Beta,C,
$ XLDC)
Else
IncC = ColPW*LDC
C$OMP Parallel Do Default(Shared) Schedule(Static,1) Private(IP,XN,SP)
Do 70 IP = 0, (NWork-1)
XN = Min(N-IP*ColPW,ColPW)
SP = DS(1+IP*ColPW)
Call SPGEMM('N',XM,XN,XK,Alpha,A,XLDA,DS(1+IP*ColPW),
$ DE(1+IP*ColPW),X(SP),DR(SP),
$ Beta,C(1+IP*IncC),XLDC)
70 Continue
EndIf
ENDIF
C
C Non-zero elements list generation for symmetrical sparse matrix.
C
ELSEIF (TypeS) THEN
DS(1)=1
Do 80 J = 1, N
If(B(J,1).NE.Zero)Then
ICont = ICont + 1
DR(ICont) = J
X (ICont) = B(J,1)
EndIf
80 Continue
DE(1) = ICont + 1
Do 90 I = 2,K
DS(I) = DE(I-1)
Do 100 J = I, N
If(B(J,I).NE.Zero)Then
ICont = ICont + 1
DR(ICont) = J
X (ICont) = B(J,I)
EndIf
100 Continue
DE(I) = ICont + 1
90 Continue
C
C Using list to carry out the matrix multiplication.
C
ColPW = Max((N+NP-1)/NP,MinCoW)
NWork = (N+ColPW-1)/ColPW
If(NWork.eq.1) then
Call SPGEMM('S',XM,XN,XK,Alpha,A,XLDA,DS,DE,X,DR,Beta,C,
$ XLDC)
Else
C$OMP Parallel Do Default(Shared) Schedule(Static,1) Private(IP,XN,SP)
Do 110 IP = 0, (NWork-1)
XN = Min(N-IP*ColPW,ColPW)
SP = DS(1+IP*ColPW)
Call SPGEMM('S',XM,XN,XK,Alpha,A,XLDA,DS(1+IP*ColPW),
$ DE(1+IP*ColPW),X(SP),DR(SP),Beta,C,XLDC)
110 Continue
EndIf
ELSE
Write(*,*)'Wrong matrix type! Should be N or S only'
Stop
ENDIF
EndIf
Return
End
*Deck SPGEMM
Subroutine SPGEMM ( MTYPE, M, N, K, ALPHA, A, LDA, DS, DE, X, DR,
$ BETA, C, LDC )
Implicit Real*8(A-H,O-Z)
CHARACTER*1 MTYPE
INTEGER M, N, K, LDA, LDC
Real*8 ALPHA, BETA
Real*8 A( LDA, * ), X( * ), C( LDC, * )
INTEGER*4 DS( * ), DE( * ), DR( * )
LOGICAL NOTA, NOTB
INTEGER I, INFO, J, L, ISP, IEP, NR
Real*8 TEMP
Real*8 ONE , ZERO
PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 )
C ..
NOTA = MTYPE.eq.'N'
NOTB = MTYPE.eq.'S'
C
INFO = 0
IF( M .LT.0 )THEN
INFO = 1
ELSE IF( N .LT.0 )THEN
INFO = 2
ELSE IF( K .LT.0 )THEN
INFO = 3
ELSE IF( LDA.LT.MAX( 1, M ) )THEN
INFO = 4
ELSE IF( LDC.LT.MAX( 1, M ) )THEN
INFO = 5
END IF
If(Info.ne.0) then
write(*,*)'---SPGEMM, wrong Info:',Info,'---'
stop
endif
C
If(M.eq.0.or.N.eq.0.or.
$ ((Alpha.eq.Zero.or.K.eq.0).and.Beta.eq.One)) Return
C
C And if alpha.eq.zero.
C
IF( ALPHA.EQ.ZERO )THEN
IF( BETA.EQ.ZERO )THEN
DO 20, J = 1, N
DO 10, I = 1, M
C( I, J ) = ZERO
10 CONTINUE
20 CONTINUE
ELSE
DO 40, J = 1, N
DO 30, I = 1, M
C( I, J ) = BETA*C( I, J )
30 CONTINUE
40 CONTINUE
END IF
RETURN
END IF
C
C Start the operations.
C
IF( NOTA )THEN
C
C Deal with normal square sparse matrix multiplication
C
DO 90, J = 1, N
IF( BETA.EQ.ZERO )THEN
DO 50, I = 1, M
C( I, J ) = ZERO
50 CONTINUE
ELSE IF( BETA.NE.ONE )THEN
DO 60, I = 1, M
C( I, J ) = BETA*C( I, J )
60 CONTINUE
ENDIF
ISP = DS(J)
IEP = DE(J)-1
DO 80, L = ISP, IEP
TEMP = X( L )
NR = DR( L )
DO 70, I = 1, M
C( I, J ) = C( I, J ) + TEMP*A( I, NR )
70 CONTINUE
80 CONTINUE
90 CONTINUE
ELSEIF( NOTB )THEN
C
C Deal with symmetry square sparse matrix multiplication
C
DO 160, J = 1, N
IF( BETA.EQ.ZERO )THEN
DO 100, I = 1, M
C( I, J ) = ZERO
100 CONTINUE
ELSE IF( BETA.NE.ONE )THEN
DO 110, I = 1, M
C( I, J ) = BETA*C( I, J )
110 CONTINUE
ENDIF
ISP = DS(J)
IEP = DE(J)-1
DO 130, L = ISP, IEP
TEMP = X( L )
NR = DR( L )
DO 120, I = 1, M
C( I, J ) = C( I, J ) + TEMP*A( I, NR )
120 CONTINUE
130 CONTINUE
ISP = DS(J)+1
IEP = DE(J)-1
DO 150, L = ISP, IEP
TEMP = X( L )
NR = DR( L )
DO 140, I = 1, M
C( I, NR ) = C( I, NR ) + TEMP*A( I, J )
140 CONTINUE
150 CONTINUE
160 CONTINUE
ENDIF
RETURN
END
Compiled it with:
gau-cpp -I -D_OPENMP_ -D_OPENMP_MM_ -DCA1_DGEMM -DCA2_DGEMM -DCAB_DGEMM -DGREAL=DREAL xgemm.F xgemm.f
pgf77 -i8 -mp -O2 -tp p7-64 -time -fast -c xgemm.f
pgf77 -i8 -mp -O2 -tp p7-64 -time -fast -o xgemm.exe xgemm.o
For serial job, this code works fine. And the output is:
XMA= 8.2E-003 8.2E-003 8.2E-003 8.2E-003 8.2E-003 8.2E-003
1.0E-003 1.0E-003 1.0E-003 1.0E-003 1.0E-003 1.0E-003
2.0E-003 2.0E-003 2.0E-003 2.0E-003 2.0E-003 2.0E-003
3.4E-003 3.4E-003 3.4E-003 3.4E-003 3.4E-003 3.4E-003
5.2E-003 5.2E-003 5.2E-003 5.2E-003 5.2E-003 5.2E-003
7.4E-003 7.4E-003 7.4E-003 7.4E-003 7.4E-003 7.4E-003
Once the number of threads changes to more than 1, e.g. 2 thread, the output becomes:
XMA= 8.2E-003 8.2E-003 8.2E-003 8.2E-003 8.2E-003 8.2E-003
1.0E-003 1.0E-003 1.0E-003 1.0E-003 1.0E-003 1.0E-003
2.0E-003 2.0E-003 2.0E-003 2.0E-003 2.0E-003 2.0E-003
0.0E+000 0.0E+000 0.0E+000 0.0E+000 0.0E+000 0.0E+000
0.0E+000 0.0E+000 0.0E+000 0.0E+000 0.0E+000 0.0E+000
0.0E+000 0.0E+000 0.0E+000 0.0E+000 0.0E+000 0.0E+000
As we can see, the first thread works fine, but it seems the second thread doesn’t work at all. Any one can help me on finding out the reason for this? Thank you very much!!
Sharp