Hi every one,
I have had some performance problem. I have developed 3 versions of simulation code. They all do the same work but with different implementations. The first one is using a subroutine called XGEMM which is a DGEMM stub to carry out a matrix multiplication. Since one of the matrices used for multiplication is sparse, the speedup of parallelization for this method is very poor (this is something else and we can ignore this poor performance here). The second version deals with the non-zero elements in the sparse matrix only. So there will be no matrix multiplication routine evolved. Instead, the non-zero elements are used directly to carry out a scalar times a vector calculation. The third version is quite similar to the second one, just change the code of scalar times a vector into a DAXPY call.
After I measure the MFlops for each method, I found the result is quite strange. The first method gives a 515MFlops despite one of the matrices used for multiplication is quite sparse. The second method gives ~500MFlops, which is slightly slower. Well, since the loop of the second method is not optimized, I can understand where this slightly slower comes from. However, the third method, which uses a daxpy to carry out the scalar times a vector, only gives 423MFlops. Since I think the DAXPY subroutine is a well optimized routine, it should at least give the same performance as the first method. I have checked my code carefully for several times and cannot find any mistakes. Any one has any idea about the reason of this poor performance?
All jobs are run serially on Intel Xeon 1U severs (2 Intel Xeon Quad-core, 2.6GHz) and the compiler is:
pgf77 -i8 -mp -O2 -tp p7-64 -time -fast -o xvect.exe xvect.F -lacml
Here are the codes of the 3 versions:
XGEMM:
program xgemmtest
implicit real*8 (A-H,O-Z)
integer i,j,k,l
integer n,num_pes
parameter (n=3432)
Real*8 XA(n,n),XCa(n,n),XMA(n,n),XMB(n,n)
character DayTim*24
real*8 Zero, One, Ten
data Zero/0.0d0/, One/1.0d0/, Ten/10.0d0/
do i=1,n
do j=1,n
XA(j,i)=Zero
XCa(j,i)=Zero
XMA(j,i)=Zero
XMB(j,i)=Zero
end do
end do
C
itmp=0
itmp2=0
do i=1,n
XA(i,i)=real(((i*(i-1))/2)+i)
if(i.eq.(itmp2*68+1))then
itmp2=itmp2+1
do j=i+1,n
itmp=itmp+1
l=((j*(j-1))/2)+i
XA(j,i)=real(l)
XA(i,j)=XA(j,i)
end do
endif
end do
C
write(*,*)'itmp=',itmp
write(*,*)'dimension n*n=',n*n
C
do i=1,n
do j=1,n
XCa(j,i)=One/(Ten**4)
end do
end do
C
call FDate(DayTim)
write(*,*) DayTim
C
Do i=1,10
call XGEMM(1,'N','N',n,n,n,one,XCa,n,XA,n,one,XMA,n)
call XGEMM(1,'N','T',n,n,n,one,XCa,n,XA,n,one,XMB,n)
End do
C
call FDate(DayTim)
write(*,*) DayTim
C
stop
end
C
*Deck XGEMM
Subroutine XGEMM(NRI,Str1,Str2,M,N,K,Alpha,A,LDA,B,LDB,BetaI,C,
$ LDC)
Implicit Integer(A-Z)
C
C Wrapper for matrix multiplies which handles both real/complex and
C OpenMP parallelism.
C
Parameter (N3Min=1)
Character*(*) Str1, Str2, XStr1*1, XStr2*1
Logical DoInit
Real*8 Alpha,A(*),B(*),Beta,BetaI,C(*),CAlpha(2),CBeta(2)
C Integer XM, XK, XN, XLDA, XLDB, XLDC
Integer omp_get_num_threads
#ifdef BLAS_I4
Integer*4 XM, XK, XN, XLDA, XLDB, XLDC
#else
Integer XM, XK, XN, XLDA, XLDB, XLDC
#endif
#ifdef _OPENMP_MM_
Parameter (MinCoW=16)
#endif
real*8 Zero, One
Data Zero/0.0d0/, One/1.0d0/
C
DoInit = K.eq.0
If(DoInit) then
If(BetaI.eq.Zero) then
Do 10 I=1,N
II=(I-1)*LDC*NRI
Do 10 J=1,(M*NRI)
10 C(II+J)=Zero
elseif(BetaI.ne.One) then
Do 20 I = 1, N
II = (I-1)*LDC*NRI
Do 20 J = 1, (M*NRI)
20 C(II+J) = Beta*C(II+J)
endIf
Beta = One
else
Beta = BetaI
Endif
C
If(K.ne.0.and.M.ne.0.and.N.ne.0) then
CAlpha(1) = Alpha
CAlpha(2) = Zero
CBeta(1) = Beta
CBeta(2) = Zero
XM = M
XK = K
XN = N
XLDA = LDA
XLDB = LDB
XLDC = LDC
XStr1 = Str1
XStr2 = Str2
If(NRI.eq.1.and.XStr1.eq.'C') XStr1 = 'T'
If(NRI.eq.1.and.XStr2.eq.'C') XStr2 = 'T'
NP=1
C$OMP PARALLEL
C$OMP SINGLE
C$ NP=omp_get_num_threads()
C$OMP END SINGLE
C$OMP END PARALLEL
C
If(NP.eq.1) Then
NWork = 1
Else
C ColPW = Max((N+(NP*MinCoW)-1)/(NP*MinCoW),MinCoW)
ColPW = Min((N+NP-1)/NP,(NP*MinCoW))
NWork = (N+ColPW-1)/ColPW
Endif
If(NWork.eq.1) then
Call DGEMM(XStr1,XStr2,XM,XN,XK,Alpha,A,XLDA,B,XLDB,Beta,C,
$ XLDC)
else
If(XStr2.eq.'T'.or.XStr2.eq.'C') then
IncB = 1
else
IncB = LDB
endIf
IncB = IncB*ColPW*NRI
IncC = ColPW*LDC*NRI
C$OMP Parallel Do Default(Shared) Schedule(Dynamic,1) Private(IP,XN)
Do 100 IP = 0, (NWork-1)
XN = Min(N-IP*ColPW,ColPW)
Call DGEMM(XStr1,XStr2,XM,XN,XK,Alpha,A,XLDA,B(1+IP*IncB),
$ XLDB,Beta,C(1+IP*IncC),XLDC)
100 Continue
endIf
endIf
Return
End
C
*Deck DGEMM
Subroutine DGEMM ( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB,
$ BETA, C, LDC )
Implicit Real*8(A-H,O-Z)
C .. Scalar Arguments ..
CHARACTER*1 TRANSA, TRANSB
INTEGER M, N, K, LDA, LDB, LDC
Real*8 ALPHA, BETA
C .. Array Arguments ..
Real*8 A( LDA, * ), B( LDB, * ), C( LDC, * )
C ..
C LOGICAL LSAME
C .. Local Scalars ..
LOGICAL NOTA, NOTB
INTEGER I, INFO, J, L, NROWA, NROWB
Real*8 TEMP
C .. Parameters ..
Real*8 ONE , ZERO
PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 )
C ..
C .. Executable Statements ..
C
C Set NOTA and NOTB as true if A and B respectively are not
C transposed and set NROWA and NROWB as the number of rows
C and columns of A and the number of rows of B respectively.
C
C NOTA = LSAME( TRANSA, 'N' )
C NOTB = LSAME( TRANSB, 'N' )
NOTA = TRANSA.eq.'N'
NOTB = TRANSB.eq.'N'
C
IF( NOTA )THEN
NROWA = M
ELSE
NROWA = K
END IF
IF( NOTB )THEN
NROWB = K
ELSE
NROWB = N
END IF
C
C Test the input parameters.
C
INFO = 0
IF( ( .NOT.NOTA ).AND.
C $ ( .NOT.LSAME( TRANSA, 'C' ) ).AND.
$ ( .NOT.(TRANSA.EQ.'C')).AND.
$ ( .NOT.(TRANSA.EQ.'T')) )THEN
INFO = 1
ELSE IF( ( .NOT.NOTB ).AND.
$ ( .NOT.(TRANSB.EQ.'C')).AND.
$ ( .NOT.(TRANSB.EQ.'T')) )THEN
INFO = 2
ELSE IF( M .LT.0 )THEN
C IF( M .LT.0 )THEN
INFO = 3
ELSE IF( N .LT.0 )THEN
INFO = 4
ELSE IF( K .LT.0 )THEN
INFO = 5
ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
INFO = 8
ELSE IF( LDB.LT.MAX( 1, NROWB ) )THEN
INFO = 10
ELSE IF( LDC.LT.MAX( 1, M ) )THEN
INFO = 13
END IF
C If(Info.ne.0) Call GLBErr('DGEMM',Info)
If(Info.ne.0) then
write(*,*)'---DGEMM, wrong Info:',Info,'---'
stop
endif
C
If(M.eq.0.or.N.eq.0.or.
$ ((Alpha.eq.Zero.or.K.eq.0).and.Beta.eq.One)) Return
C
C And if alpha.eq.zero.
C
IF( ALPHA.EQ.ZERO )THEN
IF( BETA.EQ.ZERO )THEN
DO 20, J = 1, N
DO 10, I = 1, M
C( I, J ) = ZERO
10 CONTINUE
20 CONTINUE
ELSE
DO 40, J = 1, N
DO 30, I = 1, M
C( I, J ) = BETA*C( I, J )
30 CONTINUE
40 CONTINUE
END IF
RETURN
END IF
C
C Start the operations.
C
Itmp=0
C
IF( NOTB )THEN
IF( NOTA )THEN
C
C Form C := alpha*A*B + beta*C.
C
DO 90, J = 1, N
IF( BETA.EQ.ZERO )THEN
DO 50, I = 1, M
C( I, J ) = ZERO
50 CONTINUE
ELSE IF( BETA.NE.ONE )THEN
DO 60, I = 1, M
C( I, J ) = BETA*C( I, J )
60 CONTINUE
END IF
DO 80, L = 1, K
IF( B( L, J ).NE.ZERO )THEN
TEMP = ALPHA*B( L, J )
DO 70, I = 1, M
Itmp=Itmp+1
C( I, J ) = C( I, J ) + TEMP*A( I, L )
70 CONTINUE
END IF
80 CONTINUE
90 CONTINUE
write(*,*)'When A*B, Itmp=',Itmp
Itmp=0
ELSE
C
C Form C := alpha*A'*B + beta*C
C
DO 120, J = 1, N
DO 110, I = 1, M
TEMP = ZERO
DO 100, L = 1, K
TEMP = TEMP + A( L, I )*B( L, J )
100 CONTINUE
IF( BETA.EQ.ZERO )THEN
C( I, J ) = ALPHA*TEMP
ELSE
C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
END IF
110 CONTINUE
120 CONTINUE
END IF
ELSE
IF( NOTA )THEN
C
C Form C := alpha*A*B' + beta*C
C
DO 170, J = 1, N
IF( BETA.EQ.ZERO )THEN
DO 130, I = 1, M
C( I, J ) = ZERO
130 CONTINUE
ELSE IF( BETA.NE.ONE )THEN
DO 140, I = 1, M
C( I, J ) = BETA*C( I, J )
140 CONTINUE
END IF
DO 160, L = 1, K
IF( B( J, L ).NE.ZERO )THEN
TEMP = ALPHA*B( J, L )
DO 150, I = 1, M
Itmp=Itmp+1
C( I, J ) = C( I, J ) + TEMP*A( I, L )
150 CONTINUE
END IF
160 CONTINUE
170 CONTINUE
write(*,*)'When A*B(T), Itmp=',Itmp
ELSE
C
C Form C := alpha*A'*B' + beta*C
C
C write(*,*)'---M=',M,',N=',N,',K=',K,'---'
DO 200, J = 1, N
DO 190, I = 1, M
TEMP = ZERO
DO 180, L = 1, K
TEMP = TEMP + A( L, I )*B( J, L )
180 CONTINUE
IF( BETA.EQ.ZERO )THEN
C( I, J ) = ALPHA*TEMP
ELSE
C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
END IF
190 CONTINUE
200 CONTINUE
END IF
END IF
RETURN
END
The second scalar times a vector method:
program xgemmtest
implicit real*8 (A-H,O-Z)
integer i,j,k,l
integer n,num_pes,m
parameter (n=3432,m=14)
integer omp_get_num_threads
integer NP,NChunk
Real*8 XA(n,n),XCa(n,n),XMA(n,n),Dia(n)
Real XList((n/2)*(m*(m-1)/2))
Integer List(2,(n/2)*(m*(m-1)/2))
character DayTim*24
real*8 Zero, One, Ten
data Zero/0.0d0/, One/1.0d0/, Ten/10.0d0/
do i=1,n
Dia(i)=Zero
do j=1,n
XA(j,i)=Zero
XCa(j,i)=Zero
XMA(j,i)=Zero
end do
end do
C
do i=1,n
do j=1,n
XCa(j,i)=One/(Ten**4)
end do
end do
C
call FDate(DayTim)
write(*,*) DayTim
C
do iijj=1,10
icont=0
itmp=0
itmp2=0
do i=1,n
Dia(i)=real(((i*(i-1))/2)+i)
if(i.eq.(itmp2*68+1))then
itmp2=itmp2+1
do j=i+1,n
l=((j*(j-1))/2)+i
icont=icont+1
List(1,icont)=j
List(2,icont)=i
XList(icont)=real(l)
end do
endif
end do
C
do IDial=1,n
val=Dia(IDial)
do Icol=1,n
itmp=itmp+2
XMA(Icol,IDial)=XMA(Icol,IDial)+val*XCa(Icol,IDial)
XA(Icol,IDial)=XA(Icol,IDial)+val*XCa(Icol,IDial)
end do
end do
C
do ICL=1,icont
Ktmp=List(1,ICL)
Ltmp=List(2,ICL)
val=XList(ICL)
do icol=1,n
itmp=itmp+4
XMA(icol,Ktmp)=XMA(icol,Ktmp)+val*XCa(icol,Ltmp)
XMA(icol,Ltmp)=XMA(icol,Ltmp)+val*XCa(icol,Ktmp)
XA(icol,Ktmp)=XA(icol,Ktmp)+val*XCa(icol,Ltmp)
XA(icol,Ltmp)=XA(icol,Ltmp)+val*XCa(icol,Ktmp)
end do
end do
write(*,*)'itmp=',itmp
end do
call FDate(DayTim)
write(*,*) DayTim
C
write(*,*)'itmp=',itmp
write(*,*)'dimension n=',n
C
stop
end
And the odd DAXPY method:
program xgemmtest
implicit real*8 (A-H,O-Z)
integer i,j,k,l
integer n,num_pes,m
parameter (n=3432,m=14)
integer omp_get_num_threads
integer NP,NChunk
Real*8 XA(n,n),XCa(n,n),XMA(n,n),Dia(n)
Real XList((n/2)*(m*(m-1)/2))
Integer List(2,(n/2)*(m*(m-1)/2))
character DayTim*24
real*8 Zero, One, Ten
data Zero/0.0d0/, One/1.0d0/, Ten/10.0d0/
do i=1,n
Dia(i)=Zero
do j=1,n
XA(j,i)=Zero
XCa(j,i)=Zero
XMA(j,i)=Zero
end do
end do
C
do i=1,n
do j=1,n
XCa(j,i)=One/(Ten**4)
end do
end do
C
call FDate(DayTim)
write(*,*) DayTim
C
do iijj=1,10
icont=0
itmp=0
itmp2=0
do i=1,n
Dia(i)=real(((i*(i-1))/2)+i)
if(i.eq.(itmp2*68+1))then
itmp2=itmp2+1
do j=i+1,n
l=((j*(j-1))/2)+i
icont=icont+1
List(1,icont)=j
List(2,icont)=i
XList(icont)=real(l)
end do
endif
end do
C
do IDial=1,n
val=Dia(IDial)
itmp=itmp+2*n
Call DAXPY(n,val,XCa(1,IDial),1,XMA(1,IDial),1)
Call Daxpy(n,val,XCa(1,IDial),1,XA(1,IDial),1)
end do
C
do ICL=1,icont
Ktmp=List(1,ICL)
Ltmp=List(2,ICL)
val=XList(ICL)
itmp=itmp+4*n
Call Daxpy(n,val,XCa(1,Ltmp),1,XMA(1,Ktmp),1)
Call Daxpy(n,val,XCa(1,Ktmp),1,XMA(1,Ltmp),1)
Call Daxpy(n,val,XCa(1,Ltmp),1,XA(1,Ktmp),1)
Call Daxpy(n,val,XCa(1,Ktmp),1,XA(1,Ltmp),1)
end do
write(*,*)'itmp=',itmp
end do
call FDate(DayTim)
write(*,*) DayTim
C
write(*,*)'itmp=',itmp
write(*,*)'dimension n=',n
C
stop
end