I am observing a curious error while calling scalapack routine PDGESV from an MPI fortran program to solve a set of linear simultaneous equations Ax = b, in which A is an NxN matrix, b an N-element column vector and x the N-element solution vector. I have prepared a small test program (which I have pasted in below) which reproduces the problem. The program 1) allocates space for the matrix A and vector b, 2) fills their entries with random entries 3) calls PDGESV and then 4) deallocates the memory. The program takes 4 command line arguments: N , NPROW, NPCOL, MB, where NPROW , NPCOL determine the dimensions of the blacs process array and the blocksize MB x MB is used. The program successfully completes for all values of N that are small enough, and for all selected BLACS processor arrays / block sizes for these small N cases that I have tried. I have not precisely determined the point at which failure first occurs, but have only observed success for values of N up to N = 50000, but failure for the case N=94423 (with NPROW=NPCOL = 2, MB = 32). The failure (of segmentation fault type) occurs during the call to PDGESV, and the code never returns from the call to PDGESV. Note that failure does not, in particular, occur during the allocation of memory for the matrices / initialisation of the matrices. Indeed the cause of the problem is unlikely to be lack of memory, since for the N=94423 case the required memory is around 68GB, whereas 190 GB is available.
These results suggest to me that perhaps(?) there is nothing wrong with the fortran code itself, but maybe the linking / compiling commands need to be revised for large values of N ? I compile and link in one step with :
mpifort -O -o parat.exe solve_by_lu_parallelmpi_simple_light2.for -mp -tp=haswell -mcmodel=medium -Mscalapack
and can run with
mpirun -n 4 ./parat.exe 9442 2 2 32 > DUMP01 [this will be successful]
mpirun -n 4 ./parat.exe 94423 2 2 32 > DUMP02 [this will fail]
I am running the program on a linux cluster using Red Hat Enterprise Linux Server release 7.3 (Maipo),
compiling using PGI compiler version 17.7.
Any help / advice will really be greatly appreciated.
Many thanks, Dan.
Fortran Program:
PROGRAM SOLVE_LU
USE MPI
IMPLICIT NONE
INTEGER :: N
DOUBLE PRECISION, ALLOCATABLE, DIMENSION(:,:) :: LOCAL_A
DOUBLE PRECISION, ALLOCATABLE, DIMENSION(:) :: LOCAL_B
INTEGER :: ISTATUS
INTEGER :: INFO, NRHS, IA, JA, IB, JB
INTEGER, ALLOCATABLE, DIMENSION (:) :: IPIV
INTEGER :: IARGC, N_COMMAND_ARG
CHARACTER :: ARGV*10
INTEGER :: NPROW, NPCOL, ICTXT,MYROW, MYCOL, MB, NB, MLOC, NLOC
INTEGER :: IDESCA(9), IDESCB(9)
INTEGER :: IERR
INTEGER :: NUMROC
INTEGER :: ISEEDSIZE
INTEGER, ALLOCATABLE, DIMENSION ( :) :: SEED
N_COMMAND_ARG = iargc()
IF (N_COMMAND_ARG == 2) THEN
WRITE(*,*) 'ILLEGAL NO. OF COMMAND LINE PARAMETERS'
STOP
ENDIF
IF (N_COMMAND_ARG .GE. 1)THEN
CALL GETARG(1,argv)
READ (ARGV,'(I10)') N
ELSE
N = 100
ENDIF
IF (N_COMMAND_ARG .GE. 3)THEN
CALL GETARG(2,argv)
READ (ARGV,'(I10)') NPROW
CALL GETARG(3,argv)
READ (ARGV,'(I10)') NPCOL
ELSE
NPROW = 2
NPCOL = 2
ENDIF
IF (N_COMMAND_ARG .GE. 4)THEN
CALL GETARG(4,argv)
READ (ARGV,'(I10)') MB
ELSE
MB = 8
ENDIF
NB = MB
C INITIALISE BLACS
CALL SL_INIT(ICTXT, NPROW, NPCOL)
CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
MLOC = NUMROC(N, MB, MYROW, 0, NPROW)
NLOC = NUMROC(N, NB, MYCOL, 0, NPCOL)
IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 )WRITE(*,*)
@ 'WE ARE SOLVING A SYSTEM OF ', N, ' LINEAR EQUATIONS'
WRITE(*,*) 'PROC: ',MYROW, MYCOL,'HAS MLOC, NLOC =', MLOC,NLOC
C ALLOCATE SPACE
WRITE(*,*) 'PROC: ',MYROW, MYCOL,' ALLOCATING SPACE ...'
ALLOCATE ( LOCAL_A(MLOC,NLOC), STAT = ISTATUS )
IF(ISTATUS .NE. 0) THEN
WRITE(*,*)'UNABLE TO ALLOCATE LOCAL_A, PROCESS: ',MYROW,MYCOL
STOP
ENDIF
ALLOCATE ( LOCAL_B(MLOC), STAT = ISTATUS )
IF (ISTATUS /= 0) THEN
WRITE(*,*)
@ ' FAILED TO ALLOCATE SPACE FOR LOCAL_B, PROCESS: ',MYROW,MYCOL
STOP
ENDIF
CALL DESCINIT (IDESCA, N, N, MB, NB, 0, 0,
@ ICTXT, MLOC, IERR)
CALL DESCINIT (IDESCB, N, 1, MB, 1, 0, 0, ICTXT, MLOC, IERR)
C FILL ENTRIES OF MATRIX A AND R.H.S. VECTOR B WITH RANDOM ENTRIES
WRITE(*,*)'PROC: ',MYROW, MYCOL,
@ ' CONSTRUCTING MATRIX A AND RHS VECTOR B ...'
CALL RANDOM_SEED
CALL RANDOM_SEED ( SIZE = ISEEDSIZE )
ALLOCATE ( SEED(1:ISEEDSIZE) )
CALL RANDOM_SEED ( GET = SEED )
SEED(1) = SEED(1) + NPCOL*MYROW + MYCOL
CALL RANDOM_SEED ( PUT = SEED )
CALL RANDOM_NUMBER(LOCAL_B)
CALL RANDOM_NUMBER(LOCAL_A)
C CALL LAPACK LU SOLVER PDGESV
WRITE(*,*)'PROC: ',MYROW, MYCOL,
@ 'NOW SOLVING SYSTEM AX = B USING SCALAPACK PDGESV ..'
ALLOCATE ( IPIV(MLOC + MB), STAT=ISTATUS )
IF(ISTATUS /= 0) THEN
WRITE(*,*)'UNABLE TO ALLOCATE IPIV, PROCESS: ',MYROW,MYCOL
STOP
ENDIF
IA = 1
JA = 1
IB = 1
JB = 1
NRHS = 1
INFO = 0
CALL PDGESV(N, NRHS, LOCAL_A, IA, JA, IDESCA, IPIV,
@ LOCAL_B, IB, JB, IDESCB, INFO )
IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN
WRITE(*,*)
WRITE(*,*) 'INFO code returned by PDGESV = ', INFO
WRITE(*,*)
END IF
C DEALLOCATE MEMORY
DEALLOCATE(LOCAL_A, STAT=ISTATUS)
IF(ISTATUS /= 0) THEN
WRITE(*,*)'UNABLE TO DEALLOCATE '
STOP
ENDIF
DEALLOCATE(LOCAL_B, STAT=ISTATUS)
IF(ISTATUS /= 0) THEN
WRITE(*,*)'UNABLE TO DEALLOCATE '
STOP
ENDIF
DEALLOCATE(IPIV, STAT=ISTATUS)
IF(ISTATUS /= 0) THEN
WRITE(*,*)'UNABLE TO DEALLOCATE '
STOP
ENDIF
c ===================================================
c RELEASE BLACS CONTEXT
CALL BLACS_GRIDEXIT(ictxt)
CALL BLACS_EXIT(0)
END PROGRAM SOLVE_LU