Might this be a pgfortran bug? Could anyone help me?
I ran into a problem when trying to use MPI, CUDA Fortran, and the CULA library together on multiple GPUs.
A runtime error occurs when I use the PGI compilers (version 10.6), pgfortran and mpif90, to build an MPI + CUDA Fortran code that calls some CULA routines and contains a kernel function.
After hitting the runtime error, I wrote a simple test code to isolate the problem. There are only four files:
main.f : the main Fortran file, which initializes the MPI environment and gets the CPU rank and processor name, then calls a subroutine to set up CULA.
more_mpi.f : declares some MPI-related variables in a module.
cluster.cuf : contains subroutines that set up the device in different ways (init_cuda, init_cula), a status-checking routine (check_status), and a subroutine that tests a CULA routine.
acm_dev.cuf : declares some GPU device variables in a module.
Problem 1:
I built the code up from a pure Fortran version and then extended it with CUDA Fortran. The problem comes from acm_dev.cuf. If I compile acm_dev.cuf and link its object file into the final executable (mpi_cudafor), the runtime error (36) messages below are printed when init_cula() is called from the main program (running on 4 cores). But the variables in acm_dev are never used anywhere, and neither is the module acm_dev itself: no program or subroutine has "use acm_dev".
cpuid: 3
GPU device 3 will be selected
Selecting Device FROM CULA
cpuid: 0
GPU device 0 will be selected
Selecting Device FROM CULA
runtime error ( 36 )
runtime error ( 36 )
cpuid: 1
GPU device 1 will be selected
Selecting Device FROM CULA
runtime error ( 36 )
cpuid: 2
GPU device 2 will be selected
Selecting Device FROM CULA
runtime error ( 36 )
However, if I don't link the object file (acm_dev.o) into the final executable, I get no error when the program calls CULA_SELECTDEVICE().
The only difference is whether acm_dev.o is linked into the executable or not.
Problem 2:
If I still link acm_dev.o into the executable but call init_cuda() instead of init_cula(), the error does not occur.
Conclusion from the last two situations:
cudasetdevice() works fine.
CULA_SELECTDEVICE() does not work here.
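Given that, the workaround I intend to try is to pick the device with cudasetdevice() and then call CULA_INITIALIZE() directly, skipping CULA_SELECTDEVICE(). A minimal sketch, meant to sit inside cula_module next to init_cula() below; I have not tested this combination yet, and it assumes CULA attaches to whatever device is current when CULA_SELECTDEVICE is never called:

subroutine init_cula_workaround()
! Sketch only: select the device through the CUDA runtime
! (this path works fine per Problem 2), then initialize CULA.
gpuid = cpuid
info = cudasetdevice(gpuid)
! Assumption: CULA binds to the currently selected device
! when CULA_SELECTDEVICE is not called first.
CULA_STATUS = CULA_INITIALIZE()
CALL CHECK_STATUS(CULA_STATUS)
end subroutine init_cula_workaround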
Problem 3:
If the variables in acm_dev.cuf are declared inside a subroutine instead of at module level as before, the error does not occur (see the sketch below). It really confuses me.
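For reference, this is the subroutine-local variant I mean: a sketch with the same declarations as acm_dev.cuf (listed at the end of this post), just moved out of module scope:

subroutine acm_dev_local()
use cudafor
! Same device declarations as module acm_dev, but local to a
! subroutine; linking this variant does not trigger error (36).
integer, parameter :: b4 = selected_real_kind(4)
complex(b4), device, allocatable :: c_dev(:,:), b_dev(:,:)
complex(b4), device, allocatable :: eps_dev(:), cnray_dev(:)
integer, device, allocatable :: gene_dev(:,:)
end subroutine acm_dev_local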
main.f
program main
use more_mpi
use cula_module
call mpi_init(ierr)
call mpi_comm_rank(mpi_comm_world,cpuid,ierr)
call mpi_comm_size(mpi_comm_world,numprocs,ierr)
call mpi_get_processor_name(processor_name,namelen,ierr)
! call init_cuda()
call init_cula()
! call cluster()
call mpi_finalize(ierr)
end program
more_mpi.f
module more_mpi
include 'mpif.h'
integer :: ierr,cpuid,numprocs,namelen !mpi
character(len=100) processor_name
end module
cluster.cuf
module cula_module
use cudafor
use more_mpi
! ***** some CULA related variables *****
INTEGER CULA_STATUS
INTEGER DEVICE_ID
CHARACTER(len=100) BUF
INTEGER BUF_SIZE
PARAMETER (BUF_SIZE=100)
! if use DEVICE_INFO_BUF_SIZE = 100000: segmentation fault
EXTERNAL CULA_SELECTDEVICE
EXTERNAL CULA_INITIALIZE
EXTERNAL cula_device_cgesvd
EXTERNAL CULA_SHUTDOWN
EXTERNAL CULA_GETDEVICEINFO
EXTERNAL CULA_GETEXECUTINGDEVICE
INTEGER CULA_SELECTDEVICE
INTEGER CULA_INITIALIZE
INTEGER cula_device_cgesvd
INTEGER CULA_GETDEVICEINFO
INTEGER CULA_GETEXECUTINGDEVICE
INTEGER CULA_CGESV !cula
integer :: gpuid,numdevices !gpu
integer :: info
type(cudadeviceprop) :: prop
contains
subroutine init_cuda()
info=cudaGetDeviceCount(numdevices)
gpuid=mod(cpuid,numdevices)
! gpuid=1
write(*,*) 'cpuid: ', cpuid
write(*,*) 'GPU device ', gpuid, ' will be selected'
info=cudasetdevice(gpuid)
info=cudagetdeviceProperties(prop,gpuid)
write(*,"(a9,i2,a12)") "There are",numdevices,"GPU device!"
write (*,"(a21,i2,a4,i1,a4,a30)") "Hello world! process ",cpuid," of ",numprocs," on ",processor_name
write (*,"(a6,i2)") "GPU id",gpuid
write (*,"(a12,a20)") "Device name ",prop%name
end subroutine init_cuda
subroutine init_cula()
gpuid=cpuid
! gpuid=1
write(*,*) 'cpuid: ', cpuid
write(*,*) 'GPU device ', gpuid, ' will be selected'
WRITE(*,*) 'Selecting Device FROM CULA'
CULA_STATUS = CULA_SELECTDEVICE(cpuid)
CALL CHECK_STATUS(CULA_STATUS)
WRITE(*,*) 'Initializing CULA'
CULA_STATUS = CULA_INITIALIZE()
CALL CHECK_STATUS(CULA_STATUS)
! info=cudasetdevice(gpuid)
WRITE(*,*) 'Getting Device ID FROM CULA'
CULA_STATUS = CULA_GETEXECUTINGDEVICE(DEVICE_ID)
CALL CHECK_STATUS(CULA_STATUS)
WRITE(*,*) "Device ID: ",DEVICE_ID
WRITE(*,*) 'Getting Device Info FROM CULA'
CULA_STATUS = CULA_GETDEVICEINFO(DEVICE_ID, BUF, BUF_SIZE)
CALL CHECK_STATUS(CULA_STATUS)
WRITE(*,*) "BUF: ",BUF
end subroutine init_cula
subroutine cluster()
complex :: u(3,3),vt(4,4),a(3,4)
real :: s(3)
real :: start,finish
complex,allocatable,device :: ad(:,:)
integer :: pitch_ad
complex,device :: ud(3,3),vtd(4,4)
real,device :: sd(3)
info=cudaGetDeviceCount(numdevices)
gpuid=mod(cpuid,numdevices)
! gpuid=1
! write(*,*) 'cpuid: ', cpuid
! write(*,*) 'gpuid: ', gpuid
! info=cudasetdevice(gpuid)
! info=cudagetdeviceProperties(prop,gpuid)
! write(*,"(a9,i2,a12)") "There are",numdevices,"GPU device!"
! write (*,"(a21,i2,a4,i1,a4,a30)"), "Hello world! process ",cpuid," of ",numprocs," on ",processor_name
! write (*,"(a6,i2)") "GPU id",gpuid
! write (*,"(a12,a20)") "Device name ",prop%name
m=3
n=4
lda=3
ldu=3
ldvt=4
a=reshape((/(5.91,-5.69),(-3.15,-4.08),(-4.89,4.20),(7.09,2.72),(-1.89,3.27),(4.10,-6.70),(7.78,-4.06),(4.57,-2.07),(3.28,-3.84),(-0.79,-7.21),(-3.88,-3.30),(3.84,1.19)/),(/3,4/))
info=cudamallocpitch(ad,pitch_ad,n,m)
info=cudamemcpy2d(ad,pitch_ad,a,n*4,n*4,m,cudamemcpyhosttodevice)
info = cula_selectdevice(cpuid)
call check_status(info)
!Initialize CULA
info=cula_initialize()
call check_status(info)
call cpu_time(start)
info=cula_device_cgesvd('a','a', M, N, ad, LDA, sd,ud, LDU,vtd, LDVT)
call check_status(info)
call cpu_time(finish)
info=cudamemcpy(s,sd,3,cudamemcpydevicetohost)
write(*,*) s
write(*,*) "GPU time=",finish-start,"s"
call cula_shutdown()
info=cudafree(ad)
info=cudafree(sd)
end subroutine cluster
subroutine check_status(culastatus)
integer culastatus
integer info
integer cula_geterrorinfo
info = cula_geterrorinfo()
if (culastatus .ne. 0) then
if (culastatus .eq. 7) then
!culaargumenterror
write(*,*) 'invalid value for parameter ', info
else if (culastatus .eq. 8) then
!culadataerror
write(*,*) 'data error (', info ,')'
else if (culastatus .eq. 9) then
!culablaserror
write(*,*) 'blas error (', info ,')'
else if (culastatus .eq. 10) then
!cularuntimeerror
write(*,*) 'runtime error (', info ,')'
else
!others
call cula_getstatusstring(culastatus)
endif
stop 1
end if
end subroutine check_status
end module cula_module
acm_dev.cuf
module acm_dev
use cudafor
integer, parameter:: b4 = selected_real_kind(4)
complex(b4), device, allocatable :: c_dev(:,:),b_dev(:,:)
complex(b4), device, allocatable :: eps_dev(:),cnray_dev(:)
complex(b4), device, allocatable :: epsm1_dev, cn_dev
complex(b4), device, allocatable :: base_dev(:,:) ! constant
complex(b4), device, allocatable :: material_dev(:) ! constant
complex(b4), device, allocatable :: ei_dev(:) ! constant
integer, device, allocatable :: gene_dev(:,:)
! integer, device, allocatable :: vector_dev(:) ! should be a shared memory declared in device subprogram
integer, device, allocatable :: nbox_dev ! might not needed
end module acm_dev
makefile:
.SUFFIXES: .cuf .o
L1= main.o cluster.o more_mpi.o acm_dev.o
PGFOR=pgfortran
PF90= mpif90
LINK1= /opt/pgi/linux86-64/11.5/lib/libcudafor.a
#Change to -Mmpi2 for MPICH2
#MPI=-Mmpi
#add cuf
#CUDA=-ta=nvidia -Mcuda
CUDA=
#lib
CULALIB=-L${CULA_LIB_PATH_64} -lcula -lcula_pgfortran -llapack -lblas
#include
CULAINC= -I${CULA_INC_PATH}
#free format
PGFLAGS = -Mfree -O3
#MPICH include
MPICHINCLUDES=-I/opt/pgi/linux86-64/10.6/mpi/mpich/include/
#MPICH lib
MPICHLIBPATH64=-L/opt/pgi/linux86-64/10.6/mpi/mpich/lib/
mpi_cudafor: $(L1)
	$(PF90) $(PGFLAGS) $(L1) $(CULAINC) $(CULALIB) $(LINK1) -o mpi_cudafor
.f.o:
	$(PF90) $(PGFLAGS) -c $(CULAINC) $(CULALIB) $<
.cuf.o:
	$(PGFOR) $(PGFLAGS) $(CUDA) $(CULAINC) $(CULALIB) -c $<
main.o: main.f cluster.o more_mpi.o
cluster.o: cluster.cuf more_mpi.o
more_mpi.o: more_mpi.f
acm_dev.o: acm_dev.cuf
clean:
	rm -f *.o *.mod mpi_cudafor
del:
	rm -f *edu
which mpif90 pgf90 pgfortran
/opt/lib/openmpi/1.4.2/pgi/10.6/bin/mpif90
/opt/pgi/linux86-64/10.6/bin/pgf90
/opt/pgi/linux86-64/10.6/bin/pgfortran
Run the job:
mpiexec -np 4 ./mpi_cudafor