I have downloaded the following simple MPI + PGI CUDA source code from the internet.
! 08/07/2009 I.N. Kozin, STFC Daresbury Lab
! igor.kozin @ stfc.ac.uk
!
! simple direct matrix-matrix multiplication
! c(n3,n1) = a(n3,n2) x b(n2,n1)
! using MPI, PGI accelerator compiler and device allocation
!
! user selectable parameters:
! work in single or double precision
! integer, parameter :: WP
! do or don't do device allocation
! logical, parameter :: device_allocation
!
! how to compile for GPU:
! mpif90 -fast -o matrix-mult-mpi.exe matrix-mult-mpi.f90 -ta=nvidia -Minfo
!
! how to compile for CPU:
! make sure device_allocation = .FALSE.
! mpif90 -fast -o matrix-mult-mpi.exe matrix-mult-mpi.f90 -Minfo
!
! how to run:
! mpiexec -n 4 -machinefile hosts ./matrix-mult-mpi.exe n1 n2 n3 iter
module precision
! select single or double precision
!integer, parameter :: WP = KIND(1.0)
integer, parameter :: WP = KIND(1.0D0)
end module precision
program main
use accel_lib
use mpi
use precision
implicit none
!logical, parameter :: device_allocation = .FALSE.
logical, parameter :: device_allocation = .TRUE.
integer, parameter :: nn = 1000
integer :: i,j,k,n1,n2,n3,iter,n1t
real(WP), allocatable :: a(:,:), b(:,:)
real(WP), allocatable, target :: ct(:,:)
real(WP), pointer :: c(:,:)
real mflop, msize
double precision :: tstart, tend, walltime
integer :: narg,iargc
integer :: ierr, size, rank, n1size, n1start, n1end
integer :: ibuffer(4)
character*10 :: arg
integer stat(MPI_STATUS_SIZE), columntype
integer :: mytag = 1
! accel
integer(acc_device_kind) devicetype
integer devicenum, num_devices
n1=nn; n2=nn; n3=nn; iter=2
write(6,*) "step 1"
call MPI_INIT(ierr)
call MPI_COMM_SIZE( MPI_COMM_WORLD, size, ierr)
call MPI_COMM_RANK( MPI_COMM_WORLD, rank, ierr)
write(6,*) "step 2"
!===============================================
if(rank == 0) then
! input processing
narg=iargc()
! check command line input
if (narg == 0) then
write(6,*) "./matrix-mult-mpi.exe n1 n2 n3 iter"
stop
end if
if (narg > 0) then
call getarg(1,arg)
read(arg,'(i10)') n1
end if
if (narg > 1) then
call getarg(2,arg)
read(arg,'(i10)') n2
end if
if (narg > 2) then
call getarg(3,arg)
read(arg,'(i10)') n3
end if
if (narg > 3) then
call getarg(4,arg)
read(arg,'(i10)') iter
end if
endif
write(6,*) "step 3"
!===============================================
! initialise GPU
! set the accelerator device
if(device_allocation) then
! devicetype = 0 ! none
! devicetype = 2 ! host
devicetype = 3 ! nvidia
! write(6,'(a12,i10)') ' devicetype=',devicetype
! get number of devices
! num_devices = acc_get_num_devices(devicetype)
num_devices = 1
write(6,*) "step 4"
if(rank == 0) write(6,*) 'number of devices per host=', num_devices
devicenum = mod(rank,num_devices)
write(6,*) ' rank=', rank, ' devicenum=', devicenum
! set accel device
call acc_set_device_num(devicenum,devicetype)
! call acc_set_device(3)
write(6,*) "step 5"
call acc_init(devicetype)
endif
… (the remainder of the code is omitted here, as it is not relevant to the problem)
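For reference, accel_lib also defines named constants for the device type, so the literal 3 above should not be necessary. A minimal sketch of the same per-rank selection logic using the named constant (assuming the accel_lib interface shipped with PGI 10.x):

! sketch only: same device selection as above, but using the accel_lib
! named constant acc_device_nvidia instead of a hard-coded 3
devicetype = acc_device_nvidia
num_devices = acc_get_num_devices(devicetype) ! the call that fails to link for me
devicenum = mod(rank, num_devices) ! round-robin: one device per MPI rank
call acc_set_device_num(devicenum, devicetype)
call acc_init(devicetype)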
The log of compiling and linking is
PVF Build Log
Begin rebuild: clean project
Deleting intermediate and output files for project 'MPI_CUDA_Test', configuration 'Debug'
Compiling Project …
C:\Program Files\Microsoft HPC Pack 2008 SDK\Include\mpi.f90
c:\program files\pgi\win32\10.4\bin\pgfortran.exe -Hx,123,8 -Hx,123,0x40000 -Hx,0,0x40000000 -Mx,0,0x40000000 -Hx,0,0x20000000 -g -Bstatic -Mbackslash -Mmpi=msmpi -Mcuda -Mfree -I"c:\program files\pgi\win32\10.4\include" -I"C:\Program Files\PGI\Microsoft Open Tools 9\include" -I"C:\Program Files\PGI\Microsoft Open Tools 9\PlatformSDK\include" -Ktrap=fp -ta=nvidia,fastmath,wait -Minform=warn -Mprof=func,msmpi,dwarf -module "Win32\Debug" -Minfo=accel,ccff,ftn,intensity,loop,lre,opt,par,vect -o "Win32\Debug\mpi.obj" -c "C:\Program Files\Microsoft HPC Pack 2008 SDK\Include\mpi.f90"
Command exit code: 0
matrix-mult-mpi.f90
c:\program files\pgi\win32\10.4\bin\pgfortran.exe -Hx,123,8 -Hx,123,0x40000 -Hx,0,0x40000000 -Mx,0,0x40000000 -Hx,0,0x20000000 -g -Bstatic -Mbackslash -Mmpi=msmpi -Mcuda -Mfree -I"c:\program files\pgi\win32\10.4\include" -I"C:\Program Files\PGI\Microsoft Open Tools 9\include" -I"C:\Program Files\PGI\Microsoft Open Tools 9\PlatformSDK\include" -Ktrap=fp -ta=nvidia,fastmath,wait -Minform=warn -Mprof=func,msmpi,dwarf -module "Win32\Debug" -Minfo=accel,ccff,ftn,intensity,loop,lre,opt,par,vect -o "Win32\Debug\matrix-mult-mpi.obj" -c "D:\MPI_CUDA_Test\matrix-mult-mpi.f90"
Command exit code: 0
Command output:
main:
    197, Possible copy in and copy out of c in call to mmult
mmult:
    227, Generating copyin(b(:n2,:n1))
         Generating copyin(a(:n3,:n2))
         Generating copy(c(:n3,:n1))
         Generating compute capability 1.3 kernel
    229, Loop is parallelizable
    231, Loop carried dependence of 'c' prevents parallelization
         Loop carried backward dependence of 'c' prevents vectorization
    233, Loop is parallelizable
         Accelerator kernel generated
         229, !$acc do parallel, vector(16)
         231, !$acc do seq
              Cached references to size [16x16] block of 'a'
              Cached references to size [16x16] block of 'b'
         233, !$acc do parallel, vector(16)
              Using register for 'c'
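For readers: the mmult routine itself is in the part of the source I omitted above, but the -Minfo messages (source lines 227-233) imply it is essentially the classic triple loop inside a PGI accelerator region. A hypothetical reconstruction for illustration only, not the original code:

! hypothetical reconstruction of mmult, inferred from the -Minfo
! messages above; array shapes match the generated copyin/copy clauses
subroutine mmult(c, a, b, n1, n2, n3)
use precision
integer :: n1, n2, n3, i, j, k
real(WP) :: a(n3,n2), b(n2,n1), c(n3,n1)
!$acc region
do j = 1, n1 ! parallelizable (message at 229)
do k = 1, n2 ! sequential: loop-carried dependence on c (231)
do i = 1, n3 ! parallel vector loop, kernel generated (233)
c(i,j) = c(i,j) + a(i,k)*b(k,j)
end do
end do
end do
!$acc end region
end subroutine mmult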
Linking…
c:\program files\pgi\win32\10.4\bin\pgfortran.exe -Wl,/libpath:"c:\program files\pgi\win32\10.4\lib" -Wl,/libpath:"C:\Program Files\PGI\Microsoft Open Tools 9\lib" -Wl,/libpath:"C:\Program Files\PGI\Microsoft Open Tools 9\PlatformSDK\lib" -Yl,"C:\Program Files\PGI\Microsoft Open Tools 9\bin" -g -Bstatic -Mmpi=msmpi -Mcuda -ta=nvidia,fastmath,wait -Mprof=func,msmpi,dwarf -o "D:\MPI_CUDA_Test\Win32\Debug\MPI_CUDA_Test.exe" -Wl,/libpath:"C:\Program Files\Microsoft HPC Pack 2008 SDK\Lib\i386" "Win32\Debug\matrix-mult-mpi.obj" "Win32\Debug\mpi.obj"
Command exit code: 0
Command output:
    c:\program files\pgi/win32/10.4/lib\msmpi.obj : warning LNK4042: object specified more than once; extras ignored
MPI_CUDA_Test build succeeded.
The output of executing pgaccelinfo is
CUDA Driver Version 3010
Device Number: 0
Device Name: Tesla C2050
Device Revision Number: 2.0
Global Memory Size: 2817720320
Number of Multiprocessors: 14
Number of Cores: 448
Concurrent Copy and Execution: Yes
Total Constant Memory: 65536
Total Shared Memory per Block: 49152
Registers per Block: 32768
Warp Size: 32
Maximum Threads per Block: 1024
Maximum Block Dimensions: 1024, 1024, 64
Maximum Grid Dimensions: 65535 x 65535 x 1
Maximum Memory Pitch: 2147483647B
Texture Alignment 512B
Clock Rate: 1147 MHz
Current free memory 2701131776
Upload time (4MB) 16 microseconds
Download time 15 microseconds
Upload bandwidth 262144 MB/sec
Download bandwidth 279620 MB/sec
Running the executable generates the following output
$ ./MPI_CUDA_Test.exe 800 200 200 2
Usage error: call to acc_set_device with unsupported device type: 4715104
step 1
step 2
step 3
step 4
number of devices per host= 1
rank= 0 devicenum= 0
The explicit setting "devicetype = 3" does not seem to be seen by, or correctly passed to, the routine "acc_set_device_num".
What's wrong with it?
On the other hand, if I uncomment the line "num_devices = acc_get_num_devices(devicetype)", the linking output indicates an unresolved external symbol _ACC_GET_NUM_DEVICE.
Again, why is this routine not recognized?
Thank you very much.
M-H Chung