Hi!
I’m using workstation 10.3. I also use mpich1 & cula lib. I found there may be some problem if use them together.
cudasetdevice conflicts with cula_initialize, if they used together, there will be a runtime error.
What’s more, you provide two runtime API, cudasetdevice and cudagetdevice, both of them said they can assign device number with this host thread.
so the question is
1: how to solve conflict?
2: how to assign multi-device to host? I want to run it on my GPU cluster and waht’s the difference between cudagetdevice and cudasetdevice.
Thanks!
This is my source code. You can test it.
program cluster
use cudafor
include 'mpif.h'
external cula_initialize
external cula_cgesvd
external cula_shutdown
integer cula_initialize,cula_device_cgesvd !cula
integer :: ierr,cpuid,numprocs,namelen !mpi
character* (mpi_max_processor_name) processor_name
integer :: gpuid,numdevices !gpu
integer :: info
type(cudadeviceprop) :: prop
complex :: u(3,3),vt(4,4),a(3,4)
real :: s(3)
real :: start,finish
complex,allocatable,device :: ad(:,:)
integer :: pitch_ad
complex,device :: ud(3,3),vtd(4,4)
real,device :: sd(3)
info=cudaGetDeviceCount(numdevices)
m=3
n=4
lda=3
ldu=3
ldvt=4
a=reshape((/(5.91,-5.69),(-3.15,-4.08),(-4.89,4.20),(7.09,2.72),(-1.89,3.27),(4.10,-6.70),(7.78,-4.06),(4.57,-2.07),(3.28,-3.84),(-0.79,-7.21),(-3.88,-3.30),(3.84,1.19)/),(/3,4/))
info=cudamallocpitch(ad,pitch_ad,n,m)
info=cudamemcpy2d(ad,pitch_ad,a,n*4,n*4,m,cudamemcpyhosttodevice)
call mpi_init(ierr)
call mpi_comm_rank(mpi_comm_world,cpuid,ierr)
call mpi_comm_size(mpi_comm_world,numprocs,ierr)
call mpi_get_processor_name(processor_name,namelen,ierr)
gpuid=mod(cpuid,numdevices)
info=cudasetdevice(gpuid)
info=cudagetdeviceProperties(prop,gpuid)
write(*,"(a9,i2,a12)") "There are",numdevices,"GPU device!"
write (*,"(a21,i2,a4,i1,a4,a30)"), "Hello world! process ",cpuid," of ",numprocs," on ",processor_name
write (*,"(a6,i2)") "GPU id",gpuid
write (*,"(a12,a20)") "Device name ",prop%name
!Initialize CULA
info=cula_initialize()
call check_status(info)
call cpu_time(start)
info=cula_device_cgesvd('a','a', M, N, ad, LDA, sd,ud, LDU,vtd, LDVT)
!call check_status(info)
call cpu_time(finish)
info=cudamemcpy(s,sd,3,cudamemcpydevicetohost)
write(*,*) s
write(*,*) "GPU time=",finish-start,"s"
call cula_shutdown()
info=cudafree(ad)
info=cudafree(sd)
call mpi_finalize(ierr)
end
subroutine check_status(culastatus)
integer culastatus
integer info
integer cula_geterrorinfo
info = cula_geterrorinfo()
if (culastatus .ne. 0) then
if (culastatus .eq. 7) then
!culaargumenterror
write(*,*) 'invalid value for parameter ', info
else if (culastatus .eq. 8) then
!culadataerror
write(*,*) 'data error (', info ,')'
else if (culastatus .eq. 9) then
!culablaserror
write(*,*) 'blas error (', info ,')'
else if (culastatus .eq. 10) then
!cularuntimeerror
write(*,*) 'runtime error (', info ,')'
else
!others
call cula_getstatusstring(culastatus)
endif
stop 1
end if
end subroutine check_status
[/code]