!> Inverts a random single-precision matrix twice -- once on the GPU through
!> the CULA device interface (SGETRF followed by SGETRI) and once on the host
!> through LAPACK (SGETRF followed by SGETRI) -- and prints the elapsed time
!> of each path, measured with CUDA events.
program cula_test
  use CULA_LAPACK_DEVICE_PGFORTRAN
  use cula_status
  use cula_type
  use cudafor
  implicit none

  integer, parameter :: m = 16
  integer, parameter :: n = 16

  integer :: status                 ! CULA return code
  integer :: istat                  ! CUDA runtime return code
  integer :: info                   ! LAPACK return code
  integer :: lwork                  ! length of the SGETRI workspace
  integer :: i
  real    :: time                   ! elapsed time in milliseconds
  real    :: work_query(1)          ! receives the workspace size from the query call

  real,    allocatable, dimension(:,:), device :: a_device
  integer, allocatable, dimension(:),   device :: ipiv_device
  real,    allocatable, dimension(:,:) :: a          ! input matrix, inverted in place on the host
  real,    allocatable, dimension(:,:) :: a_inv_gpu  ! inverse computed on the GPU
  real,    allocatable, dimension(:,:) :: c          ! a * a_inv_gpu, used as a sanity check
  real,    allocatable, dimension(:)   :: work
  integer, allocatable, dimension(:)   :: ipiv       ! pivot indices from the LU factorisation
  type(cudaevent) :: startevent, stopevent

  ! SGETRI inverts the square matrix factorised by SGETRF, so the test only
  ! makes sense for m == n.
  if (m /= n) then
    write(*,*) 'matrix must be square to be inverted: m =', m, ' n =', n
    stop 1
  end if

  istat = cudaeventcreate(startevent)
  istat = cudaeventcreate(stopevent)

  allocate(a(m,n))
  allocate(a_inv_gpu(m,n))
  allocate(c(m,n))
  allocate(ipiv(min(m,n)))

  call random_number(a)

  ! ------------------------------------------------------------------
  ! GPU path.  The timed region starts before the device allocation, so
  ! it covers allocation, both transfers and CULA initialisation as well
  ! as the factorisation and inversion.
  ! ------------------------------------------------------------------
  istat = cudaeventrecord(startevent, 0)

  write(*,*) 'ALLOCATE DEVICE ARRAYS'
  allocate(a_device(m,n))
  allocate(ipiv_device(min(m,n)))

  write(*,*) 'WRITING ARRAYS TO DEVICE'
  ! Only the matrix is uploaded: the pivot array is written by SGETRF, so
  ! there is nothing meaningful to copy for it.
  a_device = a

  write(*,*) 'INITIALIZING CULA'
  status = cula_initialize()
  ! Progress markers left over from debugging the initialisation.
  print *,'bb'
  call cula_check_status(status)
  print *,status
  print *,'aa'

  write(*,*) 'CALLING CULA_DEVICE_SGETRF AND CULA_DEVICE_SGETRI'
  ! The leading dimension of a_device is its first extent, m.
  status = cula_device_sgetrf(m, n, a_device, m, ipiv_device)
  call cula_check_status(status)
  status = cula_device_sgetri(m, a_device, m, ipiv_device)
  call cula_check_status(status)

  write(*,*) 'RETRIEVING DEVICE MEMORY TO HOST'
  a_inv_gpu = a_device
  ipiv = ipiv_device

  ! Close the timed region before reporting it.
  istat = cudaeventrecord(stopevent, 0)
  istat = cudaeventsynchronize(stopevent)
  istat = cudaeventelapsedtime(time, startevent, stopevent)

  deallocate(a_device)
  deallocate(ipiv_device)
  call cula_shutdown()
  write(*,*)'time for gpu execution(ms):',time

  ! Sanity check of the GPU result: a * inverse(a) should be close to the
  ! identity.  This must run before the host path overwrites a.
  c = matmul(a, a_inv_gpu)
  do i = 1, m
    c(i,i) = c(i,i) - 1.0
  end do
  write(*,*) 'max |A*inv(A) - I| (gpu):', maxval(abs(c))

  ! ------------------------------------------------------------------
  ! Host path (LAPACK).
  ! ------------------------------------------------------------------
  ! Workspace query (lwork = -1): SGETRI returns its preferred workspace
  ! size in work_query(1) without touching the matrix.  Fall back to the
  ! documented minimum, m, if the query reports an error.
  call sgetri(m, a, m, ipiv, work_query, -1, info)
  lwork = m
  if (info == 0) lwork = max(m, int(work_query(1)))
  allocate(work(lwork))

  istat = cudaeventrecord(startevent, 0)
  call sgetrf(m, n, a, m, ipiv, info)
  if (info == 0) then
    call sgetri(m, a, m, ipiv, work, lwork, info)
    if (info /= 0) write(*,*) 'sgetri failed, info =', info
  else
    ! Do not attempt the inversion on a failed factorisation.
    write(*,*) 'sgetrf failed, info =', info
  end if
  istat = cudaeventrecord(stopevent, 0)
  istat = cudaeventsynchronize(stopevent)
  istat = cudaeventelapsedtime(time, startevent, stopevent)
  write(*,*)'time for cpu execution(ms):',time

  ! Release the CUDA events and host storage.
  istat = cudaeventdestroy(startevent)
  istat = cudaeventdestroy(stopevent)
  deallocate(a, a_inv_gpu, c, work, ipiv)
end program cula_test
I have encountered the following two problems:
1. When I compile this program with PGI 2014, it fails to initialize CULA at run time. However, it succeeds when I compile the program with PGI 2015.
2. The Intel MKL routine fails to launch when my array size is larger, while nothing goes wrong with a small array size. For example, with the array size set to 2 or 20 it passes, but with 200 it fails.
I hope someone can answer my questions.