First, my card is a K20, and I am using cuda/5.0, cula/17 and PGI/13.9, which support dynamic parallelism. In CUDA Fortran, I have a problem with 'cula_device_dgetrf' and 'cula_device_dgetri' being invoked inside an attributes(global) subroutine!
The following is a simple code illustrating my question, but I think there is something wrong with it. Can you show me a correct example for my question, and the correct compiler command?
Thank you very much !
(So far I have successfully run the device dgemm from cublas_device inside an attributes(global) kernel subroutine.)
module dgemm
use cula_status
use cula_lapack_device_pgfortran
CONTAINS
attributes(global) subroutine dgemm16(a, m, ipiv, node)
integer, value :: m, node
double precision, device :: a(m,m), ipiv(node)
integer status
i = threadIdx%x
if (i.eq.1) then
status=cula_initialize()
status = cula_device_dgetrf(m,m,a,m,ipiv)
status =cula_device_dgetri(m,a,m,ipiv)
end if
return
end subroutine
END MODULE
program main
use cudafor
use faster_dgemm
integer, parameter :: N = 512,node=177310
integer, parameter :: NREPS = 100
! matrix data
real(8), dimension(N,N) :: B, C
real(8), dimension(Node) :: ipiv
real(8), dimension(N,node) :: A
real(8), allocatable, device, dimension(:,:) :: dA
real(8), allocatable, device, dimension(:) :: dipiv
real(8), allocatable, device, dimension(:,:) :: dB, dC
real(8) gold, RR(N), RQ(N)
type(cudaEvent) :: start, stop
type(dim3) :: blocks
type(dim3) :: threads
istat = cudaEventCreate(start)
istat = cudaEventCreate(stop)
j = 1
bv = -127.0d0
do i = 1, N/2
B(i,j) = 2.0d0 ** bv
bv = bv + 1.0d0
B(N-i+1,j) = -B(i,j)
end do
call random_number(rr)
A(:,1) = rr
do j = 2, Node
RQ = B(:,1)
call random_number(rr)
nn = N - 1
do i = 1, N
ival = int(rr(j) * nn + 1.0d0)
B(i,j) = rq(ival)
do k = ival, nn
rq(k) = rq(k+1)
end do
nn = nn - 1
A(i,j) = A(i,1)
end do
end do
allocate(dA(N,NODE),dipiv(node))
allocate(dB(N,N))
allocate(dC(N,N))
dA = A
dB = B
dipiv=ipiv
m = N
k = N
! timing experiment
call dgemm16<<<1, 1>>>(dA, M, dipiv, node)
time = 0.d0
istat = cudaEventRecord(start, 0)
do j = 1, NREPS
call dgemm16<<<1, 1>>>(dA, m, dipiv,node)
end do
istat = cudaEventRecord(stop, 0)
istat = cudaDeviceSynchronize()
istat = cudaEventElapsedTime(time, start, stop)
time = time / (NREPS*1.0d3)
a = da
nerrors = 0
rmaxerr = 0.0d0
rsumerr = 0.0d0
do j = 1, Node
do i = 1, N
if (a(i,j) .ne. 0.0d0) then
if (abs(a(i,j)) .gt. rmaxerr) rmaxerr = abs(a(i,j))
nerrors = nerrors + 1
rsumerr = rsumerr + abs(a(i,j))
end if
end do
end do
if (nerrors .eq. 0) then
print *,"Test passed!"
else
print *,nerrors," errors were encountered"
print *,"Max error was ",rmaxerr
print *,"Ave error was ",rsumerr / (N * Node)
endif
gflops = 2.0 * N * N * N/time/1d9
write (*,901) m,node,time*1.0d3,gflops
print *,"### a(1,1)=",a(1,1)
901 format(i0,'x',i0,' * ',i0,'x',i0,':\t',f8.3,' ms\t',f8.3,' GFlops/s')
end program
!!!!!!!!!!!!!!!!!!!!!!!!!code end!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!