How compile the kernel subroutine containing dgetrf

First,my cart is K20 ,and cuda/5.0 cula/17 PGI/13.9 ;supporting for dynamic parallel function. About cudafortran, I have some problem about the ‘cula device dgetrf’ and ‘cula device dgetri’ being invoked in attributes(global) subroutine !
The fowllowing is a simple code about my quension, but I think there should be some issue within that. Can you show me a right case about my quension, and the corret compiler command ?
Thank you very much !
(now I have run the devicedgemm within cublas_device in the attributes(global) kernel subroutine sucsessful

module dgemm
use cula_status
 use cula_lapack_device_pgfortran
CONTAINS
 attributes(global) subroutine dgemm16(a, m, ipiv, node)

 integer, value :: m, node
 double precision, device :: a(m,m), ipiv(node)

 integer status
 i = threadIdx%x
 if (i.eq.1) then
 status=cula_initialize()

 status = cula_device_dgetrf(m,m,a,m,ipiv)
 status =cula_device_dgetri(m,a,m,ipiv)

 end if
 return
 end subroutine

END MODULE

program main
 use cudafor
 use faster_dgemm
 integer, parameter :: N = 512,node=177310
 integer, parameter :: NREPS = 100
 ! matrix data
 real(8), dimension(N,N) :: B, C
 real(8), dimension(Node) :: ipiv
 real(8), dimension(N,node) :: A

 real(8), allocatable, device, dimension(:,:) :: dA
 real(8), allocatable, device, dimension(:) :: dipiv
 real(8), allocatable, device, dimension(:,:) :: dB, dC
 real(8) gold, RR(N), RQ(N)
 type(cudaEvent) :: start, stop
 type(dim3) :: blocks
 type(dim3) :: threads

 istat = cudaEventCreate(start)
 istat = cudaEventCreate(stop)

 j = 1
 bv = -127.0d0
 do i = 1, N/2
 B(i,j) = 2.0d0 ** bv
 bv = bv + 1.0d0
 B(N-i+1,j) = -B(i,j)
 end do

 call random_number(rr)
 A(:,1) = rr

 do j = 2, Node
 RQ = B(:,1)
 call random_number(rr)
 nn = N - 1
 do i = 1, N
 ival = int(rr(j) * nn + 1.0d0)
 B(i,j) = rq(ival)
 do k = ival, nn
 rq(k) = rq(k+1)
 end do
 nn = nn - 1
 A(i,j) = A(i,1)
 end do
 end do

 allocate(dA(N,NODE),dipiv(node))
 allocate(dB(N,N))
 allocate(dC(N,N))

 dA = A
 dB = B
 dipiv=ipiv

 m = N
 k = N

 ! timing experiment
 call dgemm16<<<1, 1>>>(dA, M, dipiv, node)
 time = 0.d0
 istat = cudaEventRecord(start, 0)
 do j = 1, NREPS
 call dgemm16<<<1, 1>>>(dA, m, dipiv,node)
 end do
 istat = cudaEventRecord(stop, 0)
 istat = cudaDeviceSynchronize()
 istat = cudaEventElapsedTime(time, start, stop)
 time = time / (NREPS*1.0d3)

 a = da

 nerrors = 0
 rmaxerr = 0.0d0
 rsumerr = 0.0d0
 do j = 1, Node
 do i = 1, N
 if (a(i,j) .ne. 0.0d0) then
 if (abs(a(i,j)) .gt. rmaxerr) rmaxerr = abs(a(i,j))
 nerrors = nerrors + 1
 rsumerr = rsumerr + abs(a(i,j))
 end if
 end do
 end do

 if (nerrors .eq. 0) then
 print *,"Test passed!"
 else
 print *,nerrors," errors were encountered"
 print *,"Max error was ",rmaxerr
 print *,"Ave error was ",rsumerr / (N * Node)
 endif

 gflops = 2.0 * N * N * N/time/1d9
 write (*,901) m,node,time*1.0d3,gflops
 print *,"### a(1,1)=",a(1,1)

901 format(i0,'x',i0,' * ',i0,'x',i0,':\t',f8.3,' ms\t',f8.3,' GFlops/s')
end program

!!!!!!!!!!!!!!!!!!!!!!!!!code end!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

Hi tsariner1986,

Dynamic Parallelism allows you to launch other CUDA global kernels (i.e. the calls with the chevron (<<< >>>) syntax). However, the “cula_device_” routines are host routines not global kernels. They do accept device data, but are only callable from the host.

I’d suggest you contact EM Photonics and see if they have any plans to add device callable routines.

  • Mat

Thank you for your reply!
The fowllowing is the code about the subroutine of attributes(device) cublas_dgemm . The code‘s compiling and executing is very successful. Now I want to use dgetrf & dgetri (in cula) replace dgemm only . can you show me that is possible?


!
!      Copyright 2013, STMicroelectronics, Incorporated.
!      All rights reserved.
!
!          This source code is intended only as a supplement to
!   STMicroelectronics Development Tools and/or on-line documentation.
!

MODULE faster_dgemm

CONTAINS
  attributes(global) subroutine dgemm16(a, b, c, m, n, k)
    use cublas_device
    integer, value :: m, n, k
    double precision, device :: a(m,*), b(k,*), c(m,*)
    double precision, device :: alpha, beta
    type(cublasHandle) :: ch1
    integer transa, transb
    i = threadIdx%x
    if (i.eq.1) then
        istat = cublasCreate_v2(ch1)
        alpha = 1.0d0
        beta  = 0.0d0
        transa = cublas_op_n
        transb = cublas_op_n
        [b]istat = cublasDgemm_v2(ch1, transa, transb, m, n, k, alpha, &
                                   a, m, b, k, beta, c, m)[/b]
        istat = cublasDestroy_v2(ch1)
    end if
    return
    end subroutine

END MODULE

program main
  use cudafor
  use faster_dgemm
  integer, parameter :: N = 512
  integer, parameter :: NREPS = 100
  ! matrix data
  real(8), dimension(N,N) :: A, B, C
  real(8), allocatable, device, dimension(:,:) :: dA, dB, dC
  real(8) gold, RR(N), RQ(N)
  type(cudaEvent) :: start, stop
  type(dim3) :: blocks
  type(dim3) :: threads

  istat = cudaEventCreate(start)
  istat = cudaEventCreate(stop)

  j = 1
  bv = -127.0d0
  do i = 1, N/2
    B(i,j) = 2.0d0 ** bv
    bv = bv + 1.0d0
    B(N-i+1,j) = -B(i,j)
  end do

  call random_number(rr)
  A(:,1) = rr

  do j = 2, N
    RQ = B(:,1)
    call random_number(rr)
    nn = N - 1
    do i = 1, N
      ival = int(rr(j) * nn + 1.0d0)
      B(i,j) = rq(ival)
      do k = ival, nn
        rq(k) = rq(k+1)
      end do
      nn = nn - 1
      A(i,j) = A(i,1)
    end do
  end do

  allocate(dA(N,N))
  allocate(dB(N,N))
  allocate(dC(N,N))

  dA = A
  dB = B

  m = N
  k = N

  ! timing experiment
  call dgemm16<<<1, 1>>>(dA, dB, dC, m, N, k)
  time = 0.d0
  istat = cudaEventRecord(start, 0)
  do j = 1, NREPS
     call dgemm16<<<1, 1>>>(dA, dB, dC, m, N, k)
  end do
  istat = cudaEventRecord(stop, 0)
  istat = cudaDeviceSynchronize()
  istat = cudaEventElapsedTime(time, start, stop)
  time = time / (NREPS*1.0d3)

  C = dC

  nerrors = 0
  rmaxerr = 0.0d0
  rsumerr = 0.0d0
  do j = 1, N
    do i = 1, N
      if (C(i,j) .ne. 0.0d0) then
        if (abs(C(i,j)) .gt. rmaxerr) rmaxerr = abs(C(i,j))
        nerrors = nerrors + 1
        rsumerr = rsumerr + abs(C(i,j))
      end if
    end do
  end do

  if (nerrors .eq. 0) then
    print *,"Test passed!"
  else
    print *,nerrors," errors were encountered"
    print *,"Max error was ",rmaxerr
    print *,"Ave error was ",rsumerr / (N * N)
  endif

  gflops = 2.0 * N * N * N/time/1d9
  write (*,901) m,k,k,N,time*1.0d3,gflops
  print *,"### C(1,1)=",C(1,1)
!
901 format(i0,'x',i0,' * ',i0,'x',i0,':\t',f8.3,' ms\t',f8.3,' GFlops/s')
end program

Hi tsariner1986,

cuBLAS added special version (along with a Fortran generic interface) which is callable from device code. You will need to inquire with EMPhotonics if they are planning on adding device callable version of the dgetrf & dgetri routines.

  • Mat

Hi Mat,
Thanks again!
Now,I think that your suggestion is the best method!