New cuBLAS function "getrfBatched"

Hello,

I am testing the new CUBLAS ‘cublasZgetrfBatched’ function using CUDA Fortran.
I have a question about how to build an array of pointers with the device attribute.

This function performs the LU factorization of each matrix Aarray[i] for i = 0, …, batchSize-1.
Aarray is an array of pointers to matrices stored in column-major format with dimensions n x n, per the spec below. So Aarray is an array of pointers (pointer-to-pointer), as in C.

cublasStatus_t cublasZgetrfBatched(cublasHandle_t handle,
                                   int n, 
                                   cuDoubleComplex *Aarray[],
                                   int lda, 
                                   int *PivotArray,
                                   int *infoArray,
                                   int batchSize);

I wrote the following simple test program as a reference.
I don't know how to set up an array (Aarray_dev) of pointers with the device attribute.
Could you please tell me how? If you could show me correct code, I would much appreciate it.
(GPU card is K20 / CUDA 5.5 / PGI 13.10 / CentOS 6.4)

module Zgetr_cublas

  use cudafor

  interface
   function cublas_zgetrf(h, n, Aarray, lda, ipvt, info, ibatch) bind(c,name='cublasZgetrfBatched')
   use iso_c_binding
   use cublas
    type(cublasHandle) :: h
    integer(c_int), value :: n, lda, ibatch
    complex(c_double_complex), device, pointer, dimension(:) :: Aarray(:)
    !complex(c_double_complex), device, pointer :: Aarray
    integer(c_int), device :: ipvt(:)
    integer(c_int), device :: info(:)
   end function cublas_zgetrf
  end interface

end module Zgetr_cublas

program test
    use cudafor
    use cublas
    use Zgetr_cublas
    use, intrinsic :: iso_c_binding
    implicit none

!.. Parameters ..
    integer n
    parameter (n=4)
    integer lda, ldb
    integer row, col, matrixsize
    parameter (lda=n,ldb=n)
    parameter (row=n,col=n)

    integer ndmax   ! = ibatch
    parameter (ndmax=1)

!.. Local ..
    integer :: i, j, k, istat

!.. Input Matrix for Host ..
    complex(8), dimension(:,:,:), allocatable :: a
!.. for device
    complex(8), device, pointer, dimension(:) :: Aarray_dev
    !complex(8), device, pointer :: Aarray_dev                       ! test for ibatch=1
    complex(8), device, target,  dimension(:,:,:), allocatable :: a_d
!
!.. handle for CUBLAS function
    type(cublasHandle) :: h1
    integer :: ibatch
    parameter (ibatch=ndmax)
    integer  :: ipvt(n*ibatch)
    integer  :: info(ibatch)
    integer, device  :: ipvt_d(n*ibatch)
    integer, device  :: info_d(ibatch)

! Matrixsize
  matrixsize=col*row

  print *, "size of variable a",sizeof(a(1,1,1)), "bytes"
  print *, "size of matrix (row x col)", matrixsize

! allocate both a and a_d on device
  allocate (a(row,col,ndmax))
  allocate (a_d(row,col,ndmax))

! generates input matrix

do k=1, ndmax
  a(1,1,k)=(-1.34d0, 2.55d0)
  a(1,2,k)=( 0.28d0, 3.17d0)
  a(1,3,k)=(-6.39d0,-2.20d0)
  a(1,4,k)=( 0.72d0,-0.92d0)
  a(2,1,k)=(-0.17d0,-1.41d0)
  a(2,2,k)=( 3.31d0,-0.15d0)
  a(2,3,k)=(-0.15d0, 1.34d0)
  a(2,4,k)=( 1.29d0, 1.38d0)
  a(3,1,k)=(-3.29d0,-2.39d0)
  a(3,2,k)=(-1.91d0, 4.42d0)
  a(3,3,k)=(-0.14d0,-1.35d0)
  a(3,4,k)=( 1.72d0, 1.35d0)
  a(4,1,k)=( 2.41d0, 0.39d0)
  a(4,2,k)=(-0.56d0, 1.47d0)
  a(4,3,k)=(-0.83d0,-0.69d0)
  a(4,4,k)=(-1.96d0, 0.67d0)
end do

 !a(1,1,2)=(-2.00d0, 2.55d0)
 !a(1,1,3)=(-3.00d0, 2.55d0)

   print *,'Input'
   do k = 1, ndmax
     print *, "Batch no. =", k
     print '(1X,8D12.3)', ((a(i,j,k),j=1,col),i=1,row)
   end do

! copy host to device

   istat = cudaMemset(a_d, (0.0d0,0.0d0), matrixsize*ndmax)
   !a_d = (0.0d0,0.0d0)

  a_d = a

! build an array of pointers
    Aarray_dev => a_d(1,1,:)   ! runtime error due to device array section argument is not sequential
    !Aarray_dev => a_d(1,1,1)   ! test for ibatch=1

    ipvt_d = ipvt
    info_d = info

! call cuBlas
    istat = cublasCreate(h1)
    !h1 = cublasGetHandle()
    istat= cublas_zgetrf(h1, n, Aarray_dev , lda, ipvt_d, info_d, ibatch)
    print *, 'error code = ' ,istat, cudaGetErrorString( cudaGetLastError() )
    istat = cublasDestroy(h1)


! initialize a() to verify
  a   = (0.0d0, 0.0d0)

! copy from Device to Host

  a = a_d

   print *, 'result'
!  Print details of factorization

   do k = 1, ndmax
     print *, "Batch no. =", k
     print '(1X,8D12.3)', ((a(i,j,k),j=1,col),i=1,row)
   end do

! Print pivot indices

  !print *, 'Pivot indices'
  !print *, (ipiv(i),i=1,n)


!  GPU device : reset

   istat = cudaDeviceReset()
   print *, 'error code = ', cudaGetErrorString( cudaGetLastError() )

end program

The compilation is successful, but a runtime error occurs as follows.

$ pgf90 -Mcuda:5.5 f.f90 -lcublas

$ a.out
 size of variable a           16 bytes
 size of matrix           16
 Input
 Batch no. =            1
   -0.134D+01   0.255D+01   0.280D+00   0.317D+01  -0.639D+01  -0.220D+01   0.720D+00  -0.920D+00
   -0.170D+00  -0.141D+01   0.331D+01  -0.150D+00  -0.150D+00   0.134D+01   0.129D+01   0.138D+01
   -0.329D+01  -0.239D+01  -0.191D+01   0.442D+01  -0.140D+00  -0.135D+01   0.172D+01   0.135D+01
    0.241D+01   0.390D+00  -0.560D+00   0.147D+01  -0.830D+00  -0.690D+00  -0.196D+01   0.670D+00
0: f.f90, line 105: device array section argument is not sequential

Line 105 is where the pointer assignment is made, as shown below.
! build an array of pointers
Aarray_dev => a_d(1,1,:)

This error might be due to a restriction on Fortran pointer assignment to device array sections that are not contiguous (sequential).
I would like to know any other way to work around it.

Anyway, I applied the changes below to avoid the above runtime error; that is, I restricted the program to batchSize = 1.


 !complex(8), device, pointer, dimension(:) :: Aarray_dev
  complex(8), device, pointer :: Aarray_dev                       ! test for batchsize=1
---
 !complex(c_double_complex), device, pointer, dimension(:) :: Aarray(:)
  complex(c_double_complex), device, pointer :: Aarray
---
 !Aarray_dev => a_d(1,1,:)    ! runtime error due to device array section argument is not sequential
  Aarray_dev => a_d(1,1,1)    ! test for batchsize=1

The execution looks normal, but the cuBLAS function is never actually called. I can see this with the profiler nvprof: the symbol "cublasZgetrfBatched" does not appear in the profiling list. I don't know why.

$ pgf90 -Mcuda:5.5 f.f90 -lcublas
$ nvprof a.out
==26712== NVPROF is profiling process 26712, command: a.out
 size of variable a           16 bytes
 size of matrix           16
 Input
 Batch no. =            1
   -0.134D+01   0.255D+01   0.280D+00   0.317D+01  -0.639D+01  -0.220D+01   0.720D+00  -0.920D+00
   -0.170D+00  -0.141D+01   0.331D+01  -0.150D+00  -0.150D+00   0.134D+01   0.129D+01   0.138D+01
   -0.329D+01  -0.239D+01  -0.191D+01   0.442D+01  -0.140D+00  -0.135D+01   0.172D+01   0.135D+01
    0.241D+01   0.390D+00  -0.560D+00   0.147D+01  -0.830D+00  -0.690D+00  -0.196D+01   0.670D+00
 error code =             0
 no error
 result
 Batch no. =            1
   -0.134D+01   0.255D+01   0.280D+00   0.317D+01  -0.639D+01  -0.220D+01   0.720D+00  -0.920D+00
   -0.170D+00  -0.141D+01   0.331D+01  -0.150D+00  -0.150D+00   0.134D+01   0.129D+01   0.138D+01
   -0.329D+01  -0.239D+01  -0.191D+01   0.442D+01  -0.140D+00  -0.135D+01   0.172D+01   0.135D+01
    0.241D+01   0.390D+00  -0.560D+00   0.147D+01  -0.830D+00  -0.690D+00  -0.196D+01   0.670D+00
 error code =
 no error
==26712== Profiling application: a.out
==26712== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 59.91%  8.5120us         7  1.2160us  1.0880us  1.4720us  [CUDA memcpy HtoD]
 23.87%  3.3920us         1  3.3920us  3.3920us  3.3920us  [CUDA memcpy DtoH]
 16.22%  2.3040us         1  2.3040us  2.3040us  2.3040us  __pgi_dev_cumemset_8
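
As a side note, a further check I could add after the cuBLAS call is to copy the infoArray back from the device and inspect it; if the factorization had actually run, each entry should be 0 on success. A minimal sketch, using only the declarations already in the program above:

  ! copy the per-matrix status codes back to the host and print them;
  ! a value of 0 means the LU factorization of that matrix succeeded
  info = info_d
  print *, 'getrfBatched info codes = ', (info(k), k=1, ibatch)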

Thanks in advance for any help anybody could provide.

Best regards,
Kato

Hi Kato,

I wanted to let you know that we are looking at this but don't have a solution for you yet. We don't think using Fortran pointers is the way to go, but are looking at using the "c_devptr" type.

Thanks,
Mat

Hi Mat,

Thank you for looking into this.

I found that CUDA Fortran provides a function C_DEVLOC(), which creates a TYPE(C_DEVPTR) that holds the C address of a Fortran device array argument. (I didn't know this function was available in CUDA Fortran.)
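
As I understand it (this is just my reading of the documentation, so the details may be off), c_devloc() is the device-side analogue of c_loc():

  type(c_devptr) :: p
  ! c_devloc returns a type(c_devptr) holding the device address of its
  ! device-resident argument, here the first element of the k-th matrix
  p = c_devloc(a_d(1,1,k))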

I rewrote the program using c_devloc() as follows.

module Zgetr_cublas

  use cudafor

  interface
   function cublas_zgetrf(h, n, Aarray, lda, ipvt, info, ibatch) bind(c,name='cublasZgetrfBatched')
   use iso_c_binding
   use cublas
    type(cublasHandle) :: h
    integer(c_int), value :: n, lda, ibatch
    type (c_devptr) :: Aarray(:)
    !!complex(c_double_complex), device, dimension(:) :: Aarray(:)
    integer(c_int), device :: ipvt(:)
    integer(c_int), device :: info(:)
   end function cublas_zgetrf
  end interface

end module Zgetr_cublas

program test
    use cudafor
    use cublas
    use Zgetr_cublas
    use, intrinsic :: iso_c_binding
    implicit none

!.. Parameters ..
    integer n
    parameter (n=4)
    integer lda, ldb
    integer row, col, matrixsize
    parameter (lda=n,ldb=n)
    parameter (row=n,col=n)

    integer ndmax   ! = ibatch
    parameter (ndmax=3)

!.. Local ..
    integer :: i, j, k, istat

!.. Input Matrix for Host ..
    complex(8), dimension(:,:,:), allocatable :: a
!.. for device
    complex(8), device, target,  dimension(:,:,:), allocatable :: a_d
    type (c_devptr) :: devPtrA(ndmax)
    !!complex(8), device, dimension(:),allocatable :: Aarray_dev
!
!.. handle for CUBLAS function
    type(cublasHandle) :: h1
    integer :: ibatch
    parameter (ibatch=ndmax)
    integer  :: ipvt(n*ibatch)
    integer  :: info(ibatch)
    integer, device  :: ipvt_d(n*ibatch)
    integer, device  :: info_d(ibatch)

! Matrixsize
  matrixsize=col*row

  print *, "size of variable a",sizeof(a(1,1,1)), "bytes"
  print *, "size of matrix (row x col)", matrixsize

! allocate both a and a_d on device
  allocate (a(row,col,ndmax))
  allocate (a_d(row,col,ndmax))

! generates input matrix

do k=1, ndmax
  a(1,1,k)=(-1.34d0, 2.55d0)
  a(1,2,k)=( 0.28d0, 3.17d0)
  a(1,3,k)=(-6.39d0,-2.20d0)
  a(1,4,k)=( 0.72d0,-0.92d0)
  a(2,1,k)=(-0.17d0,-1.41d0)
  a(2,2,k)=( 3.31d0,-0.15d0)
  a(2,3,k)=(-0.15d0, 1.34d0)
  a(2,4,k)=( 1.29d0, 1.38d0)
  a(3,1,k)=(-3.29d0,-2.39d0)
  a(3,2,k)=(-1.91d0, 4.42d0)
  a(3,3,k)=(-0.14d0,-1.35d0)
  a(3,4,k)=( 1.72d0, 1.35d0)
  a(4,1,k)=( 2.41d0, 0.39d0)
  a(4,2,k)=(-0.56d0, 1.47d0)
  a(4,3,k)=(-0.83d0,-0.69d0)
  a(4,4,k)=(-1.96d0, 0.67d0)
end do

 !a(1,1,2)=(-2.00d0, 2.55d0)
 !a(1,1,3)=(-3.00d0, 2.55d0)

   print *,'Input'
   do k = 1, ndmax
     print *, "Batch no. =", k
     print '(1X,8D12.3)', ((a(i,j,k),j=1,col),i=1,row)
   end do

! copy host to device

    istat = cudaMemset(a_d, (0.0d0,0.0d0), matrixsize*ndmax)
   !a_d = (0.0d0,0.0d0)

  a_d = a

! build an array of pointers
    do k = 1, ndmax
      devPtrA(i) = c_devloc(a_d(1,1,k))
    end do
    !!call c_f_pointer( devPtrA, Aarray_dev , (/ ndmax /) )

    ipvt_d = ipvt
    info_d = info

! call blas
    istat = cublasCreate(h1)
    !h1 = cublasGetHandle()
    istat= cublas_zgetrf(h1, n, devPtrA , lda, ipvt_d, info_d, ibatch)
    !!istat= cublas_zgetrf(h1, n, Aarray_dev , lda, ipvt_d, info_d, ibatch)
    print *, 'error code = ' ,istat, cudaGetErrorString( cudaGetLastError() )
    istat = cublasDestroy(h1)


! initialize a() to verify
  a   = (0.0d0, 0.0d0)

! copy from Device to Host

  a = a_d

   print *, 'result'
!  Print details of factorization

   do k = 1, ndmax
     print *, "Batch no. =", k
     print '(1X,8D12.3)', ((a(i,j,k),j=1,col),i=1,row)
   end do

! Print pivot indices

  !print *, 'Pivot indices'
  !print *, (ipiv(i),i=1,n)


!  GPU device : reset

   istat = cudaDeviceReset()
   print *, 'error code = ', cudaGetErrorString( cudaGetLastError() )

end program

But it looks like the function 'cublasZgetrfBatched' is still not called at runtime.
I cannot see an entry point for 'cublasZgetrfBatched' in the nvprof output.

$ pgf90 -Mcuda:5.5 g.f90 -lcublas
$ nvprof a.out
==4819== NVPROF is profiling process 4819, command: a.out
 size of variable a           16 bytes
 size of matrix (row x col)           16
(snip)
==4819== Profiling application: a.out
==4819== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 45.86%  4.9600us         4  1.2400us  1.1520us  1.4720us  [CUDA memcpy HtoD]
 33.14%  3.5840us         1  3.5840us  3.5840us  3.5840us  [CUDA memcpy DtoH]
 21.01%  2.2720us         1  2.2720us  2.2720us  2.2720us  __pgi_dev_cumemset_8

I wonder whether there is something wrong with the cuBLAS library.
Please let me know if there are any developments on this.

Thanks,
Kato

Hi Kato,

I've asked Greg Ruetsch, one of the authors of CUDA Fortran for Scientists and Engineers, for help. In the past he has solved this issue by using an array of a user-defined derived type containing an array of pointers, and then using a C interface routine to transform it into a C pointer-to-pointer structure. However, he doesn't like that solution, so he is trying to find something more elegant.

He did notice the following typo:

! build an array of pointers 
     do k = 1, ndmax 
       devPtrA(i) = c_devloc(a_d(1,1,k)) 
     end do

devPtrA is indexed with "i" here; it should be "k". Unfortunately, fixing that doesn't solve the problem, but he wanted to point it out.

  • Mat

Hi Kato,

Greg was able to get this to work.

The issue is that CUBLAS expects a device array of device pointers as an argument to cublasSgetrfBatched. The declaration:

type (c_devptr) :: aArray(:)

is a host array of device pointers; what is needed is:

type (c_devptr), device :: aArray(:)

Using this in the interface and declaring:

  type (c_devptr) :: devPtrA(nbatch) 
  type (c_devptr), device :: devPtrA_d(nbatch)

where:

  do k = 1, ndmax
     devPtrA(k) = c_devloc(a_d(1,1,k))
  end do
  devPtrA_d = devPtrA

and passing devPtrA_d to cublasSgetrfBatched solves the problem.

Here’s the full example:

% cat getrfBatched.cuf

module batchedCublas
  use cudafor
  implicit none
  interface
     integer(c_int) function cublasSgetrfBatched(h, n, Aarray, lda, ipvt, info, ibatch) &
          bind(c,name='cublasSgetrfBatched')
       use iso_c_binding
       use cublas
       type(cublasHandle), value :: h
       integer(c_int), value :: n
       type(c_devptr), device :: Aarray(*)
       integer(c_int), value :: lda
       integer(c_int), device :: ipvt(*)
       integer(c_int), device :: info(*)
       integer(c_int), value :: ibatch
     end function cublasSgetrfBatched
  end interface
end module batchedCublas

program test
  use cudafor
  use cublas
  use batchedCublas
  use, intrinsic :: iso_c_binding
  implicit none

  integer, parameter :: n=2
  integer, parameter :: lda=n, row=n, col=n
  integer, parameter :: nbatch=2

  real, allocatable :: a(:,:,:)
  real, device, allocatable :: a_d(:,:,:)
  type(c_devptr) :: devPtrA(nbatch)
  type(c_devptr), device :: devPtrA_d(nbatch)
  type(cublasHandle) :: h1
  integer  :: ipvt(n*nbatch)
  integer  :: info(nbatch)
  integer, device  :: ipvt_d(n*nbatch)
  integer, device  :: info_d(nbatch)
  integer :: i, j, k, istat

  ! allocate both a and a_d on device
  allocate (a(row,col,nbatch))
  allocate (a_d(row,col,nbatch))

  a(1,1,:) = 6.0
  a(2,1,:) = 4.0
  a(1,2,:) = 3.0
  a(2,2,:) = 3.0
  a_d = a

  write(*,*) 'Before:'
  do k = 1, nbatch
     write(*,*) 'Batch: ', k
     do i = 1, n
        write(*,*) a(i,:,k)
     enddo
  enddo

  ! build an array of pointers
  do k = 1, nbatch
     devPtrA(k) = c_devloc(a_d(1,1,k))
  end do
  devPtrA_d = devPtrA

  ! call blas
  istat = cublasCreate(h1)
  if (istat /= CUBLAS_STATUS_SUCCESS) write(*,*) 'cublasCreate failed'
  istat= cublasSgetrfBatched(h1, n, devPtrA_d, lda, ipvt_d, info_d, nbatch)
  if (istat /= CUBLAS_STATUS_SUCCESS) write(*,*) 'cublasSgetrfBatched failed: ', istat
  istat = cublasDestroy(h1)
  if (istat /= CUBLAS_STATUS_SUCCESS) write(*,*) 'cublasDestroy failed'

  a = a_d

  write(*,*) 'After:'
  do k = 1, nbatch
     write(*,*) 'Batch: ', k
     do i = 1, n
        write(*,*) a(i,:,k)
     enddo
  enddo

end program test


% pgfortran -Mcuda getrfBatched.cuf -lcublas -o getrf.out
% getrf.out
 Before:
 Batch:             1
    6.000000        3.000000
    4.000000        3.000000
 Batch:             2
    6.000000        3.000000
    4.000000        3.000000
 After:
 Batch:             1
    6.000000        3.000000
   0.6666667        1.000000
 Batch:             2
    6.000000        3.000000
   0.6666667        1.000000
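
For the double-complex routine from your original post, the same device-array-of-device-pointers pattern should carry over. Here's a sketch of the corresponding interface (untested; it just follows the cublasZgetrfBatched prototype quoted at the top of the thread, and the module name batchedCublasZ is only for illustration):

module batchedCublasZ
  use cudafor
  implicit none
  interface
     integer(c_int) function cublasZgetrfBatched(h, n, Aarray, lda, ipvt, info, ibatch) &
          bind(c,name='cublasZgetrfBatched')
       use iso_c_binding
       use cublas
       type(cublasHandle), value :: h
       integer(c_int), value :: n
       type(c_devptr), device :: Aarray(*)   ! device array of device pointers
       integer(c_int), value :: lda
       integer(c_int), device :: ipvt(*)
       integer(c_int), device :: info(*)
       integer(c_int), value :: ibatch
     end function cublasZgetrfBatched
  end interface
end module batchedCublasZ

The host and device matrices would then be declared complex(8) (matching cuDoubleComplex) instead of real; the devPtrA/devPtrA_d setup stays exactly the same.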

Hi Mat,

Thank you for solving it, Greg and Mat. Now I understand how to handle a device array of device pointers.

Best Regards,
Kato