Calling CUBLAS' cublasDgetrfBatched proper procedure

Hello, hope you are having a good day…
I have a problem calling CUBLAS functions, as I am clueless to how they are called in fortran.
I have followed the way CULA routines were followed and read the advice from the “CUDA Library Interfaces” pdf.
However, the rest does not come intuitively with me.

I am trying to get the inverse of Matrix A (using cublasDgetriBatched and cublasDgetrfBatched) which its dimension and cell values are called from a text file.
The code is bellow and I get this error… “PGF90-S-0148-Reference to derived type required (inverse.cuf: 43)”

Could you kindly give me an example that uses a similar function… the “isamax” example in the pdf is quite easy to understand yet different.

I am grateful for any help.

Ahmed

! Compile with "pgfortran -Mlarge_arrays inverse.cuf -Mcudalib=cublas -lblas"
      PROGRAM MatrixInverse
      use cudafor
      use cublas
      use iso_c_binding

	  IMPLICIT NONE
	  CHARACTER*1000  infile,outfile
	  type(c_devptr),ALLOCATABLE, device :: A(:,:),C(:,:)
	  INTEGER,ALLOCATABLE, device :: ipiv(:)
	  INTEGER istat, status
	  type(cublasHandle) :: h
	  INTEGER :: n
	  INTEGER :: lda
	  INTEGER :: ldc
	  INTEGER, device :: info(:)
	  INTEGER :: batchCount
	  INTEGER DI, I, J
	  Real time1,time2
	  DOUBLE PRECISION,ALLOCATABLE :: mat(:,:)
! input file names from $INV$
	OPEN(5,FILE='$INV$')
	READ(5,'(A)') infile
	READ(5,'(A)') outfile
	close(5)
! input matrix data to be inversed
	OPEN(1,FILE=infile)
	READ(1,*) DI
	n=DI
	lda=DI
	ldc=DI
	write(*,*) 'Matrix size is : ',DI,'x',DI
	ALLOCATE(mat(DI,DI))
	DO I=1,DI
	DO J=1,DI
	READ(1,*)mat(I,J)
	END DO
	END DO 
	close(1)
! Allocate 
      Allocate (A(DI,DI))
      Allocate (C(DI,DI))
      A = mat
! initialize cublas
      print *, 'Cublas starts'
! cublas SOLVES
      allocate(ipiv(DI))
      open (99, file='Inverse_GPU_Time.txt',status='unknown')      
! start
      call CPU_Time(time1)
      print *, 'Starting CUBLAS (Host interface)'
	  status = cublasCreate(h)
! call cublas
      status = cublasDgetrfBatched(h, DI, A, DI, ipiv, info,batchCount)
      status = cublasDgetriBatched(h, DI, A, DI, ipiv, C, DI,info, batchCount)
! stop
      mat = A
      status=cudaDeviceSynchronize()
      call CPU_Time(time2)
      write(99,*) 'Time spent GPU: ', time2-time1
      OPEN(4,FILE=outfile)
      DO 6 I=1,DI
      DO 7 J=1,DI
      write(4,*)mat(I,J)
    7 END DO
    6 END DO
	status = cublasDestroy(h)
	close(4)
	END

The batched routines take an array of addresses as input. This is an odd thing to do in Fortran, but that is how the C is written. Here is an example code.

module mt
use cudafor
#ifdef DIMPAR
integer, parameter :: n=DIMPAR
#else
integer, parameter :: n=4
#endif

#ifdef BATCHPAR
integer, parameter :: nbatch=BATCHPAR
#else
integer, parameter :: nbatch=5
#endif

type (c_devptr), device :: pa(nbatch), pb(nbatch), pc(nbatch)
contains
attributes(global) subroutine fillp(p, a)
type (c_devptr), device :: p(nbatch)
real(8) :: a(n,n,nbatch)
i = threadIdx%x
if (i.le.nbatch) p(i) = c_devloc(a(1,1,i))
end subroutine fillp

#ifdef CC35
attributes(global) subroutine devDgetrf(p, ipvt, info)
use cublas_device
implicit none
type (c_devptr), device :: p(nbatch)
integer :: ipvt(n*nbatch), info(nbatch)

type(cublasHandle) :: h1
integer :: i, istat

i = threadIdx%x
if (i == 1) then
istat = cublasCreate(h1)
if (istat /= CUBLAS_STATUS_SUCCESS) &
write(,) ‘cublasCreate failed’
istat= cublasDgetrfBatched(h1, n, p, n, &
ipvt, info, nbatch)
if (istat /= CUBLAS_STATUS_SUCCESS) &
write(,) 'cublasDgetrfBatched failed: ', istat
istat = cublasDestroy(h1)
if (istat /= CUBLAS_STATUS_SUCCESS) &
write(,) ‘cublasDestroy failed’
end if
end subroutine devDgetrf
#endif

end module mt

program testDgetrfBatched
use mt
use cudafor
use cublas
use check_mod
implicit none

real(8) :: a(n,n,nbatch), ha(n,n,nbatch), savea(n,n,nbatch), rki
real(8), device :: a_d(n,n,nbatch)
integer :: ipvt(nnbatch), info(nbatch)
integer, device :: ipvt_d(n
nbatch), info_d(nbatch)
type(cublasHandle) :: hh
integer :: i, j, k, istat

! initialize arrays
call random_number(a)
do k = 1, nbatch
rki = 1.0 / k
do j = 1, n
do i = 1, n
if (i.eq.j) then
a(i,j,k) = a(i,j,k) * 4.0
endif
a(i,j,k) = a(i,j,k) * (4.0 + k) * rki
end do
end do
end do

! copies for test and ref
a_d = a
ha = a
#ifdef CC35
savea = a
#endif

#ifdef DEBUG
write(,"(/,‘Input:’)")
do k = 1, nbatch
write(
,"(2x,'Matrix: ', i0)") k
do i=1, n
write(,) a(i,:,k)
enddo
enddo
#endif

! Compute host results
do k = 1, nbatch
call Dgetrf(n, n, ha(1,1,k), n, ipvt((k-1)*n+1), info(k))
end do

! Compute batch from host results
istat = cublasCreate(hh)
call fillp<<<1,nbatch>>> (pa, a_d)
istat= cublasDgetrfBatched(hh, n, pa, n, ipvt_d, info_d, nbatch)
a = a_d

call check(ha, a, nnnbatch, atoler=1.0d-3)

#ifdef CC35
a_d = savea
call fillp<<<1,nbatch>>> (pa, a_d)
call devDgetrf<<<1,1>>> (pa, ipvt_d, info_d)
a = a_d

call check(ha, a, nnnbatch, atoler=1.0d-3)

#endif

#ifdef DEBUG
write(,"(/, ‘LU Factorization:’)")
do k = 1, nbatch
write(
,"(2x,'Matrix: ', i0)") k
do i = 1, n
write(,) a(i,:,k)
enddo
enddo
#endif

end program testDgetrfBatched

You can fill the array of addresses using c_devloc() either on the host or on the device.

Thank you so much, I have managed to complete the code and it could be found bellow for anyone who requires an inverse of a dense matrix through LU decomposition.

I just need to know, can I compile it on 32-bit? I get a message that says pgnvd and pgacclnk are not available in win32 folder of PGI. Is there a way around that, I really need to compile batched cublas for 32-bit.

The code:

! Compile with "pgfortran -Mlarge_arrays inv1.cuf -Mcudalib=cublas -lblas"
! or with "pgf90 inv1.cuf -lcublas"
      PROGRAM MatrixInverse
      use cudafor
      use cublas
      use iso_c_binding 
      implicit none
	  
      integer, parameter :: batchCount=1

      CHARACTER*1000  infile,outfile
	  real(8),ALLOCATABLE :: A(:,:),C(:,:)
	  real :: time1, time2
	  integer,ALLOCATABLE, device :: ipvt(:), info(:)
	  real(8),ALLOCATABLE, device :: A_d(:,:),C_d(:,:)
	  type(c_devptr) :: devPtrA(batchCount), devPtrC(batchCount)
	  type(c_devptr), device :: devPtrA_d(batchCount), devPtrC_d(batchCount)
      INTEGER istat, status
	  type(cublasHandle) :: h
	  INTEGER :: n, i, j
	  INTEGER :: lda
	  INTEGER :: ldc
      INTEGER DI
	  
! intitialize arrays and transfer to device
	OPEN(5,FILE='$INV$')
	READ(5,'(A)') infile
	READ(5,'(A)') outfile
	close(5)
	OPEN(1,FILE=infile)
	READ(1,*) DI
	write(*,*) 'Matrix size is : ',DI,'x',DI
	ALLOCATE(A(DI,DI))

	DO 2 I=1,DI
	DO 3 J=1,DI
	READ(1,*)A(I,J)
 3	END DO
 2	END DO 
      close(1)

      Allocate (C(DI,DI))
      Allocate (A_d(DI,DI))
      Allocate (C_d(DI,DI))
      Allocate (ipvt(DI))
      Allocate (info(DI))

!      call random_number(A)
      write(*,*)A
      A_d = A 
      C_d = 0
! initialize cublas
      print *, 'Cublas starts'
	  
! cublas SOLVES
      open (1, file='Inverse_GPU_Time.txt',         status='unknown')
	  
! start
      call CPU_Time(time1)
      print *, 'Starting CUBLAS'

! create handle
  istat = cublasCreate(h)
  if (istat /= CUBLAS_STATUS_SUCCESS) write(*,*) 'cublasCreate failed: ', istat
  write(*,*) 'cublasCreate status: ', istat
 
! build an array of pointers

  devPtrA(batchCount) = c_devloc(A_d(1,1))
  devPtrC(batchCount) = c_devloc(C_d(1,1))

  devPtrA_d = devPtrA
  devPtrC_d = devPtrC
  
! call cula solver (host interface)
      status = cublasDgetrfBatched(h, DI, devPtrA_d, DI, ipvt, info,batchCount)
      if (istat /= CUBLAS_STATUS_SUCCESS) write(*,*) 'cublasDgetrfBatched failed: ', istat
	   write(*,*) 'cublasDgetrfBatched status: ', istat

      status = cublasDgetriBatched(h, DI, devPtrA_d, DI, ipvt, devPtrC_d, DI,info, batchCount)
      if (istat /= CUBLAS_STATUS_SUCCESS) write(*,*) 'cublasDgetriBatched failed: ', istat
	   write(*,*) 'cublasDgetriBatched status: ', istat

!      A = A_d
      C = C_d
      write(*,*)C
	  
      status=cudaDeviceSynchronize() 
      call CPU_Time(time2)
      write(1,*) 'Time spent GPU: ', time2-time1
      OPEN(4,FILE=outfile)
      DO 6 I=1,DI
      DO 7 J=1,DI
      write(4,*)C(I,J)
    7 END DO
    6 END DO
  
! cleanup
	status = cublasDestroy(h)
	close(4)
	STOP
	END

Hi A.Torky,

As of CUDA 7.0, the cuBLAS libraries are no longer supported on 32-bit Windows.

From the CUDA 7.0 release notes: http://developer.download.nvidia.com/compute/cuda/7_0/Prod/doc/CUDA_Toolkit_Release_Notes.pdf

Certain CUDA Features on 32-bit and 32-on-64-bit Windows Systems
The CUDA Toolkit no longer supports 32-bit Windows operating systems.
Furthermore, the Windows CUDA Driver no longer supports Tesla and Quadro
products on 32-bit Windows operating systems. Additionally, on 64-bit Windows
operating systems, the following features are no longer supported by the CUDA
driver or CUDA toolkit:
‣ Running 32-bit applications on Tesla and Quadro products
‣ Using the Thrust library from 32-bit applications
‣ 32-bit versions of the CUDA Toolkit scientific libraries, including cuBLAS,
cuSPARSE, cuFFT, cuRAND, and NPP
‣ 32-bit versions of the CUDA samples

-Mat