Calling CUBLAS' cublasDgetrfBatched proper procedure

torkyahmad · January 16, 2017, 12:19pm

Hello, hope you are having a good day…
I have a problem calling CUBLAS functions, as I am clueless as to how they are called from Fortran.
I have followed the approach used for the CULA routines and read the advice in the “CUDA Library Interfaces” PDF.
However, the rest does not come intuitively to me.

I am trying to compute the inverse of matrix A (using cublasDgetrfBatched and cublasDgetriBatched), whose dimension and cell values are read from a text file.
The code is below, and I get this error: “PGF90-S-0148-Reference to derived type required (inverse.cuf: 43)”

Could you kindly give me an example that uses a similar function? The “isamax” example in the PDF is quite easy to understand, yet different from what I need.

I am grateful for any help.

Ahmed

! Compile with "pgfortran -Mlarge_arrays inverse.cuf -Mcudalib=cublas -lblas"
      PROGRAM MatrixInverse  ! reads a DI x DI matrix from a file and attempts a batched cuBLAS LU inverse
      use cudafor
      use cublas
      use iso_c_binding

	  IMPLICIT NONE
	  CHARACTER*1000  infile,outfile
	  type(c_devptr),ALLOCATABLE, device :: A(:,:),C(:,:)  ! NOTE(review): device-pointer type, yet used below as matrix storage
	  INTEGER,ALLOCATABLE, device :: ipiv(:)
	  INTEGER istat, status
	  type(cublasHandle) :: h
	  INTEGER :: n
	  INTEGER :: lda
	  INTEGER :: ldc
	  INTEGER, device :: info(:)  ! NOTE(review): deferred shape without ALLOCATABLE; never allocated in this program
	  INTEGER :: batchCount  ! NOTE(review): never assigned before it is passed to the batched calls below
	  INTEGER DI, I, J
	  Real time1,time2
	  DOUBLE PRECISION,ALLOCATABLE :: mat(:,:)  ! host copy of the matrix read from the input file
! input file names from $INV$ (first line: input file, second line: output file)
	OPEN(5,FILE='$INV$')
	READ(5,'(A)') infile
	READ(5,'(A)') outfile
	close(5)
! input matrix data to be inverted: the dimension DI first, then one value per record
	OPEN(1,FILE=infile)
	READ(1,*) DI
	n=DI
	lda=DI
	ldc=DI
	write(*,*) 'Matrix size is : ',DI,'x',DI
	ALLOCATE(mat(DI,DI))
	DO I=1,DI
	DO J=1,DI
	READ(1,*)mat(I,J)  ! J is the inner loop, so values are read row by row
	END DO
	END DO 
	close(1)
! Allocate 
      Allocate (A(DI,DI))
      Allocate (C(DI,DI))
      A = mat  ! NOTE(review): DOUBLE PRECISION data into a type(c_devptr) array; presumably line 43 of the error - confirm
! initialize cublas
      print *, 'Cublas starts'
! cublas SOLVES
      allocate(ipiv(DI))
      open (99, file='Inverse_GPU_Time.txt',status='unknown')      
! start
      call CPU_Time(time1)
      print *, 'Starting CUBLAS (Host interface)'
	  status = cublasCreate(h)
! call cublas
      status = cublasDgetrfBatched(h, DI, A, DI, ipiv, info,batchCount)  ! return code in status is never checked
      status = cublasDgetriBatched(h, DI, A, DI, ipiv, C, DI,info, batchCount)  ! return code in status is never checked
! stop
      mat = A  ! NOTE(review): copies back A rather than C, the separate array passed to the Dgetri call - confirm intent
      status=cudaDeviceSynchronize()
      call CPU_Time(time2)
      write(99,*) 'Time spent GPU: ', time2-time1
      OPEN(4,FILE=outfile)
      DO 6 I=1,DI
      DO 7 J=1,DI
      write(4,*)mat(I,J)  ! one value per record, same row-by-row order as the input
    7 END DO
    6 END DO
	status = cublasDestroy(h)
	close(4)
	END

brentl · January 17, 2017, 9:31pm

The batched routines take an array of addresses as input. This is an odd thing to do in Fortran, but that is how the C is written. Here is an example code.

!> Shared definitions for the batched LU factorization example:
!> compile-time problem sizes, device-resident tables of matrix addresses,
!> and the kernels that fill those tables and (optionally) call cuBLAS from
!> device code.
!>
!> Preprocessor controls:
!>   DIMPAR   - order of each square matrix (default 4)
!>   BATCHPAR - number of matrices in the batch (default 5)
!>   CC35     - when defined, also builds devDgetrf, which calls cuBLAS from
!>              a kernel through the cublas_device module
module mt
  use cudafor
  implicit none

#ifdef DIMPAR
  integer, parameter :: n=DIMPAR
#else
  integer, parameter :: n=4
#endif

#ifdef BATCHPAR
  integer, parameter :: nbatch=BATCHPAR
#else
  integer, parameter :: nbatch=5
#endif

  ! Device arrays of device addresses, one entry per matrix in the batch.
  ! The batched cuBLAS routines take such an array instead of the data itself.
  type (c_devptr), device :: pa(nbatch), pb(nbatch), pc(nbatch)

contains

  !> Fill p(i) with the device address of the i-th matrix a(:,:,i).
  !> One thread handles one batch entry, so launch with at least nbatch
  !> threads in a single block (only threadIdx%x is consulted).
  attributes(global) subroutine fillp(p, a)
    type (c_devptr), device :: p(nbatch)
    real(8) :: a(n,n,nbatch)
    integer :: i

    i = threadIdx%x
    ! Guard against launches with more threads than batch entries.
    if (i.le.nbatch) p(i) = c_devloc(a(1,1,i))
  end subroutine fillp

#ifdef CC35
  !> Device-side batched LU factorization: thread 1 creates a cuBLAS handle,
  !> calls cublasDgetrfBatched on the matrices addressed by p, and destroys
  !> the handle. Each step reports a failure status with a message.
  !>   p    - device addresses of the nbatch matrices (factored in place)
  !>   ipvt - pivot indices, n entries per matrix
  !>   info - per-matrix status written by the factorization
  attributes(global) subroutine devDgetrf(p, ipvt, info)
    use cublas_device
    implicit none
    type (c_devptr), device :: p(nbatch)
    integer :: ipvt(n*nbatch), info(nbatch)

    type(cublasHandle) :: h1
    integer :: i, istat

    i = threadIdx%x
    ! Only one thread drives the library call.
    if (i == 1) then
      istat = cublasCreate(h1)
      if (istat /= CUBLAS_STATUS_SUCCESS) &
        write(*,*) 'cublasCreate failed'
      istat = cublasDgetrfBatched(h1, n, p, n, &
                                  ipvt, info, nbatch)
      if (istat /= CUBLAS_STATUS_SUCCESS) &
        write(*,*) 'cublasDgetrfBatched failed: ', istat
      istat = cublasDestroy(h1)
      if (istat /= CUBLAS_STATUS_SUCCESS) &
        write(*,*) 'cublasDestroy failed'
    end if
  end subroutine devDgetrf
#endif

end module mt

!> Test driver for cublasDgetrfBatched.
!>
!> Builds nbatch random n x n matrices with a boosted diagonal, factors them
!> on the host with Dgetrf as a reference, factors the same data on the GPU
!> with cublasDgetrfBatched (host API, and with CC35 defined also from device
!> code), and passes both results to check() from check_mod.
!> Define DEBUG to print the input matrices and the final factorization.
program testDgetrfBatched
  use mt
  use cudafor
  use cublas
  use check_mod
  implicit none

  real(8) :: a(n,n,nbatch), ha(n,n,nbatch), savea(n,n,nbatch), rki
  real(8), device :: a_d(n,n,nbatch)
  ! n pivot entries per matrix, one info entry per matrix
  integer :: ipvt(n*nbatch), info(nbatch)
  integer, device :: ipvt_d(n*nbatch), info_d(nbatch)
  type(cublasHandle) :: hh
  integer :: i, j, k, istat

  ! initialize arrays: random entries, diagonal scaled up by 4, then every
  ! entry of matrix k scaled by (4 + k) / k
  call random_number(a)
  do k = 1, nbatch
    rki = 1.0 / k
    do j = 1, n
      do i = 1, n
        if (i.eq.j) then
          a(i,j,k) = a(i,j,k) * 4.0
        endif
        a(i,j,k) = a(i,j,k) * (4.0 + k) * rki
      end do
    end do
  end do

  ! copies for test and ref
  a_d = a
  ha = a
#ifdef CC35
  savea = a
#endif

#ifdef DEBUG
  write(*,"(/,'Input:')")
  do k = 1, nbatch
    write(*,"(2x,'Matrix: ', i0)") k
    do i=1, n
      write(*,*) a(i,:,k)
    enddo
  enddo
#endif

  ! Compute host results, one LAPACK call per matrix
  do k = 1, nbatch
    call Dgetrf(n, n, ha(1,1,k), n, ipvt((k-1)*n+1), info(k))
  end do

  ! Compute batch from host: fill the address table on the device, then
  ! factor every matrix with a single batched call
  istat = cublasCreate(hh)
  call fillp<<<1,nbatch>>> (pa, a_d)
  istat= cublasDgetrfBatched(hh, n, pa, n, ipvt_d, info_d, nbatch)
  a = a_d

  ! Compare all n*n*nbatch elements against the host reference
  call check(ha, a, n*n*nbatch, atoler=1.0d-3)

#ifdef CC35
  ! Repeat from the saved input, this time calling cuBLAS from device code
  a_d = savea
  call fillp<<<1,nbatch>>> (pa, a_d)
  call devDgetrf<<<1,1>>> (pa, ipvt_d, info_d)
  a = a_d

  call check(ha, a, n*n*nbatch, atoler=1.0d-3)

#endif

#ifdef DEBUG
  write(*,"(/, 'LU Factorization:')")
  do k = 1, nbatch
    write(*,"(2x,'Matrix: ', i0)") k
    do i = 1, n
      write(*,*) a(i,:,k)
    enddo
  enddo
#endif

end program testDgetrfBatched

You can fill the array of addresses using c_devloc() either on the host or on the device.

A.Torky · January 28, 2017, 11:15pm

Thank you so much. I have managed to complete the code, and it can be found below for anyone who requires the inverse of a dense matrix through LU decomposition.

I just need to know: can I compile it for 32-bit? I get a message saying that pgnvd and pgacclnk are not available in the win32 folder of PGI. Is there a way around that? I really need to compile batched cuBLAS code for 32-bit.

The code:

! Compile with "pgfortran -Mlarge_arrays inv1.cuf -Mcudalib=cublas -lblas"
! or with "pgf90 inv1.cuf -lcublas"
!
! Inverts a dense DI x DI matrix on the GPU with the batched cuBLAS routines
! (LU factorization with cublasDgetrfBatched, inversion with
! cublasDgetriBatched), using a batch of one matrix.
!
! Input : file '$INV$' holds two lines, the input and output file names.
!         The input file holds DI, then the matrix one value per record,
!         row by row.
! Output: the inverse is printed and written to the output file in the same
!         order; the elapsed CPU time goes to 'Inverse_GPU_Time.txt'.
program MatrixInverse
  use cudafor
  use cublas
  use iso_c_binding
  implicit none

  integer, parameter :: batchCount=1

  character(len=1000) :: infile, outfile
  real(8), allocatable :: A(:,:), C(:,:)
  real :: time1, time2
  integer, allocatable, device :: ipvt(:), info(:)
  real(8), allocatable, device :: A_d(:,:), C_d(:,:)
  ! The batched routines take an array of device addresses, which itself
  ! must live on the device: build it on the host, then copy it over.
  type(c_devptr) :: devPtrA(batchCount), devPtrC(batchCount)
  type(c_devptr), device :: devPtrA_d(batchCount), devPtrC_d(batchCount)
  integer :: info_h(batchCount)
  integer :: istat
  type(cublasHandle) :: h
  integer :: i, j
  integer :: DI

! initialize arrays and transfer to device
  open(5, file='$INV$')
  read(5,'(A)') infile
  read(5,'(A)') outfile
  close(5)
  open(1, file=infile)
  read(1,*) DI
  write(*,*) 'Matrix size is : ',DI,'x',DI
  allocate(A(DI,DI))

  ! The file stores the matrix row by row, so the column index is innermost.
  do i = 1, DI
    do j = 1, DI
      read(1,*) A(i,j)
    end do
  end do
  close(1)

  allocate(C(DI,DI))
  allocate(A_d(DI,DI))
  allocate(C_d(DI,DI))
  ! DI pivot entries per matrix, one info entry per matrix in the batch
  allocate(ipvt(DI*batchCount))
  allocate(info(batchCount))

  write(*,*) A
  A_d = A
  C_d = 0

! initialize cublas
  print *, 'Cublas starts'

! timing report file (unit 1 is free again after the input file was closed)
  open(1, file='Inverse_GPU_Time.txt', status='unknown')

! start
  call cpu_time(time1)
  print *, 'Starting CUBLAS'

! create handle
  istat = cublasCreate(h)
  if (istat /= CUBLAS_STATUS_SUCCESS) write(*,*) 'cublasCreate failed: ', istat
  write(*,*) 'cublasCreate status: ', istat

! build an array of pointers (only entry batchCount is set, which is the
! whole table because batchCount is 1)
  devPtrA(batchCount) = c_devloc(A_d(1,1))
  devPtrC(batchCount) = c_devloc(C_d(1,1))

  devPtrA_d = devPtrA
  devPtrC_d = devPtrC

! LU factorization, in place in A_d; keep the return code in istat so the
! check below tests this call and not the earlier cublasCreate
  istat = cublasDgetrfBatched(h, DI, devPtrA_d, DI, ipvt, info, batchCount)
  if (istat /= CUBLAS_STATUS_SUCCESS) write(*,*) 'cublasDgetrfBatched failed: ', istat
  write(*,*) 'cublasDgetrfBatched status: ', istat

  ! A nonzero info entry reports a problem with that matrix's factorization,
  ! in which case the inverse computed below is not meaningful.
  info_h = info
  if (any(info_h /= 0)) write(*,*) 'cublasDgetrfBatched info: ', info_h

! inversion from the LU factors; the result is written to C_d
  istat = cublasDgetriBatched(h, DI, devPtrA_d, DI, ipvt, devPtrC_d, DI, info, batchCount)
  if (istat /= CUBLAS_STATUS_SUCCESS) write(*,*) 'cublasDgetriBatched failed: ', istat
  write(*,*) 'cublasDgetriBatched status: ', istat

  C = C_d
  write(*,*) C

  istat = cudaDeviceSynchronize()
  call cpu_time(time2)
  write(1,*) 'Time spent GPU: ', time2-time1
  close(1)

  open(4, file=outfile)
  do i = 1, DI
    do j = 1, DI
      write(4,*) C(i,j)
    end do
  end do
  close(4)

! cleanup
  istat = cublasDestroy(h)
end program MatrixInverse

MatColgrove · January 30, 2017, 6:00pm

Hi A.Torky,

As of CUDA 7.0, the cuBLAS libraries are no longer supported on 32-bit Windows.

From the CUDA 7.0 release notes: http://developer.download.nvidia.com/compute/cuda/7_0/Prod/doc/CUDA_Toolkit_Release_Notes.pdf

Certain CUDA Features on 32-bit and 32-on-64-bit Windows Systems
The CUDA Toolkit no longer supports 32-bit Windows operating systems.
Furthermore, the Windows CUDA Driver no longer supports Tesla and Quadro
products on 32-bit Windows operating systems. Additionally, on 64-bit Windows
operating systems, the following features are no longer supported by the CUDA
driver or CUDA toolkit:
‣ Running 32-bit applications on Tesla and Quadro products
‣ Using the Thrust library from 32-bit applications
‣ 32-bit versions of the CUDA Toolkit scientific libraries, including cuBLAS,
cuSPARSE, cuFFT, cuRAND, and NPP
‣ 32-bit versions of the CUDA samples

-Mat

Topic		Replies	Views
CUDA Pro Tip: How to Call Batched cuBLAS routines from CUDA Fortran Technical Blog	4	358	April 27, 2015
New cuBlas function "getrfBatched" Legacy PGI Compilers	5	8515	January 21, 2014
Trouble with cublas and matrix inverse in Fortran Legacy PGI Compilers	1	1927	April 9, 2018
Interfacing OpenACC with cublasDgetrfBatched/cusparseDgtsvStridedBatch in Fortran GPU-Accelerated Libraries	2	1161	April 11, 2014
CUBLAS Library and Fortran Bindings Fortran 2003 provides interoperability standards CUDA Programming and Performance	4	3224	April 24, 2009
Using CUDA Libraries from CUDA Fortran Device Code Legacy PGI Compilers	6	7434	July 19, 2017
How to call cublas library into my cuda fortran code? Legacy PGI Compilers	5	5934	December 8, 2011
cublasGemmEx doesn't work with INT8 utilizing __dp4a instruction on NVIDIA 1080TI CUDA Programming and Performance	12	3625	September 25, 2017
cuBLAS call from kernel in CUDA 10.0 GPU-Accelerated Libraries	9	4813	April 7, 2021
0: copyout Memcpy... FAILED: 4(unspecified launch failure) Legacy PGI Compilers	3	5241	May 13, 2013

Calling CUBLAS' cublasDgetrfBatched proper procedure

Related topics