I am using cublas with nvfortran to calculate batched matrix inverses and determinants. I am using nvfortran 25.7-0 64-bit target on x86-64 Linux -tp haswell
I have a set of different classes which can have different dimensions and batch sizes. I would like to be able to store the matrices, and their inverses, and the device pointers needed by cublas in these derived types.
Currently, if I define them as module arrays in the same module with the classes, the inverse works fine, but only one class can use these arrays. I think there may be more than one problem, but to start, a minimal test code is below. This code is able to have the matrices and their stored inverses in the derived type, but trying to move the pointers to device arrays in the derived type fails. My full code is also unable to work with the matrices in the derived type.
Here I get a compiler error if I compile with -DFAILS.
$nvfortran -cuda cmat.F90 -DFAILS
Lowering Error: symbol dp_smat_d$sd is an inconsistent array descriptor
Lowering Error: symbol dp_smat_d$sd is an inconsistent array descriptor
Lowering Error: symbol dp_smat_d$sd is an inconsistent array descriptor
Lowering Error: symbol dp_smat_d$sd is an inconsistent array descriptor
Lowering Error: symbol dp_smati_d$sd is an inconsistent array descriptor
Lowering Error: symbol dp_smati_d$sd is an inconsistent array descriptor
Lowering Error: symbol dp_smati_d$sd is an inconsistent array descriptor
Lowering Error: symbol dp_smati_d$sd is an inconsistent array descriptor
NVFORTRAN-F-0000-Internal compiler error. Errors in Lowering 8 (cmat.F90: 119)
NVFORTRAN/x86-64 Linux 25.7-0: compilation aborted
Compiling with no defines uses pointer variables not in the defined type and gives the correct inverses as tested by multiplying them together and comparing with the unit matrix.
$nvfortran -cuda cmat.F90
$./a.out
max inverse error 5.1636710387543281E-014
Here is the test code
module precision
use, intrinsic :: iso_fortran_env
implicit none
integer, public, parameter :: i4=int32
integer, public, parameter :: i8=int64
integer, public, parameter :: r8=real64
complex(kind=r8), public, parameter :: czero=(0.0_r8,0.0_r8)
complex(kind=r8), public, parameter :: ci=(0.0_r8,1.0_r8)
complex(kind=r8), public, parameter :: cone=(1.0_r8,0.0_r8)
end module precision
program cmattest
use cudafor
use cublas_v2
use, intrinsic :: iso_fortran_env
use, intrinsic :: iso_c_binding
use precision
implicit none
!host
integer(kind=i4) :: istat,nbatch,i,iwf,id
type(c_devptr), allocatable :: hp_smat_d(:,:),hp_smati_d(:,:)
complex(kind=r8), allocatable :: smat_h(:,:,:,:),smati_h(:,:,:,:)
complex(kind=r8), allocatable :: test(:,:)
real(kind=r8) :: err
integer, allocatable :: seed(:)
integer :: seedsize
type(cublasHandle) :: h
type :: t_test
integer(kind=i4) :: npart,ndet,nwf
complex(kind=r8), device, allocatable :: smat_d(:,:,:,:),smati_d(:,:,:,:)
integer(kind=i4), device, allocatable :: ipvt_d(:,:,:),info_d(:,:)
type(c_devptr), device, allocatable :: dp_smat_d(:,:),dp_smati_d(:,:)
end type
type(t_test) :: m
!device for working version
type(c_devptr), device, allocatable :: dp_smat_d(:,:),dp_smati_d(:,:)
!set up random number generator
call random_seed(size=seedsize)
allocate(seed(seedsize))
seed=17171717
call random_seed(put=seed)
!problem size
m%npart=16
m%ndet=3
m%nwf=97
nbatch=m%nwf*m%ndet
! host allocate
allocate(hp_smat_d(m%ndet,m%nwf),hp_smati_d(m%ndet,m%nwf))
allocate(smat_h(m%npart,m%npart,m%ndet,m%nwf))
allocate(smati_h(m%npart,m%npart,m%ndet,m%nwf))
allocate(test(m%npart,m%npart))
! fill input matrices with random values for test
smat_h=reshape(cr(m%npart**2*m%ndet*m%nwf),shape(smat_h))
! device allocate with and without derived type
allocate(m%dp_smat_d(m%ndet,m%nwf),m%dp_smati_d(m%ndet,m%nwf))
allocate(m%smat_d(m%npart,m%npart,m%ndet,m%nwf))
allocate(m%smati_d(m%npart,m%npart,m%ndet,m%nwf))
allocate(m%ipvt_d(m%npart,m%ndet,m%nwf),m%info_d(m%ndet,m%nwf))
! need these to avoid compile errors
allocate(dp_smat_d(m%ndet,m%nwf),dp_smati_d(m%ndet,m%nwf))
! transfer to device
m%smat_d=smat_h
! create cublas handle
istat=cublasCreate(h)
if (istat.ne.0) write (*,*) 'cuda error',istat
istat = cudaDeviceSynchronize()
if (istat.ne.0) write (*,*) 'cuda error',istat
! set up pointers needed by cublas_v2
do iwf=1,m%nwf
do id=1,m%ndet
hp_smat_d(id,iwf)=c_devloc(m%smat_d(1,1,id,iwf))
hp_smati_d(id,iwf)=c_devloc(m%smati_d(1,1,id,iwf))
enddo
enddo
#if defined(FAILS)
m%dp_smat_d=hp_smat_d
m%dp_smati_d=hp_smati_d
! batched lu decomposition
istat=cublasZgetrfBatched(h,m%npart,m%dp_smat_d,m%npart,m%ipvt_d &
,m%info_d,nbatch)
if (istat.ne.0) write (*,*) 'cuda error',istat
! batched inverse from lu
istat=cublasZgetriBatched(h,m%npart,m%dp_smat_d,m%npart,m%ipvt_d &
,m%dp_smati_d,m%npart,m%info_d,nbatch)
if (istat.ne.0) write (*,*) 'cuda error',istat
#else
dp_smat_d=hp_smat_d
dp_smati_d=hp_smati_d
! batched lu decomposition
istat=cublasZgetrfBatched(h,m%npart,dp_smat_d,m%npart,m%ipvt_d &
,m%info_d,nbatch)
if (istat.ne.0) write (*,*) 'cuda error',istat
! batched inverse from lu
istat=cublasZgetriBatched(h,m%npart,dp_smat_d,m%npart,m%ipvt_d &
,dp_smati_d,m%npart,m%info_d,nbatch)
istat = cudaDeviceSynchronize()
if (istat.ne.0) write (*,*) 'cuda error',istat
#endif
smati_h=m%smati_d
istat = cudaDeviceSynchronize()
if (istat.ne.0) write (*,*) 'cuda error',istat
smati_h=m%smati_d
istat = cudaDeviceSynchronize()
if (istat.ne.0) write (*,*) 'cuda error',istat
err=0.0_r8
do iwf=1,m%nwf
do id=1,m%ndet
test=matmul(smat_h(:,:,id,iwf),smati_h(:,:,id,iwf))
do i=1,m%npart
test(i,i)=test(i,i)-cone
enddo
err=max(err,maxval(abs(test)))
enddo
enddo
write (*,*) 'max inverse error ',err
! clean up
istat=cublasDestroy(h)
if (istat.ne.0) write (*,*) 'cuda error',istat
contains
function cr(n)
integer(kind=i4) :: n,i
complex(kind=r8) :: cr(n)
real :: r1(n),r2(n)
call random_number(r1)
call random_number(r2)
cr=(r1-0.5)+ci*(r2-0.5)
end function cr
function rr(n)
integer(kind=i4) :: n,i
real(kind=r8) :: rr(n)
real :: r1(n)
call random_number(r1)
rr=r1
end function rr
end program cmattest