Memory space used by cusolverDN_getrf

Hi Jo, here is a CUDA Fortran version. It could easily be adapted to OpenACC or other programming models.

!> Factor a dense M x N double-precision matrix with cusolverDnXgetrf, solve
!> for NRHS right-hand sides with cusolverDnXgetrs, and check the solutions.
!>
!> Right-hand side 1 is the row sums of A, so its expected solution is all
!> ones. Column j (j >= 2) is column 1 scaled by bscal(j), so its expected
!> solution is bscal(j) in every entry.
program testdgetrf
use cutensorex
use cusolverDn
use cudafor
! NOTE(review): `implicit none` is not added here because it is unconfirmed
! which of the modules above exports CUBLAS_OP_N; every variable of this
! program is declared explicitly below. Confirm and then add `implicit none`.
integer, parameter :: M = 8000
integer, parameter :: N = M
integer, parameter :: NRHS = 128       ! number of right-hand sides
integer, parameter :: R8_BYTES = 8     ! bytes per real(8) workspace element
real(8), parameter :: eps = 1.0d-7
real(8), managed :: a(M,N), b(M,NRHS)
real(8), managed :: bscal(NRHS)
real(8) :: adiff, bmin, bmax, t0, t1
integer(8), managed :: ipiv(M)
integer(4), managed :: fsinfo(2)
integer(4) :: lda, ldb
integer(8) :: devsz, hostsz
integer :: i, j, istat, jstat, syncstat, nerrors
type(cusolverDnHandle) :: h
type(cusolverDnParams) :: p
real(8), device, allocatable :: dwork(:)
real(8), allocatable :: hwork(:)

! Random matrix with the entries within 3 of the diagonal boosted by 10x.
call random_number(a)
!$cuf kernel do(2) <<< *,* >>>
do j = 1, n
  do i = 1, m
    if (abs(i-j) .lt. 4) then
      a(i,j) = a(i,j) * 10.0d0
    end if
  end do
end do

! First right-hand side: row sums of A. The others are scaled copies of it.
b(:,1) = sum(a,dim=2)
call random_number(bscal)
!$cuf kernel do(2)<<< *,* >>>
do j = 2, NRHS
  do i = 1, M
    b(i,j) = b(i,1) * bscal(j)
  end do
end do

! Leading dimensions of a and b; both arrays are declared with M rows.
lda = m
ldb = m

istat = cusolverDnCreate(h)
print *,"cusolver handle create status = ",istat

istat = cusolverDnCreateParams(p)
print *,"cusolver create params status = ",istat

istat = cusolverDnSetAdvOptions(p, CUSOLVERDN_GETRF, CUSOLVER_ALG_1)
print *,"cusolver set adv options status = ",istat

istat = cusolverDnXgetrf_buffersize(h, p, m, n, cudaDataType(CUDA_R_64F), &
            a, lda, cudaDataType(CUDA_R_64F), devsz, hostsz )

print *,"cusolver Xgetrf buffersize status = ",istat
print *,"cusolver Xgetrf buffersize dev, host size = ", devsz, hostsz

! Without valid sizes the allocations below would be meaningless.
if (istat /= 0) error stop "cusolverDnXgetrf_buffersize failed"

! The sizes are byte counts; round up so that a size which is not a multiple
! of 8 still gets a large enough real(8) buffer.
allocate(dwork((devsz + R8_BYTES - 1) / R8_BYTES))
allocate(hwork((hostsz + R8_BYTES - 1) / R8_BYTES))

call cpu_time(t0)
istat = cusolverDnXgetrf(h, p, m, n, cudaDataType(CUDA_R_64F), a, lda, &
            ipiv, cudaDataType(CUDA_R_64F), dwork, devsz, hwork, hostsz, fsinfo(1))

jstat = cusolverDnXgetrs(h, p, CUBLAS_OP_N, n, NRHS, cudaDataType(CUDA_R_64F), &
            a, lda, ipiv, cudaDataType(CUDA_R_64F), b, ldb, fsinfo(2))
! Keep the synchronize status in its own variable so that the getrf status
! reported below is not overwritten.
syncstat = cudaDeviceSynchronize()
call cpu_time(t1)
print *,"dgetrf return",istat, fsinfo(1)
print *,"dgetrs return",jstat, fsinfo(2)
if (syncstat /= 0) print *,"cudaDeviceSynchronize status = ",syncstat

! Every entry of solution column i should equal bscal(i) (1 for column 1).
nerrors = 0
bscal(1) = 1.0d0
do i = 1, NRHS
  bmin = minval(b(:,i))
  bmax = maxval(b(:,i))
  adiff = abs(bmin-bscal(i)) + abs(bmax-bscal(i))
  if (adiff.gt.eps) then
    nerrors = nerrors + 1
    write (*,'(10(1x,f12.8))') bmin, bmax, bscal(i)
  endif
end do
if (nerrors.eq.0) then
  print *,"test PASSED"
else
  print *,"test FAILED, columns in error = ",nerrors
end if
print *,"Time for dgetrf and dgetrs using cpu_time: ",t1-t0

! Release the work buffers and the cuSOLVER objects.
deallocate(dwork)
deallocate(hwork)
istat = cusolverDnDestroyParams(p)
istat = cusolverDnDestroy(h)
end program testdgetrf

I compiled it with nvfortran 21.5 like this: nvfortran tdgetrf.cuf -cudalib=cutensor,curand,cusolver