Hi sk99,
The intent of the NVLAmath interface is that you don’t need to make any modifications to your existing code, but simply compile with the flags “-stdpar -gpu=nvlamath -cudalib=nvlamath”. The “-gpu=nvlamath” flag brings in the “nvhpc_nvlamath” module which has a generic interface that gets mapped to the cuSolver routine.
In other words, you should be able to simply add these flags to your existing program and these routines are implicitly offloaded.
One caveat is that we need to use CUDA Unified Memory to handle the data management. On systems like NVIDIA’s Grace that use HMM, all host memory is accessible on the device. A non-HMM system would use “managed” memory, where only allocated data is accessible.
The “-stdpar” flag implicitly sets the CUDA Unified Memory “-gpu=mem:managed” or “-gpu=mem:unified” flag, depending on your system.
To test, I found this simple dsyevd example, which — as I confirmed by profiling the code with Nsight Systems — does get implicitly offloaded.
!> Compute all eigenvalues and eigenvectors of a small symmetric matrix
!> with LAPACK's dsyevd (divide-and-conquer), using the standard
!> two-phase pattern: workspace-size query, then the actual solve.
program dsyevd_example
implicit none
integer, parameter :: n = 3
double precision, allocatable :: A(:,:), W(:)
double precision, allocatable :: work(:)
integer, allocatable :: iwork(:)
integer :: lwork, liwork, info
allocate(A(n,n))
allocate(W(n))
! Example symmetric matrix; with 'U' only the upper triangle is referenced
A = reshape([ 1.0d0, 2.0d0, 3.0d0, &
2.0d0, 4.0d0, 5.0d0, &
3.0d0, 5.0d0, 6.0d0 ], shape(A))
print *, "Input matrix A:"
call print_matrix(A, n)
! Workspace query: lwork = liwork = -1 asks dsyevd to report the
! optimal sizes in work(1) and iwork(1) without computing anything
lwork = -1
liwork = -1
allocate(work(1), iwork(1))
call dsyevd('V', 'U', n, A, n, W, work, lwork, iwork, liwork, info)
if (info /= 0) then
print *, "DSYEVD workspace query failed, INFO =", info
stop 1
end if
! Allocate the optimal workspace reported by the query
lwork = int(work(1))
liwork = iwork(1)
deallocate(work, iwork)
allocate(work(lwork), iwork(liwork))
! Actual computation: eigenvalues land in W (ascending),
! eigenvectors overwrite A column by column
call dsyevd('V', 'U', n, A, n, W, work, lwork, iwork, liwork, info)
if (info /= 0) then
print *, "DSYEVD failed, INFO =", info
stop 1
end if
print *, "Eigenvalues:"
print *, W
print *, "Eigenvectors (columns of A):"
call print_matrix(A, n)
deallocate(work, iwork)
deallocate(A)
deallocate(W)
contains
!> Print an n-by-n matrix, one row per line.
subroutine print_matrix(M, n)
! n must be declared before M so it is available for the
! specification expression M(n,n) under implicit none
integer, intent(in) :: n
double precision, intent(in) :: M(n,n)
integer :: i
do i = 1, n
! unlimited repeat '*' adapts the row width to any n
print '(*(F10.5))', M(i,:)
end do
end subroutine print_matrix
end program dsyevd_example
Compiled for the host:
% nvfortran -Ofast -llapack dsyevd1.F90; a.out
Input matrix A:
1.00000 2.00000 3.00000
2.00000 4.00000 5.00000
3.00000 5.00000 6.00000
Eigenvalues:
-0.5157294715892563 0.1709151888271795 11.34481428276207
Eigenvectors (columns of A):
0.73698 0.59101 0.32799
0.32799 -0.73698 0.59101
-0.59101 0.32799 0.73698
Same code compiled for the GPU (on a GH100 system):
% nvfortran -stdpar -gpu=nvlamath -cudalib=nvlamath dsyevd1.F90; a.out
Input matrix A:
1.00000 2.00000 3.00000
2.00000 4.00000 5.00000
3.00000 5.00000 6.00000
Eigenvalues:
-0.5157294715892582 0.1709151888271763 11.34481428276208
Eigenvectors (columns of A):
0.73698 0.59101 0.32799
0.32799 -0.73698 0.59101
-0.59101 0.32799 0.73698
And the nsys profile from the command “nsys profile --stats=true a.out”
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- --------- --------- -------- -------- ----------- ---------------------------------------------------------------- ------------------------------------
76.2 230,752 1 230,752.0 230,752.0 230,752 230,752 0.0 __pgi_dev_cumemset_8n
8.1 24,448 1 24,448.0 24,448.0 24,448 24,448 0.0 void lansy_M_stage1<double, double, (int)8>(long, const T1 *, un signed long, T2 *, int)
7.1 21,440 3 7,146.7 1,568.0 1,344 18,528 9,857.2 copy_info_kernel(int, int *)
2.3 6,976 1 6,976.0 6,976.0 6,976 6,976 0.0 void steqr_ker<stedc_params_<double2, double, (int)8, (int)16, ( int)128>, double, (int)0>(long, T1:…
1.0 2,976 1 2,976.0 2,976.0 2,976 2,976 0.0 void sytrd4_cta<sytrd_params<double, (int)16, (int)32, (int)32, (int)32, (int)0, (int)1, (int)2>, (…
0.7 2,240 1 2,240.0 2,240.0 2,240 2,240 0.0 void hermitianize_matrix_using_upper_triangular_part<double, (lo ng)256>(long, T1 *, long)
0.7 2,144 1 2,144.0 2,144.0 2,144 2,144 0.0 void epilogue<sytrd_params<double, (int)32, (int)8, (int)512, (i nt)32, (int)16, (int)1, (int)2>, (i…
0.7 2,048 1 2,048.0 2,048.0 2,048 2,048 0.0 void ormqr_cta_kernel<double, (int)4, (int)1>(long, long, long, const T1 *, long, long, const T1 *,…
0.6 1,920 1 1,920.0 1,920.0 1,920 1,920 0.0 void lansy_M_stage2<double, (int)8>(long, T1 *)
0.6 1,856 1 1,856.0 1,856.0 1,856 1,856 0.0 void scale_max<double, (int)32, (int)32, (int)1>(long, long, T1 *, T1 *, T1 *, int *)
0.6 1,728 1 1,728.0 1,728.0 1,728 1,728 0.0 void scale_max<double, (int)32, (int)128, (int)0>(long, long, T1 *, T1 *, T1 *, int *)
0.5 1,472 1 1,472.0 1,472.0 1,472 1,472 0.0 __pgi_dev_cumemset_4n
0.5 1,440 1 1,440.0 1,440.0 1,440 1,440 0.0 void lacpy_kernel<double, double, (int)5, (int)3>(long, long, co nst T1 *, long, long, T2 *, long, l…
0.4 1,248 1 1,248.0 1,248.0 1,248 1,248 0.0 xx_set_info_ker(int, int *, int)
Hope this helps,
Mat