Symmetric Matrix Inverse not correct with cusolverDnDsytri

Hello,
I have recently been trying to use cuda in Fortan for a project which requires finding the inverse of a matrix. I am first finding the Bunch-Kaufman factorization of a symmetric matrix via cusolverDnDsytrf then attempting to find the inverse with cusolverDnDsytri.

I did find this post which is similar to my problem: Wrong output of Matrix Inversion using cuSOLVER. However, as far as I can tell I am not messing around with non-blocking streams. I’m also new to using cuda though, so I’m not sure..

For compilation I’m using nvfortran from nvhpc version 24.9.

Would anyone be able to help figure out what I’m doing wrong?

Here is an example with a 3 by 3 symmetric matrix:

      program symmetric_inverse
      use cusolverDn
      use cublas
      integer n,lwork_factor,lwork_solve,error,col,row
      parameter(n=3)
      double precision A(n,n), A_inv(n,n), I(n,n)
      double precision, device :: dA_inv(n,n), dI(n,n), dA(n,n)
      double precision, allocatable, device :: dwork_factor(:)
      double precision, allocatable, device :: dwork_solve(:)
      integer, device :: pivots(n)
      integer, device :: devinfo
      integer local_devinfo
      type(cusolverDnHandle) :: h

      A = reshape((/ -1, 2, 0, 2, -5, 0, 0, 0, -1 /), shape(A))

C     send A to device so A can be inverted
      dA_inv = A

C     get the cusolver handle
      error = cusolverDnCreate(h)
      if (error .ne. CUBLAS_STATUS_SUCCESS) then
            print*, 'Could not create handle', error
            stop
      endif

C     get the buffer size needed for LDL^T calculations
      error = cusolverDnDsytrf_buffersize(h,n,dA_inv,n,lwork_factor)
      if (error .ne. CUBLAS_STATUS_SUCCESS) then
            print*, 'Could not get get LDL^T buffer size', error
            stop
      endif

C     allocate the workspace
      allocate(dwork_factor(lwork_factor))

C     Compute the LDL^T factorization
      error = cusolverDnDsytrf(h,CUBLAS_FILL_MODE_UPPER,
     &                         n,dA_inv,n,pivots,dwork_factor,
     &                         lwork_factor,devinfo)
      local_devinfo = devinfo
      if ((error.ne.CUBLAS_STATUS_SUCCESS).or.(local_devinfo.ne.0)) then
            print*, 'Could not find the LDL^T factorization', error
            print*, 'SYTRF Info:', local_devinfo
            stop
      endif

C     get the buffer size needed for inverse calculations 
      error = cusolverDnDsytri_buffersize(h,CUBLAS_FILL_MODE_UPPER,
     &                                    n,dA_inv,n,pivots,lwork_solve)
      if(error.ne.CUBLAS_STATUS_SUCCESS) then
            print*, 'Could not get inverse buffer size', error
            stop
      endif
 
C     Find the inverse of the matrix 
      allocate(dwork_solve(lwork_solve))
      error = cusolverDnDsytri(h,CUBLAS_FILL_MODE_UPPER,
     &                         n,dA_inv,n,pivots,dwork_solve,
     &                         lwork_solve,devinfo)
      local_devinfo = devinfo
      if((error.ne.CUBLAS_STATUS_SUCCESS).or.(local_devinfo.ne.0)) then
            print*, 'Could not compute the inverse from LDL', error
            print*, 'SYTRI Info:', local_devinfo
            stop
      endif

C     get the inverse from device
      A_inv = dA_inv

C     fill in the lower triangle of A
C     since upper is used for results in previous blas calls
      do row=1, n
            do col=row+1, n
                  A_inv(col,row) = A_inv(row,col)
            enddo
      enddo

C     send A and the now symmetric A_inv to device
      dA_inv = A_inv
      dA = A
C     do A*A_inv 
      call cublasDgemm('N','N',n,n,n,1.0d0,dA,n,dA_inv,n,0.0d0,dI,n)
C     get result of A*A_inv
      I = dI

C     print the results
      print*, A, '\n'
      print*, A_inv, '\n'
      print*, I, '\n'

      deallocate(dwork_factor)
      deallocate(dwork_solve)
      error = cusolverDnDestroy(h)
      end

Thank you!