How to compute the singular value using the cusolver library with less memory

Hi,

I am using the cusolver library to calculate the SVD. According to official examples, the functions “cusolverDnXgesvd()”, “cusolverDnXgesvdp()”, and “cusolverDnXgesvr()” can work well. However, I realize that a large of memory are required for three functions. For example, when the matrix with the size of12168*12168 is solved, the cusolverDnXgesvd(), cusolverDnXgesvdp(), and cusolverDnXgesvr() required 6806 MB, 8474 MB, and 7940MB memory. I make sure that k in “k-rank” is equal to 1 when cusolverDnXgesvr() is used.

I want to know whether all functions require similar memory.

I provide a part of my codes for you, which contain the process of calling three functions:

	SUBROUTINE Regularization_Initial (Lambda, Jac_T)
	IMPLICIT NONE

	REAL(r8k),intent(inout)	::Lambda
	REAL(r8k), intent(in)::Jac_T(Model_Num, NST)

	INTEGER(i4k):: Hessian_row,Hessian_col,WtW_row
	INTEGER(i4k):: moment, point, idx_tol
    INTEGER(i8k)::t1,t2,count_rate           !t1 denotes original cpu time , t2 denotes the end cpu time and t=t2-t1
    REAL(r8k)   ::t
	CHARACTER(10)::str_6
	CHARACTER(20)::str_2,str_3,str_4
	CHARACTER(4)::str_1
	!============= Assemble Hessian matrix APIs =============
    REAL(r8k) ::Jac_TCopy(Model_Num, NST)
    REAL(r8k) ::Jac(Model_Num,NST),Hessian(Model_Num,Model_Num)
    REAL(r8k) ::beta, alpha1
	INTEGER(i4k) :: start_pos, end_pos
    !============= Calculate singular value APIs =============
    TYPE(cusolverDnHandle):: handle								! A handle that handle to the cuSolver library context 
    TYPE(cusolverDnParams):: params								! A handle that handle to the cuSolver library context 
    INTEGER(i8k)::rows 											! The number of rows of the dense matrix
    INTEGER(i8k)::cols											! The number of columns of the dense matrix        
	INTEGER(i4k)::jobz,econ
	INTEGER(i8k)::k,oversampling,niters
    integer(i8k)  :: workspaceInBytesOnDevice, workspaceInBytesOnHost
    integer(i4k), device :: devinfo
    INTEGER(i1k), DIMENSION(:), device, ALLOCATABLE :: bufferOnDevice	
    INTEGER(i1k), DIMENSION(:), ALLOCATABLE :: bufferOnHost
    
    INTEGER(r8k) :: lda, ldu, ldvt
    CHARACTER(i1k) :: jobu, jobv
    REAL(r8k), device, DIMENSION(:,:), ALLOCATABLE:: U,VT
    REAL(r8k), device, DIMENSION(:),   ALLOCATABLE:: S
    REAL(r8k), device :: Lambda_up, Lambda_down
    REAL(r8k)::laida_up,laida_down
	REAL(r8k)::h_err_sigma

    !============= Host problem definition =============
    if(test) then
        write(*,*) "*******************This Is The Subroutine Compute_Gradient_GN*******************************"
    endif
    !============= Record the computation time =============
    
	Hessian = 0
	alpha1 = 1.d0
	beta = 0.d0
	Jac_TCopy = Jac_T
	Jac = Jac_T
	write(str_1,"(I4)")Num_Regularization+1
	write(*,*)"========"//TRIM(str_1)//"th Regularization calculation==========="
	

    !$OMP PARALLEL num_threads(Threads_Num)
    !$OMP DO 
    do Hessian_col = 1, NST
        Jac_TCopy( :,Hessian_col ) = Jac_TCopy( :,Hessian_col ) / Dataerr_Variance(Hessian_col)**2
    enddo
    !$OMP END DO
    !$OMP END PARALLEL 

    !call DGEMM('N', 'T', Model_Num, Model_Num, NST, alpha1, Jac_TCopy, Model_Num, Jac, Model_Num, beta, Hessian, Model_Num)
    !$acc data copyin(Jac_TCopy,Jac) copy(Hessian)
    Call cublasDgemm( 'N', 'T', Model_Num, Model_Num, NST, alpha1, Jac_TCopy, Model_Num, Jac, Model_Num, beta, Hessian, Model_Num )
    ! istat = cublasDgemm_v2(handle_WtW,'N', 'T', Model_Num, Model_Num, NST, alpha1, Jac_TCopy, Model_Num, Jac, Model_Num, beta, Hessian, Model_Num )
	!$acc end data
	rows = Model_Num
	cols = Model_Num

	econ = 1

	lda = rows
	ldu = rows
	ldvt = cols

	jobu  = "S"
	jobv = "S"

	call system_clock(count_rate=count_rate)
    call system_clock (t1) 
	! ============= Device memory management =============
	istat = cusolverDnCreate(handle)
	istat = cusolverDnSetStream(handle,acc_get_cuda_stream(1)) 
	istat = cusolverDnCreateParams(params)
	istat = cusolverDnSetAdvOptions(params, CUSOLVERDN_GETRF, CUSOLVER_ALG_0)

	!$acc data copyin(Mat_Wm2%Pos_Row_Firstdata,Mat_Wm2%Col_List, Mat_Wm2%Val_List) &
	!$acc  copyin(Hessian,Lambda,laida_up,laida_down)

		ALLOCATE( S(cols),U(ldu,rows),VT(ldvt,cols) )
		write(*,*)"========Solve the singular value of JtJ matrix==========="
		istat = cusolverDnXgesvd_bufferSize(handle,params,jobu,jobv,rows,cols,cudaDataType(CUDA_R_64F),&
									Hessian,lda,cudaDataType(CUDA_R_64F),S,cudaDataType(CUDA_R_64F),U,ldu,&
									cudaDataType(CUDA_R_64F),VT,ldvt,cudaDataType(CUDA_R_64F),&
									workspaceInBytesOnDevice, workspaceInBytesOnHost)
		allocate(bufferOnDevice(workspaceInBytesOnDevice) , bufferOnHost(workspaceInBytesOnHost)  )
		istat = cusolverDnXgesvd(handle,params,jobu,jobv,rows,cols,cudaDataType(CUDA_R_64F),&
									Hessian,lda,cudaDataType(CUDA_R_64F),S,cudaDataType(CUDA_R_64F),U,ldu,&
									cudaDataType(CUDA_R_64F),VT,ldvt,cudaDataType(CUDA_R_64F),&
									bufferOnDevice,workspaceInBytesOnDevice,&
									bufferOnHost,workspaceInBytesOnHost,&
									devinfo)

		if(istat/=0)then 
			write(*,*)"Some superdiagonals are not converge to zero. Maybe Hessian matrix is not very good "
			write(*,*)istat
		endif

		!$acc kernels 
		Lambda_up = S(1)
		laida_up = Lambda_up

		!$acc loop collapse(2)
		do Hessian_row=1, Model_Num
			do Hessian_col=1, Model_Num
				Hessian( Hessian_row, Hessian_col ) = 0.0
			enddo
		enddo

		!$acc loop
		do Hessian_row=1, Model_Num
			!$acc loop seq
			do WtW_row=Mat_Wm2%Pos_Row_Firstdata(Hessian_row), Mat_Wm2%Pos_Row_Firstdata(Hessian_row+1)-1
				Hessian_col = Mat_Wm2%Col_List(WtW_row)
				Hessian( Hessian_row, Mat_Wm2%Col_List(Hessian_col) ) = Mat_Wm2%Val_List(WtW_row)
			enddo
		enddo
		!$acc end kernels

		istat = cusolverDnXgesvd(handle,params,jobu,jobv,rows,cols,cudaDataType(CUDA_R_64F),&
									Hessian,lda,cudaDataType(CUDA_R_64F),S,cudaDataType(CUDA_R_64F),U,ldu,&
									cudaDataType(CUDA_R_64F),VT,ldvt,cudaDataType(CUDA_R_64F),&
									bufferOnDevice,workspaceInBytesOnDevice,&
									bufferOnHost,workspaceInBytesOnHost,&
									devinfo)

		if(istat/=0)then 
			write(*,*)"Some superdiagonals are not converge to zero. Maybe Hessian matrix is not very good "
			write(*,*)istat
		endif
		call system_clock (t2)
		t = REAL(t2 - t1) / REAL(count_rate)
		write(str_6,"(f10.6)")t/60.0
		write(*,*) " The computing time of the regularization is:"//TRIM(str_6)//"min"  
		!$acc kernels
		Lambda_down = S(1)
		laida_down = Lambda_down
		if(istat/=0)then 
			CALL Regularization_Cooling( Lambda, Num_Regularization+1 )
		else
			Lambda = Lambda_up/Lambda_down
		endif
		!$acc end kernels

		deallocate( bufferOnDevice,bufferOnHost )
		deallocate( S,U,VT )

		!$acc update host(Lambda,laida_down,laida_up)
	!$acc end data

	istat = cusolverDnDestroyParams(params)
	istat = cusolverDnDestroy(handle) 

	Num_Regularization = Num_Regularization+1
	write(str_2,"(f20.10)")Lambda
	write(str_3,"(f20.10)")laida_up
	write(str_4,"(f20.10)")laida_down
	write(*,*)"The new lambda is"//TRIM(str_2)//"  ("//TRIM(str_3)//";"//TRIM(str_4)//")"
	!====================================================================================================================


	!=========================cusolverDnXgesvdp===========================================================================
	IMPLICIT NONE

	REAL(r8k),intent(inout)	::Lambda
	REAL(r8k), intent(in)::Jac_T(Model_Num, NST)

	INTEGER(i4k):: Hessian_row,Hessian_col,WtW_row
	INTEGER(i4k):: moment, point, idx_tol
    INTEGER(i8k)::t1,t2,count_rate           !t1 denotes original cpu time , t2 denotes the end cpu time and t=t2-t1
    REAL(r8k)   ::t
	CHARACTER(10)::str_6
	CHARACTER(20)::str_2,str_3,str_4
	CHARACTER(4)::str_1
	!============= Assemble Hessian matrix APIs =============
    REAL(r8k) ::Jac_TCopy(Model_Num, NST)
    REAL(r8k) ::Jac(Model_Num,NST),Hessian(Model_Num,Model_Num)
    REAL(r8k) ::beta, alpha1
	INTEGER(i4k) :: start_pos, end_pos
    !============= Calculate singular value APIs =============
    TYPE(cusolverDnHandle):: handle								! A handle that handle to the cuSolver library context 
    TYPE(cusolverDnParams):: params								! A handle that handle to the cuSolver library context 
    INTEGER(i8k)::rows 											! The number of rows of the dense matrix
    INTEGER(i8k)::cols											! The number of columns of the dense matrix        
	INTEGER(i4k)::jobz,econ
	INTEGER(i8k)::k
    integer(i8k)  :: workspaceInBytesOnDevice, workspaceInBytesOnHost
    integer(i4k), device :: devinfo
    INTEGER(i1k), DIMENSION(:), device, ALLOCATABLE :: bufferOnDevice	
    INTEGER(i1k), DIMENSION(:), ALLOCATABLE :: bufferOnHost
    
    INTEGER(r8k) :: lda, ldu, ldvt
    CHARACTER(i1k) :: jobu, jobvt, jobv
    REAL(r8k), device, DIMENSION(:,:), ALLOCATABLE:: U,VT
    REAL(r8k), device, DIMENSION(:),   ALLOCATABLE:: S
    REAL(r8k), device :: Lambda_up, Lambda_down
    REAL(r8k)::laida_up,laida_down
	REAL(r8k)::h_err_sigma

    !============= Host problem definition =============
    if(test) then
        write(*,*) "*******************This Is The Subroutine Compute_Gradient_GN*******************************"
    endif
    !============= Record the computation time =============
    
	Hessian = 0
	alpha1 = 1.d0
	beta = 0.d0
	Jac_TCopy = Jac_T
	Jac = Jac_T
	write(str_1,"(I4)")Num_Regularization+1
	write(*,*)"========"//TRIM(str_1)//"th Regularization calculation==========="
	

    !$OMP PARALLEL num_threads(Threads_Num)
    !$OMP DO 
    do Hessian_col = 1, NST
        Jac_TCopy( :,Hessian_col ) = Jac_TCopy( :,Hessian_col ) / Dataerr_Variance(Hessian_col)**2
    enddo
    !$OMP END DO
    !$OMP END PARALLEL 

    !call DGEMM('N', 'T', Model_Num, Model_Num, NST, alpha1, Jac_TCopy, Model_Num, Jac, Model_Num, beta, Hessian, Model_Num)
    !$acc data copyin(Jac_TCopy,Jac) copy(Hessian)
    Call cublasDgemm( 'N', 'T', Model_Num, Model_Num, NST, alpha1, Jac_TCopy, Model_Num, Jac, Model_Num, beta, Hessian, Model_Num )
    ! istat = cublasDgemm_v2(handle_WtW,'N', 'T', Model_Num, Model_Num, NST, alpha1, Jac_TCopy, Model_Num, Jac, Model_Num, beta, Hessian, Model_Num )

	!$acc end data
	rows = Model_Num
	cols = Model_Num

	econ = 1

	lda = rows
	ldu = rows
	ldvt = cols

	jobz = CUSOLVER_EIG_MODE_NOVECTOR

	call system_clock(count_rate=count_rate)
    call system_clock (t1) 
	! ============= Device memory management =============
	istat = cusolverDnCreate(handle)
	istat = cusolverDnSetStream(handle,acc_get_cuda_stream(1)) 
	istat = cusolverDnCreateParams(params)
	istat = cusolverDnSetAdvOptions(params, CUSOLVERDN_GETRF, CUSOLVER_ALG_0)

	!$acc data copyin(Mat_Wm2%Pos_Row_Firstdata,Mat_Wm2%Col_List, Mat_Wm2%Val_List) &
	!$acc  copyin(Hessian,Lambda,laida_up,laida_down)

		ALLOCATE( S(cols),U(ldu,rows),VT(ldvt,cols) )
		write(*,*)"========Solve the singular value of JtJ matrix==========="


		istat = cusolverDnXgesvdp_bufferSize(handle,params,jobz,econ,rows,cols,cudaDataType(CUDA_R_64F),&
									Hessian,lda,cudaDataType(CUDA_R_64F),S,cudaDataType(CUDA_R_64F),U,ldu,&
									cudaDataType(CUDA_R_64F),VT,ldvt,cudaDataType(CUDA_R_64F),&
									workspaceInBytesOnDevice, workspaceInBytesOnHost)
		allocate(bufferOnDevice(workspaceInBytesOnDevice) , bufferOnHost(workspaceInBytesOnHost)  )
		
		istat = cusolverDnXgesvdp(handle,params,jobz,econ,rows,cols,cudaDataType(CUDA_R_64F),&
									Hessian,lda,cudaDataType(CUDA_R_64F),S,cudaDataType(CUDA_R_64F),U,ldu,&
									cudaDataType(CUDA_R_64F),VT,ldvt,cudaDataType(CUDA_R_64F),&
									bufferOnDevice,workspaceInBytesOnDevice,&
									bufferOnHost,workspaceInBytesOnHost,&
									devinfo,h_err_sigma)

		if(istat/=0)then 
			write(*,*)"Some superdiagonals are not converge to zero. Maybe Hessian matrix is not very good "
			write(*,*)istat
		endif
		!$acc kernels 
		 Lambda_up = S(1)
		laida_up = Lambda_up

		!$acc loop collapse(2)
		do Hessian_row=1, Model_Num
			do Hessian_col=1, Model_Num
				Hessian( Hessian_row, Hessian_col ) = 0.0
			enddo
		enddo

		!$acc loop
		do Hessian_row=1, Model_Num
			!$acc loop seq
			do WtW_row=Mat_Wm2%Pos_Row_Firstdata(Hessian_row), Mat_Wm2%Pos_Row_Firstdata(Hessian_row+1)-1
				Hessian_col = Mat_Wm2%Col_List(WtW_row)
				Hessian( Hessian_row, Mat_Wm2%Col_List(Hessian_col) ) = Mat_Wm2%Val_List(WtW_row)
			enddo
		enddo
		!$acc end kernels

		istat = cusolverDnXgesvdp(handle,params,jobz,econ,rows,cols,cudaDataType(CUDA_R_64F),&
									Hessian,lda,cudaDataType(CUDA_R_64F),S,cudaDataType(CUDA_R_64F),U,ldu,&
									cudaDataType(CUDA_R_64F),VT,ldvt,cudaDataType(CUDA_R_64F),&
									bufferOnDevice,workspaceInBytesOnDevice,&
									bufferOnHost,workspaceInBytesOnHost,&
									devinfo,h_err_sigma)


		if(istat/=0)then 
			write(*,*)"Some superdiagonals are not converge to zero. Maybe Hessian matrix is not very good "
			write(*,*)istat
		endif
		call system_clock (t2)
		t = REAL(t2 - t1) / REAL(count_rate)
		write(str_6,"(f10.6)")t/60.0
		write(*,*) " The computing time of the regularization is:"//TRIM(str_6)//"min"  
		!$acc kernels
		Lambda_down = S(1)
		laida_down = Lambda_down
		if(istat/=0)then 
			CALL Regularization_Cooling( Lambda, Num_Regularization+1 )
		else
			Lambda = Lambda_up/Lambda_down
		endif
		!$acc end kernels

		deallocate( bufferOnDevice,bufferOnHost )
		deallocate( S,U,VT )

		!$acc update host(Lambda,laida_down,laida_up)
	!$acc end data

	istat = cusolverDnDestroyParams(params)
	istat = cusolverDnDestroy(handle) 

	Num_Regularization = Num_Regularization+1
	write(str_2,"(f20.10)")Lambda
	write(str_3,"(f20.10)")laida_up
	write(str_4,"(f20.10)")laida_down
	write(*,*)"The new lambda is"//TRIM(str_2)//"  ("//TRIM(str_3)//";"//TRIM(str_4)//")"
	!=======================================================================================================


	!=========================cusolverDnXgesvdr=============================================================
	IMPLICIT NONE

	REAL(r8k),intent(inout)	::Lambda
	REAL(r8k), intent(in)::Jac_T(Model_Num, NST)

	INTEGER(i4k):: Hessian_row,Hessian_col,WtW_row
	INTEGER(i4k):: moment, point, idx_tol
    INTEGER(i8k)::t1,t2,count_rate           !t1 denotes original cpu time , t2 denotes the end cpu time and t=t2-t1
    REAL(r8k)   ::t
	CHARACTER(10)::str_6
	CHARACTER(20)::str_2,str_3,str_4
	CHARACTER(4)::str_1
	!============= Assemble Hessian matrix APIs =============
    REAL(r8k) ::Jac_TCopy(Model_Num, NST)
    REAL(r8k) ::Jac(Model_Num,NST),Hessian(Model_Num,Model_Num)
    REAL(r8k) ::beta, alpha1
	INTEGER(i4k) :: start_pos, end_pos
    !============= Calculate singular value APIs =============
    TYPE(cusolverDnHandle):: handle								! A handle that handle to the cuSolver library context 
    TYPE(cusolverDnParams):: params								! A handle that handle to the cuSolver library context 
    INTEGER(i8k)::rows 											! The number of rows of the dense matrix
    INTEGER(i8k)::cols											! The number of columns of the dense matrix        
	INTEGER(i4k)::jobz,econ
	INTEGER(i8k)::k,oversampling,niters
    integer(i8k)  :: workspaceInBytesOnDevice, workspaceInBytesOnHost
    integer(i4k), device :: devinfo
    INTEGER(i1k), DIMENSION(:), device, ALLOCATABLE :: bufferOnDevice	
    INTEGER(i1k), DIMENSION(:), ALLOCATABLE :: bufferOnHost
    
    INTEGER(r8k) :: lda, ldu, ldvt
    CHARACTER(i1k) :: jobu, jobv
    REAL(r8k), device, DIMENSION(:,:), ALLOCATABLE:: U,VT
    REAL(r8k), device, DIMENSION(:),   ALLOCATABLE:: S
    REAL(r8k), device :: Lambda_up, Lambda_down
    REAL(r8k)::laida_up,laida_down
	REAL(r8k)::h_err_sigma

    !============= Host problem definition =============
    if(test) then
        write(*,*) "*******************This Is The Subroutine Compute_Gradient_GN*******************************"
    endif
    !============= Record the computation time =============
    
	Hessian = 0
	alpha1 = 1.d0
	beta = 0.d0
	Jac_TCopy = Jac_T
	Jac = Jac_T
	write(str_1,"(I4)")Num_Regularization+1
	write(*,*)"========"//TRIM(str_1)//"th Regularization calculation==========="

    !$OMP PARALLEL num_threads(Threads_Num)
    !$OMP DO 
    do Hessian_col = 1, NST
        Jac_TCopy( :,Hessian_col ) = Jac_TCopy( :,Hessian_col ) / Dataerr_Variance(Hessian_col)**2
    enddo
    !$OMP END DO
    !$OMP END PARALLEL 

    !$acc data copyin(Jac_TCopy,Jac) copy(Hessian)
    Call cublasDgemm( 'N', 'T', Model_Num, Model_Num, NST, alpha1, Jac_TCopy, Model_Num, Jac, Model_Num, beta, Hessian, Model_Num )

	!$acc end data
	rows = Model_Num
	cols = Model_Num

	k = 1
	oversampling = 2*k
	niters = 3

	lda = rows
	ldu = rows
	ldvt = cols

	jobu  = "S"
	jobv = "S"

	call system_clock(count_rate=count_rate)
    call system_clock (t1) 
	! ============= Device memory management =============
	istat = cusolverDnCreate(handle)
	istat = cusolverDnSetStream(handle,acc_get_cuda_stream(1)) 
	istat = cusolverDnCreateParams(params)
	istat = cusolverDnSetAdvOptions(params, CUSOLVERDN_GETRF, CUSOLVER_ALG_0)
	write(*,*)"22222222222",Model_Num
	!$acc data copyin(Mat_Wm2%Pos_Row_Firstdata,Mat_Wm2%Col_List, Mat_Wm2%Val_List) &
	!$acc  copyin(Hessian,Lambda,laida_up,laida_down)
		ALLOCATE( S(cols),U(ldu,rows),VT(ldvt,cols) )
		write(*,*)"========Solve the singular value of JtJ matrix==========="

		istat = cusolverDnXgesvdr_bufferSize(handle,params,jobu,jobv,rows,cols,k,oversampling,niters,cudaDataType(CUDA_R_64F),&
									Hessian,lda,cudaDataType(CUDA_R_64F),S,cudaDataType(CUDA_R_64F),U,ldu,&
									cudaDataType(CUDA_R_64F),VT,ldvt,cudaDataType(CUDA_R_64F),&
									workspaceInBytesOnDevice, workspaceInBytesOnHost)

		allocate(bufferOnDevice(workspaceInBytesOnDevice) , bufferOnHost(workspaceInBytesOnHost)  )
		
		istat = cusolverDnXgesvdr(handle,params,jobu,jobv,rows,cols,k,oversampling,niters,cudaDataType(CUDA_R_64F),&
									Hessian,lda,cudaDataType(CUDA_R_64F),S,cudaDataType(CUDA_R_64F),U,ldu,&
									cudaDataType(CUDA_R_64F),VT,ldvt,cudaDataType(CUDA_R_64F),&
									bufferOnDevice,workspaceInBytesOnDevice,&
									bufferOnHost,workspaceInBytesOnHost,&
									devinfo)

		if(istat/=0)then 
			write(*,*)"Some superdiagonals are not converge to zero. Maybe Hessian matrix is not very good "
			write(*,*)istat
		endif

		!$acc kernels 
		Lambda_up = S(1)
		laida_up = Lambda_up

		!$acc loop collapse(2)
		do Hessian_row=1, Model_Num
			do Hessian_col=1, Model_Num
				Hessian( Hessian_row, Hessian_col ) = 0.0
			enddo
		enddo

		!$acc loop
		do Hessian_row=1, Model_Num
			!$acc loop seq
			do WtW_row=Mat_Wm2%Pos_Row_Firstdata(Hessian_row), Mat_Wm2%Pos_Row_Firstdata(Hessian_row+1)-1
				Hessian_col = Mat_Wm2%Col_List(WtW_row)
				Hessian( Hessian_row, Mat_Wm2%Col_List(Hessian_col) ) = Mat_Wm2%Val_List(WtW_row)
			enddo
		enddo
		!$acc end kernels

		istat = cusolverDnXgesvdr(handle,params,jobu,jobv,rows,cols,k,oversampling,niters,cudaDataType(CUDA_R_64F),&
									Hessian,lda,cudaDataType(CUDA_R_64F),S,cudaDataType(CUDA_R_64F),U,ldu,&
									cudaDataType(CUDA_R_64F),VT,ldvt,cudaDataType(CUDA_R_64F),&
									bufferOnDevice,workspaceInBytesOnDevice,&
									bufferOnHost,workspaceInBytesOnHost,&
									devinfo)

		if(istat/=0)then 
			write(*,*)"Some superdiagonals are not converge to zero. Maybe Hessian matrix is not very good "
			write(*,*)istat
		endif
		call system_clock (t2)
		t = REAL(t2 - t1) / REAL(count_rate)
		write(str_6,"(f10.6)")t/60.0
		write(*,*) " The computing time of the regularization is:"//TRIM(str_6)//"min"  
		!$acc kernels

		Lambda_down = S(1)
		laida_down = Lambda_down
		if(istat/=0)then 
			CALL Regularization_Cooling( Lambda, Num_Regularization+1 )
		else
			Lambda = Lambda_up/Lambda_down
		endif
		!$acc end kernels

		deallocate( bufferOnDevice,bufferOnHost )
		deallocate( S,U,VT )

		!$acc update host(Lambda,laida_down,laida_up)
	!$acc end data

	istat = cusolverDnDestroyParams(params)
	istat = cusolverDnDestroy(handle) 

	Num_Regularization = Num_Regularization+1
	write(str_2,"(f20.10)")Lambda
	write(str_3,"(f20.10)")laida_up
	write(str_4,"(f20.10)")laida_down
	write(*,*)"The new lambda is"//TRIM(str_2)//"  ("//TRIM(str_3)//";"//TRIM(str_4)//")"
	RETURN
ENDSUBROUTINE Regularization_Initial

regards,
Amore.