Conjugate Gradient CUDA Fortran

Hi,

Has anyone come across an implementation of the conjugate gradient method in CUDA Fortran for solving Ax=b?

I know that NVIDIA has a CUDA C implementation of this using cuSparse and cuBlas. But I would like to have a native implementation if possible.

Any help is greatly appreciated.

Kirk

Hi Kirk,

I talked with Greg Ruetsch. He’s in the process of updating his “CUDA Fortran for Scientists and Engineers” book and is currently writing a chapter on using a CG. He’ll use calls to cuSparse and cuBlas, but via Fortran interfaces which should make them easier to work with.

Greg asked if you wanted to use a preconditioner or not?

  • Mat

Hi Mat,

That is good news to hear that an update of the book is on the way!

Including a preconditioner would be ideal and would help me out a lot.

Thank you,

Kirk

Hi,

Did Greg mention what the timeline for the new edition is?

Is there a possibility that I could test out his PCG implementation before release. Could be good validation for the algorithm to test with real simulation data!?!

Kirk

Hello,

I am also interested to use cusparse for Cuda Fortran.

Best,

Ivonne.


I have programmed a simple sparse CG method in CUDA Fortran for Ax=b that works nicely. I just use global memory so there is most likely room for improvement… But it works well.

This is an implementation for implicit particle methods. So it expects the values of A to be arranged in a neighbor list format. That is each row belongs to the ith particle and the jth neighbors. This is a static neighbor list data structure that works well on the GPU.

The algorithm is set up to run with either a vector of scalars, PT=1 (say temperature) or a vector of vectors, PT=2 (like deformation).

If you use it and make any improvements I would love to hear about them!

Module Module_Iterative_solver

!----------------------------------------------------------------------
!     SPHriction is a three dimensional SPH code for friction stir 
!     welding. This program is written and maintained by Kirk Fraser, 
!     university of quebec at chicoutimi (kirk.fraser1@uqac.ca).     
!----------------------------------------------------------------------
!
! File:   Iterative_solver.cuf
! Author: Kirk Fraser
!    
!----------------------------------------------------------------------
!
!   Module for solving Ax = b using iterative methods. 
!	Conjugate gradient for SPD matrices (CG)
!	Jacobi preconditioned conjugate gradient for SPD matrices (JPCG) - Not yet coded
!	Bi-conjugate gradient stable for NSPD matrices - Not yet coded
!----------------------------------------------------------------------

Use Module_Variables
Use cudafor
Use Module_Error_checking

Implicit None

Integer:: NeibTotal, Iter
Integer, Parameter:: Iter_max = 1000
Real(fp_kind), Parameter:: TolCG = 1.0E-06

! Host variables

! CG vars
Real(fp_kind):: e1CG_h, e2CG_h

! CG vars
Real(fp_kind), Device:: alphaCG_d, betaCG_d, e1CG_d, e2CG_d, PdotQ
Real(fp_kind), Device, Dimension(:), Allocatable:: Pcg_d, R_d, Q_d

Contains

Subroutine CG(A,x,b,Neib_d,PT,N)

	Implicit None

	Integer:: N, PT
	Integer, Device:: Neib_d(N,nNeib_max)
	Real(fp_kind), Device, Intent(IN):: A(N,3*nNeib_max), b(N) 
	Real(fp_kind), Device, Intent(INOUT):: x(N)
	Real(fp_kind):: x_h(N)
	Type(Dim3):: gridCG, gridSml
		
	gridCG = Dim3(ceiling(Real(N)/tBlock%x),1,1)
	gridSml = Dim3(ceiling(Real(N/3)/tBlock%x),1,1)

	! Start the CG method	
	Call Initialize_CG_Vars(N)
	Iter = 0
	If (PT == 1) Call MatVecMultSubVecHT<<<gridCG,tBlock>>>(A,b,x,Pcg_d,Neib_d,N)	 ! r = Ax - b for HT problem
	If (PT == 2) Call MatVecMultSubVecMech<<<gridSml,tBlock>>>(A,b,x,Pcg_d,Neib_d,N) ! r = Ax - b for Mech problem
	R_d = Pcg_d		
	e1CG_d = 0.0																	 ! Zero out e1
	Call VecDotVec(R_d,R_d,e1CG_d,N)												 ! e1 = rTr
	! Start iterations
	 Do
		If (PT == 1) Call MatVecMultHT<<<gridCG,tBlock>>>(A,Pcg_d,Q_d,Neib_d,N)		 ! q = Ap for HT problem	
		If (PT == 2) Call MatVecMultMech<<<gridSml,tBlock>>>(A,Pcg_d,Q_d,Neib_d,N)	 ! q = Ap for Mech problem
		PdotQ = 0.0
		Call VecDotVec(Pcg_d,Q_d,PdotQ,N)											 ! pTq
		Call ScalarDivScalar<<<1,1>>>(e1CG_d,alphaCG_d,PdotQ,N)						 ! alpha = e1/pTq
		Call DoubVecUpdate<<<gridCG,tBlock>>>(x,R_d,Pcg_d,Q_d,alphaCG_d,N)			 ! x = x - alpha*p and r = r - alpha*q
		e2CG_d = 0.0
		Call VecDotVec(R_d,R_d,e2CG_d,N)
		e2CG_h = e2CG_d																 ! e2 = rTr	
		If (e2CG_h < TolCG**2.0 .or. Iter >= Iter_max) Then 						 ! Exit iteration loop conditions
			Call CPU_TIME(time_cpu)
			Write(*,*) ""
			Write(*,*) "CG Solver Convergence Information"
			Write(*,'(" Iter = ", I4," Res = ", E12.4," Elapsed Time = ",E12.4)') Iter, sqrt(e2CG_h), time_cpu
			Write(*,*)'______________________________________________________'
			Exit
		End If
		Call ScalarDivScalar<<<1,1>>>(e2CG_d,betaCG_d,e1CG_d,N)						 ! beta = e2/e1
		Call VecUpdate<<<gridCG,tBlock>>>(R_d,Pcg_d,betaCG_d,N)						 ! p = r + beta*p				
		e1CG_d = e2CG_d
		Iter = Iter + 1
	End Do
	Call Deallocate_CG_Vars()

End Subroutine

Attributes(Global) Subroutine MatVecMultSubVecHT(M,b,x,P,Neib,nTot)

	Implicit none 

	! Local Vars
	Integer, Device:: i, j, k
	Real(fp_kind), Device:: Sum

	! PassedVars
	Integer, Value:: nTot
	Integer, Device, Intent(IN):: Neib(nTot,nNeib_max)
	Real(fp_kind), Device, Intent(IN):: b(nTot), x(nTot), M(nTot,nNeib_max)
	Real(fp_kind), Device, Intent(OUT):: P(nTot)

	i = (blockIdx%x-1)*blockDim%x + threadIdx%x

	If (i >= 1 .and. i <= nTot) Then	
		Sum = M(i,1)*x(i)
		Do k = 2, Neib(i,2)+1
			j = Neib(i,k+1)
			Sum = Sum + M(i,k)*x(j)
		End Do
		P(i) = b(i)	- Sum
	End If

End Subroutine

Attributes(Global) Subroutine MatVecMultSubVecMech(M,b,x,P,Neib,nTot)

	Implicit none 

	! Local Vars
	Integer, Device:: i, i_Ke, j, j_Ke, k, k_Ke
	Real(fp_kind), Device:: Sum1, Sum2, Sum3

	! PassedVars
	Integer, Value:: nTot
	Integer, Device, Intent(IN):: Neib(nTot/3,nNeib_max)
	Real(fp_kind), Device, Intent(IN):: b(nTot), x(nTot), M(nTot,3*nNeib_max)
	Real(fp_kind), Device, Intent(OUT):: P(nTot)

	i = (blockIdx%x-1)*blockDim%x + threadIdx%x
	i_Ke = 3*i-2 

	If (i >= 1 .and. i <= nTot/3) Then		
		Sum1 = M(i_Ke,1)*x(i_Ke) + M(i_Ke,2)*x(i_Ke+1) + M(i_Ke,3)*x(i_Ke+2)
		Sum2 = M(i_Ke+1,1)*x(i_Ke) + M(i_Ke+1,2)*x(i_Ke+1) + M(i_Ke+1,3)*x(i_Ke+2)
		Sum3 = M(i_Ke+2,1)*x(i_Ke) + M(i_Ke+2,2)*x(i_Ke+1) + M(i_Ke+2,3)*x(i_Ke+2)
		Do k = 3, Neib(i,2)+2
			k_Ke = 3*(k-2)+1 
			j = Neib(i,k)
			j_Ke = 3*j - 2
			Sum1 = Sum1 + M(i_Ke,k_Ke)*x(j_Ke) + M(i_Ke,k_Ke+1)*x(j_Ke+1) + M(i_Ke,k_Ke+2)*x(j_Ke+2)
			Sum2 = Sum2 + M(i_ke+1,k_Ke)*x(j_Ke) + M(i_ke+1,k_Ke+1)*x(j_Ke+1) + M(i_ke+1,k_Ke+2)*x(j_Ke+2)
			Sum3 = Sum3 + M(i_ke+2,k_Ke)*x(j_Ke) + M(i_ke+2,k_Ke+1)*x(j_Ke+1) + M(i_ke+2,k_Ke+2)*x(j_Ke+2)
		End Do
		P(i_Ke) = b(i_Ke) - Sum1
		P(i_Ke+1) = b(i_Ke+1) - Sum2
		P(i_Ke+2) = b(i_Ke+2) - Sum3
	End If

End Subroutine

Subroutine VecdotVec(V1,V2,eCG,nTot)

	Implicit None

	! Local Vars
	Integer:: i, j

	! Passed Vars
	Integer, Value:: nTot
	Real(fp_kind), Device, Intent(IN):: V1(nTot), V2(nTot)
	Real(fp_kind), Device, Intent(INOUT):: eCG

	!$cuf kernel do(1) <<<*,*>>>
	Do i = 1, nTot
		eCG = eCG + V1(i)*V2(i)
	End Do
		
End Subroutine

Attributes(Global) Subroutine MatVecMultHT(M,P,Q,Neib,nTot)

	Implicit none 

	! Local Vars
	Integer, Device:: i, j, k
	Real(fp_kind), Device:: Sum

	! PassedVars
	Integer, Value:: nTot
	Integer, Device, Intent(IN):: Neib(nTot,nNeib_max)
	Real(fp_kind), Device, Intent(IN):: M(nTot,nNeib_max), P(nTot)
	Real(fp_kind), Device, Intent(OUT):: Q(nTot)

	i = (blockIdx%x-1)*blockDim%x + threadIdx%x

	If (i >= 1 .and. i <= nTot) Then	
		Sum = M(i,1)*P(i)
		Do k = 2, Neib(i,2)+1
			j = Neib(i,k+1)
			Sum = Sum + M(i,k)*P(j)
		End Do
		Q(i) = Sum
	End If

End Subroutine

Attributes(Global) Subroutine MatVecMultMech(M,P,Q,Neib,nTot)

	Implicit none 

	! Local Vars
	Integer, Device:: i, i_Ke, j, j_Ke, k, k_Ke
	Real(fp_kind), Device:: Sum1, Sum2, Sum3

	! PassedVars
	Integer, Value:: nTot
	Integer, Device, Intent(IN):: Neib(nTot/3,nNeib_max)
	Real(fp_kind), Device, Intent(IN):: M(nTot,3*nNeib_max), P(nTot)
	Real(fp_kind), Device, Intent(OUT):: Q(nTot)

	i = (blockIdx%x-1)*blockDim%x + threadIdx%x
	i_Ke = 3*i-2 

	If (i >= 1 .and. i <= nTot/3) Then	
		Sum1 = M(i_Ke,1)*P(i_Ke) + M(i_Ke,2)*P(i_Ke+1) + M(i_Ke,3)*P(i_Ke+2)
		Sum2 = M(i_Ke+1,1)*P(i_Ke) + M(i_Ke+1,2)*P(i_Ke+1) + M(i_Ke+1,3)*P(i_Ke+2)
		Sum3 = M(i_Ke+2,1)*P(i_Ke) + M(i_Ke+2,2)*P(i_Ke+1) + M(i_Ke+2,3)*P(i_Ke+2)
		Do k = 3, Neib(i,2)+2
			k_Ke = 3*(k-2)+1 
			j = Neib(i,k)
			j_Ke = 3*j - 2
			Sum1 = Sum1 + M(i_Ke,k_Ke)*P(j_Ke) + M(i_Ke,k_Ke+1)*P(j_Ke+1) + M(i_Ke,k_Ke+2)*P(j_Ke+2)
			Sum2 = Sum2 + M(i_ke+1,k_Ke)*P(j_Ke) + M(i_ke+1,k_Ke+1)*P(j_Ke+1) + M(i_ke+1,k_Ke+2)*P(j_Ke+2)
			Sum3 = Sum3 + M(i_ke+2,k_Ke)*P(j_Ke) + M(i_ke+2,k_Ke+1)*P(j_Ke+1) + M(i_ke+2,k_Ke+2)*P(j_Ke+2)
		End Do
		Q(i_Ke) = Sum1
		Q(i_Ke+1) = Sum2
		Q(i_Ke+2) = Sum3
	End If

End Subroutine

Attributes(Global) Subroutine ScalarDivScalar(ScIn,ScOut,dotProd,nTot)

	Implicit none 

	! Local Vars
	Integer, Device:: i, j, k
	Real(fp_kind), Device:: Sum

	! PassedVars
	Integer, Value:: nTot
	Real(fp_kind), Device, Intent(IN):: dotProd, ScIn
	Real(fp_kind), Device, Intent(INOUT):: ScOut
	
		ScOut = ScIn/dotProd	

End Subroutine

Attributes(Global) Subroutine DoubVecUpdate(x,R,P,Q,alpha,nTot)

	Implicit none 

	! Local Vars
	Integer, Device:: i, j, k
	Real(fp_kind), Device:: Sum

	! PassedVars
	Integer, Value:: nTot
	Real(fp_kind), Device, Intent(IN):: P(nTot), Q(nTot), alpha
	Real(fp_kind), Device, Intent(INOUT):: x(nTot), R(nTot)

	i = (blockIdx%x-1)*blockDim%x + threadIdx%x

	If (i >= 1 .and. i <= nTot) Then	
		x(i) = x(i)	+ alpha*P(i)
		R(i) = R(i) - alpha*Q(i)
	End If

End Subroutine

Attributes(Global) Subroutine VecUpdate(R,P,beta,nTot)

	Implicit none 

	! Local Vars
	Integer, Device:: i, j, k
	Real(fp_kind), Device:: Sum

	! PassedVars
	Integer, Value:: nTot
	Real(fp_kind), Device, Intent(IN):: R(nTot), beta
	Real(fp_kind), Device, Intent(INOUT):: P(nTot)

	i = (blockIdx%x-1)*blockDim%x + threadIdx%x

	If (i >= 1 .and. i <= nTot) Then	
		P(i) = R(i)	+ Beta*P(i)
	End If

End Subroutine

Subroutine Initialize_CG_Vars(allocN)

	Implicit None

	Integer:: allocN
	Character(60):: ArrayName
	
	! Allocate Device variables for implicit
	ArrayName = "Device CG Vectors"
	Allocate(Pcg_d(allocN),R_d(allocN),Q_d(allocN),STAT = istat)
	Call AllocateErrorCheck(istat,ArrayName)
		
	Pcg_d			= 0.0
	R_d				= 0.0
	Q_d				= 0.0
	
End Subroutine

Subroutine Deallocate_CG_Vars()

	Implicit None
		
	Deallocate(Pcg_d,R_d,Q_d)
		
End Subroutine

End Module