Hi,

Has anyone come across an implementation of the conjugate gradient method in CUDA Fortran for solving Ax=b?

I know that NVIDIA has a CUDA C implementation of this using cuSparse and cuBlas. But I would like to have a native implementation if possible.

Any help is greatly appreciated.

Kirk

Hi Kirk,

I talked with Greg Ruetsch. He’s in the process of updating his “CUDA Fortran for Scientists and Engineers” book and is currently writing a chapter on using a CG. He’ll use calls to cuSparse and cuBlas, but via Fortran interfaces which should make them easier to work with.

Greg asked if you wanted to use a preconditioner or not?

• Mat

Hi Mat,

That is good news to hear that an update of the book is on the way!

Including a preconditioner would be ideal and would help me out a lot.

Thank you,

Kirk

Hi,

Did Greg mention what the timeline for the new edition is?

Is there a possibility that I could test out his PCG implementation before release. Could be good validation for the algorithm to test with real simulation data!?!

Kirk

Hello,

I am also interested to use cusparse for Cuda Fortran.

Best,

Ivonne.

I have programmed a simple sparse CG method in CUDA Fortran for Ax=b that works nicely. I just use global memory so there is most likely room for improvement… But it works well.

This is an implementation for implicit particle methods. So it expects the values of A to be arranged in a neighbor list format. That is each row belongs to the ith particle and the jth neighbors. This is a static neighbor list data structure that works well on the GPU.

The algorithm is set up to run with either a vector of scalars, PT=1 (say temperature) or a vector of vectors, PT=2 (like deformation).

If you use it and make any improvements I would love to hear about them!

``````Module Module_Iterative_solver

!----------------------------------------------------------------------
!     SPHriction is a three dimensional SPH code for friction stir
!     welding. This program is written and maintained by Kirk Fraser,
!     university of quebec at chicoutimi (kirk.fraser1@uqac.ca).
!----------------------------------------------------------------------
!
! File:   Iterative_solver.cuf
! Author: Kirk Fraser
!
!----------------------------------------------------------------------
!
!   Module for solving Ax = b using iterative methods.
!	Conjugate gradient for SPD matrices (CG)
!	Jacobi preconditioned conjugate gradient for SPD matrices (JPCG) - Not yet coded
!	Bi-conjugate gradient stable for NSPD matrices - Not yet coded
!----------------------------------------------------------------------

Use Module_Variables
Use cudafor
Use Module_Error_checking

Implicit None

Integer:: NeibTotal, Iter
Integer, Parameter:: Iter_max = 1000
Real(fp_kind), Parameter:: TolCG = 1.0E-06

! Host variables

! CG vars
Real(fp_kind):: e1CG_h, e2CG_h

! CG vars
Real(fp_kind), Device:: alphaCG_d, betaCG_d, e1CG_d, e2CG_d, PdotQ
Real(fp_kind), Device, Dimension(:), Allocatable:: Pcg_d, R_d, Q_d

Contains

Subroutine CG(A,x,b,Neib_d,PT,N)

Implicit None

Integer:: N, PT
Integer, Device:: Neib_d(N,nNeib_max)
Real(fp_kind), Device, Intent(IN):: A(N,3*nNeib_max), b(N)
Real(fp_kind), Device, Intent(INOUT):: x(N)
Real(fp_kind):: x_h(N)
Type(Dim3):: gridCG, gridSml

gridCG = Dim3(ceiling(Real(N)/tBlock%x),1,1)
gridSml = Dim3(ceiling(Real(N/3)/tBlock%x),1,1)

! Start the CG method
Call Initialize_CG_Vars(N)
Iter = 0
If (PT == 1) Call MatVecMultSubVecHT<<<gridCG,tBlock>>>(A,b,x,Pcg_d,Neib_d,N)	 ! r = Ax - b for HT problem
If (PT == 2) Call MatVecMultSubVecMech<<<gridSml,tBlock>>>(A,b,x,Pcg_d,Neib_d,N) ! r = Ax - b for Mech problem
R_d = Pcg_d
e1CG_d = 0.0																	 ! Zero out e1
Call VecDotVec(R_d,R_d,e1CG_d,N)												 ! e1 = rTr
! Start iterations
Do
If (PT == 1) Call MatVecMultHT<<<gridCG,tBlock>>>(A,Pcg_d,Q_d,Neib_d,N)		 ! q = Ap for HT problem
If (PT == 2) Call MatVecMultMech<<<gridSml,tBlock>>>(A,Pcg_d,Q_d,Neib_d,N)	 ! q = Ap for Mech problem
PdotQ = 0.0
Call VecDotVec(Pcg_d,Q_d,PdotQ,N)											 ! pTq
Call ScalarDivScalar<<<1,1>>>(e1CG_d,alphaCG_d,PdotQ,N)						 ! alpha = e1/pTq
Call DoubVecUpdate<<<gridCG,tBlock>>>(x,R_d,Pcg_d,Q_d,alphaCG_d,N)			 ! x = x - alpha*p and r = r - alpha*q
e2CG_d = 0.0
Call VecDotVec(R_d,R_d,e2CG_d,N)
e2CG_h = e2CG_d																 ! e2 = rTr
If (e2CG_h < TolCG**2.0 .or. Iter >= Iter_max) Then 						 ! Exit iteration loop conditions
Call CPU_TIME(time_cpu)
Write(*,*) ""
Write(*,*) "CG Solver Convergence Information"
Write(*,'(" Iter = ", I4," Res = ", E12.4," Elapsed Time = ",E12.4)') Iter, sqrt(e2CG_h), time_cpu
Write(*,*)'______________________________________________________'
Exit
End If
Call ScalarDivScalar<<<1,1>>>(e2CG_d,betaCG_d,e1CG_d,N)						 ! beta = e2/e1
Call VecUpdate<<<gridCG,tBlock>>>(R_d,Pcg_d,betaCG_d,N)						 ! p = r + beta*p
e1CG_d = e2CG_d
Iter = Iter + 1
End Do
Call Deallocate_CG_Vars()

End Subroutine

Attributes(Global) Subroutine MatVecMultSubVecHT(M,b,x,P,Neib,nTot)

Implicit none

! Local Vars
Integer, Device:: i, j, k
Real(fp_kind), Device:: Sum

! PassedVars
Integer, Value:: nTot
Integer, Device, Intent(IN):: Neib(nTot,nNeib_max)
Real(fp_kind), Device, Intent(IN):: b(nTot), x(nTot), M(nTot,nNeib_max)
Real(fp_kind), Device, Intent(OUT):: P(nTot)

If (i >= 1 .and. i <= nTot) Then
Sum = M(i,1)*x(i)
Do k = 2, Neib(i,2)+1
j = Neib(i,k+1)
Sum = Sum + M(i,k)*x(j)
End Do
P(i) = b(i)	- Sum
End If

End Subroutine

Attributes(Global) Subroutine MatVecMultSubVecMech(M,b,x,P,Neib,nTot)

Implicit none

! Local Vars
Integer, Device:: i, i_Ke, j, j_Ke, k, k_Ke
Real(fp_kind), Device:: Sum1, Sum2, Sum3

! PassedVars
Integer, Value:: nTot
Integer, Device, Intent(IN):: Neib(nTot/3,nNeib_max)
Real(fp_kind), Device, Intent(IN):: b(nTot), x(nTot), M(nTot,3*nNeib_max)
Real(fp_kind), Device, Intent(OUT):: P(nTot)

i_Ke = 3*i-2

If (i >= 1 .and. i <= nTot/3) Then
Sum1 = M(i_Ke,1)*x(i_Ke) + M(i_Ke,2)*x(i_Ke+1) + M(i_Ke,3)*x(i_Ke+2)
Sum2 = M(i_Ke+1,1)*x(i_Ke) + M(i_Ke+1,2)*x(i_Ke+1) + M(i_Ke+1,3)*x(i_Ke+2)
Sum3 = M(i_Ke+2,1)*x(i_Ke) + M(i_Ke+2,2)*x(i_Ke+1) + M(i_Ke+2,3)*x(i_Ke+2)
Do k = 3, Neib(i,2)+2
k_Ke = 3*(k-2)+1
j = Neib(i,k)
j_Ke = 3*j - 2
Sum1 = Sum1 + M(i_Ke,k_Ke)*x(j_Ke) + M(i_Ke,k_Ke+1)*x(j_Ke+1) + M(i_Ke,k_Ke+2)*x(j_Ke+2)
Sum2 = Sum2 + M(i_ke+1,k_Ke)*x(j_Ke) + M(i_ke+1,k_Ke+1)*x(j_Ke+1) + M(i_ke+1,k_Ke+2)*x(j_Ke+2)
Sum3 = Sum3 + M(i_ke+2,k_Ke)*x(j_Ke) + M(i_ke+2,k_Ke+1)*x(j_Ke+1) + M(i_ke+2,k_Ke+2)*x(j_Ke+2)
End Do
P(i_Ke) = b(i_Ke) - Sum1
P(i_Ke+1) = b(i_Ke+1) - Sum2
P(i_Ke+2) = b(i_Ke+2) - Sum3
End If

End Subroutine

Subroutine VecdotVec(V1,V2,eCG,nTot)

Implicit None

! Local Vars
Integer:: i, j

! Passed Vars
Integer, Value:: nTot
Real(fp_kind), Device, Intent(IN):: V1(nTot), V2(nTot)
Real(fp_kind), Device, Intent(INOUT):: eCG

!\$cuf kernel do(1) <<<*,*>>>
Do i = 1, nTot
eCG = eCG + V1(i)*V2(i)
End Do

End Subroutine

Attributes(Global) Subroutine MatVecMultHT(M,P,Q,Neib,nTot)

Implicit none

! Local Vars
Integer, Device:: i, j, k
Real(fp_kind), Device:: Sum

! PassedVars
Integer, Value:: nTot
Integer, Device, Intent(IN):: Neib(nTot,nNeib_max)
Real(fp_kind), Device, Intent(IN):: M(nTot,nNeib_max), P(nTot)
Real(fp_kind), Device, Intent(OUT):: Q(nTot)

If (i >= 1 .and. i <= nTot) Then
Sum = M(i,1)*P(i)
Do k = 2, Neib(i,2)+1
j = Neib(i,k+1)
Sum = Sum + M(i,k)*P(j)
End Do
Q(i) = Sum
End If

End Subroutine

Attributes(Global) Subroutine MatVecMultMech(M,P,Q,Neib,nTot)

Implicit none

! Local Vars
Integer, Device:: i, i_Ke, j, j_Ke, k, k_Ke
Real(fp_kind), Device:: Sum1, Sum2, Sum3

! PassedVars
Integer, Value:: nTot
Integer, Device, Intent(IN):: Neib(nTot/3,nNeib_max)
Real(fp_kind), Device, Intent(IN):: M(nTot,3*nNeib_max), P(nTot)
Real(fp_kind), Device, Intent(OUT):: Q(nTot)

i_Ke = 3*i-2

If (i >= 1 .and. i <= nTot/3) Then
Sum1 = M(i_Ke,1)*P(i_Ke) + M(i_Ke,2)*P(i_Ke+1) + M(i_Ke,3)*P(i_Ke+2)
Sum2 = M(i_Ke+1,1)*P(i_Ke) + M(i_Ke+1,2)*P(i_Ke+1) + M(i_Ke+1,3)*P(i_Ke+2)
Sum3 = M(i_Ke+2,1)*P(i_Ke) + M(i_Ke+2,2)*P(i_Ke+1) + M(i_Ke+2,3)*P(i_Ke+2)
Do k = 3, Neib(i,2)+2
k_Ke = 3*(k-2)+1
j = Neib(i,k)
j_Ke = 3*j - 2
Sum1 = Sum1 + M(i_Ke,k_Ke)*P(j_Ke) + M(i_Ke,k_Ke+1)*P(j_Ke+1) + M(i_Ke,k_Ke+2)*P(j_Ke+2)
Sum2 = Sum2 + M(i_ke+1,k_Ke)*P(j_Ke) + M(i_ke+1,k_Ke+1)*P(j_Ke+1) + M(i_ke+1,k_Ke+2)*P(j_Ke+2)
Sum3 = Sum3 + M(i_ke+2,k_Ke)*P(j_Ke) + M(i_ke+2,k_Ke+1)*P(j_Ke+1) + M(i_ke+2,k_Ke+2)*P(j_Ke+2)
End Do
Q(i_Ke) = Sum1
Q(i_Ke+1) = Sum2
Q(i_Ke+2) = Sum3
End If

End Subroutine

Attributes(Global) Subroutine ScalarDivScalar(ScIn,ScOut,dotProd,nTot)

Implicit none

! Local Vars
Integer, Device:: i, j, k
Real(fp_kind), Device:: Sum

! PassedVars
Integer, Value:: nTot
Real(fp_kind), Device, Intent(IN):: dotProd, ScIn
Real(fp_kind), Device, Intent(INOUT):: ScOut

ScOut = ScIn/dotProd

End Subroutine

Attributes(Global) Subroutine DoubVecUpdate(x,R,P,Q,alpha,nTot)

Implicit none

! Local Vars
Integer, Device:: i, j, k
Real(fp_kind), Device:: Sum

! PassedVars
Integer, Value:: nTot
Real(fp_kind), Device, Intent(IN):: P(nTot), Q(nTot), alpha
Real(fp_kind), Device, Intent(INOUT):: x(nTot), R(nTot)

If (i >= 1 .and. i <= nTot) Then
x(i) = x(i)	+ alpha*P(i)
R(i) = R(i) - alpha*Q(i)
End If

End Subroutine

Attributes(Global) Subroutine VecUpdate(R,P,beta,nTot)

Implicit none

! Local Vars
Integer, Device:: i, j, k
Real(fp_kind), Device:: Sum

! PassedVars
Integer, Value:: nTot
Real(fp_kind), Device, Intent(IN):: R(nTot), beta
Real(fp_kind), Device, Intent(INOUT):: P(nTot)

If (i >= 1 .and. i <= nTot) Then
P(i) = R(i)	+ Beta*P(i)
End If

End Subroutine

Subroutine Initialize_CG_Vars(allocN)

Implicit None

Integer:: allocN
Character(60):: ArrayName

! Allocate Device variables for implicit
ArrayName = "Device CG Vectors"
Allocate(Pcg_d(allocN),R_d(allocN),Q_d(allocN),STAT = istat)
Call AllocateErrorCheck(istat,ArrayName)

Pcg_d			= 0.0
R_d				= 0.0
Q_d				= 0.0

End Subroutine

Subroutine Deallocate_CG_Vars()

Implicit None

Deallocate(Pcg_d,R_d,Q_d)

End Subroutine

End Module
``````