I have programmed a simple sparse CG method in CUDA Fortran for Ax=b that works nicely. I just use global memory so there is most likely room for improvement… But it works well.
This is an implementation for implicit particle methods. So it expects the values of A to be arranged in a neighbor list format. That is each row belongs to the ith particle and the jth neighbors. This is a static neighbor list data structure that works well on the GPU.
The algorithm is set up to run with either a vector of scalars, PT=1 (say temperature) or a vector of vectors, PT=2 (like deformation).
If you use it and make any improvements I would love to hear about them!
Module Module_Iterative_solver
!----------------------------------------------------------------------
! SPHriction is a three dimensional SPH code for friction stir
! welding. This program is written and maintained by Kirk Fraser,
! university of quebec at chicoutimi (kirk.fraser1@uqac.ca).
!----------------------------------------------------------------------
!
! File: Iterative_solver.cuf
! Author: Kirk Fraser
!
!----------------------------------------------------------------------
!
! Module for solving Ax = b using iterative methods.
! Conjugate gradient for SPD matrices (CG)
! Jacobi preconditioned conjugate gradient for SPD matrices (JPCG) - Not yet coded
! Bi-conjugate gradient stable for NSPD matrices - Not yet coded
!----------------------------------------------------------------------
Use Module_Variables
Use cudafor
Use Module_Error_checking
Implicit None
Integer:: NeibTotal, Iter
Integer, Parameter:: Iter_max = 1000
Real(fp_kind), Parameter:: TolCG = 1.0E-06
! Host variables
! CG vars
Real(fp_kind):: e1CG_h, e2CG_h
! CG vars
Real(fp_kind), Device:: alphaCG_d, betaCG_d, e1CG_d, e2CG_d, PdotQ
Real(fp_kind), Device, Dimension(:), Allocatable:: Pcg_d, R_d, Q_d
Contains
Subroutine CG(A,x,b,Neib_d,PT,N)
Implicit None
Integer:: N, PT
Integer, Device:: Neib_d(N,nNeib_max)
Real(fp_kind), Device, Intent(IN):: A(N,3*nNeib_max), b(N)
Real(fp_kind), Device, Intent(INOUT):: x(N)
Real(fp_kind):: x_h(N)
Type(Dim3):: gridCG, gridSml
gridCG = Dim3(ceiling(Real(N)/tBlock%x),1,1)
gridSml = Dim3(ceiling(Real(N/3)/tBlock%x),1,1)
! Start the CG method
Call Initialize_CG_Vars(N)
Iter = 0
If (PT == 1) Call MatVecMultSubVecHT<<<gridCG,tBlock>>>(A,b,x,Pcg_d,Neib_d,N) ! r = Ax - b for HT problem
If (PT == 2) Call MatVecMultSubVecMech<<<gridSml,tBlock>>>(A,b,x,Pcg_d,Neib_d,N) ! r = Ax - b for Mech problem
R_d = Pcg_d
e1CG_d = 0.0 ! Zero out e1
Call VecDotVec(R_d,R_d,e1CG_d,N) ! e1 = rTr
! Start iterations
Do
If (PT == 1) Call MatVecMultHT<<<gridCG,tBlock>>>(A,Pcg_d,Q_d,Neib_d,N) ! q = Ap for HT problem
If (PT == 2) Call MatVecMultMech<<<gridSml,tBlock>>>(A,Pcg_d,Q_d,Neib_d,N) ! q = Ap for Mech problem
PdotQ = 0.0
Call VecDotVec(Pcg_d,Q_d,PdotQ,N) ! pTq
Call ScalarDivScalar<<<1,1>>>(e1CG_d,alphaCG_d,PdotQ,N) ! alpha = e1/pTq
Call DoubVecUpdate<<<gridCG,tBlock>>>(x,R_d,Pcg_d,Q_d,alphaCG_d,N) ! x = x - alpha*p and r = r - alpha*q
e2CG_d = 0.0
Call VecDotVec(R_d,R_d,e2CG_d,N)
e2CG_h = e2CG_d ! e2 = rTr
If (e2CG_h < TolCG**2.0 .or. Iter >= Iter_max) Then ! Exit iteration loop conditions
Call CPU_TIME(time_cpu)
Write(*,*) ""
Write(*,*) "CG Solver Convergence Information"
Write(*,'(" Iter = ", I4," Res = ", E12.4," Elapsed Time = ",E12.4)') Iter, sqrt(e2CG_h), time_cpu
Write(*,*)'______________________________________________________'
Exit
End If
Call ScalarDivScalar<<<1,1>>>(e2CG_d,betaCG_d,e1CG_d,N) ! beta = e2/e1
Call VecUpdate<<<gridCG,tBlock>>>(R_d,Pcg_d,betaCG_d,N) ! p = r + beta*p
e1CG_d = e2CG_d
Iter = Iter + 1
End Do
Call Deallocate_CG_Vars()
End Subroutine
Attributes(Global) Subroutine MatVecMultSubVecHT(M,b,x,P,Neib,nTot)
Implicit none
! Local Vars
Integer, Device:: i, j, k
Real(fp_kind), Device:: Sum
! PassedVars
Integer, Value:: nTot
Integer, Device, Intent(IN):: Neib(nTot,nNeib_max)
Real(fp_kind), Device, Intent(IN):: b(nTot), x(nTot), M(nTot,nNeib_max)
Real(fp_kind), Device, Intent(OUT):: P(nTot)
i = (blockIdx%x-1)*blockDim%x + threadIdx%x
If (i >= 1 .and. i <= nTot) Then
Sum = M(i,1)*x(i)
Do k = 2, Neib(i,2)+1
j = Neib(i,k+1)
Sum = Sum + M(i,k)*x(j)
End Do
P(i) = b(i) - Sum
End If
End Subroutine
Attributes(Global) Subroutine MatVecMultSubVecMech(M,b,x,P,Neib,nTot)
Implicit none
! Local Vars
Integer, Device:: i, i_Ke, j, j_Ke, k, k_Ke
Real(fp_kind), Device:: Sum1, Sum2, Sum3
! PassedVars
Integer, Value:: nTot
Integer, Device, Intent(IN):: Neib(nTot/3,nNeib_max)
Real(fp_kind), Device, Intent(IN):: b(nTot), x(nTot), M(nTot,3*nNeib_max)
Real(fp_kind), Device, Intent(OUT):: P(nTot)
i = (blockIdx%x-1)*blockDim%x + threadIdx%x
i_Ke = 3*i-2
If (i >= 1 .and. i <= nTot/3) Then
Sum1 = M(i_Ke,1)*x(i_Ke) + M(i_Ke,2)*x(i_Ke+1) + M(i_Ke,3)*x(i_Ke+2)
Sum2 = M(i_Ke+1,1)*x(i_Ke) + M(i_Ke+1,2)*x(i_Ke+1) + M(i_Ke+1,3)*x(i_Ke+2)
Sum3 = M(i_Ke+2,1)*x(i_Ke) + M(i_Ke+2,2)*x(i_Ke+1) + M(i_Ke+2,3)*x(i_Ke+2)
Do k = 3, Neib(i,2)+2
k_Ke = 3*(k-2)+1
j = Neib(i,k)
j_Ke = 3*j - 2
Sum1 = Sum1 + M(i_Ke,k_Ke)*x(j_Ke) + M(i_Ke,k_Ke+1)*x(j_Ke+1) + M(i_Ke,k_Ke+2)*x(j_Ke+2)
Sum2 = Sum2 + M(i_ke+1,k_Ke)*x(j_Ke) + M(i_ke+1,k_Ke+1)*x(j_Ke+1) + M(i_ke+1,k_Ke+2)*x(j_Ke+2)
Sum3 = Sum3 + M(i_ke+2,k_Ke)*x(j_Ke) + M(i_ke+2,k_Ke+1)*x(j_Ke+1) + M(i_ke+2,k_Ke+2)*x(j_Ke+2)
End Do
P(i_Ke) = b(i_Ke) - Sum1
P(i_Ke+1) = b(i_Ke+1) - Sum2
P(i_Ke+2) = b(i_Ke+2) - Sum3
End If
End Subroutine
Subroutine VecdotVec(V1,V2,eCG,nTot)
Implicit None
! Local Vars
Integer:: i, j
! Passed Vars
Integer, Value:: nTot
Real(fp_kind), Device, Intent(IN):: V1(nTot), V2(nTot)
Real(fp_kind), Device, Intent(INOUT):: eCG
!$cuf kernel do(1) <<<*,*>>>
Do i = 1, nTot
eCG = eCG + V1(i)*V2(i)
End Do
End Subroutine
Attributes(Global) Subroutine MatVecMultHT(M,P,Q,Neib,nTot)
Implicit none
! Local Vars
Integer, Device:: i, j, k
Real(fp_kind), Device:: Sum
! PassedVars
Integer, Value:: nTot
Integer, Device, Intent(IN):: Neib(nTot,nNeib_max)
Real(fp_kind), Device, Intent(IN):: M(nTot,nNeib_max), P(nTot)
Real(fp_kind), Device, Intent(OUT):: Q(nTot)
i = (blockIdx%x-1)*blockDim%x + threadIdx%x
If (i >= 1 .and. i <= nTot) Then
Sum = M(i,1)*P(i)
Do k = 2, Neib(i,2)+1
j = Neib(i,k+1)
Sum = Sum + M(i,k)*P(j)
End Do
Q(i) = Sum
End If
End Subroutine
Attributes(Global) Subroutine MatVecMultMech(M,P,Q,Neib,nTot)
Implicit none
! Local Vars
Integer, Device:: i, i_Ke, j, j_Ke, k, k_Ke
Real(fp_kind), Device:: Sum1, Sum2, Sum3
! PassedVars
Integer, Value:: nTot
Integer, Device, Intent(IN):: Neib(nTot/3,nNeib_max)
Real(fp_kind), Device, Intent(IN):: M(nTot,3*nNeib_max), P(nTot)
Real(fp_kind), Device, Intent(OUT):: Q(nTot)
i = (blockIdx%x-1)*blockDim%x + threadIdx%x
i_Ke = 3*i-2
If (i >= 1 .and. i <= nTot/3) Then
Sum1 = M(i_Ke,1)*P(i_Ke) + M(i_Ke,2)*P(i_Ke+1) + M(i_Ke,3)*P(i_Ke+2)
Sum2 = M(i_Ke+1,1)*P(i_Ke) + M(i_Ke+1,2)*P(i_Ke+1) + M(i_Ke+1,3)*P(i_Ke+2)
Sum3 = M(i_Ke+2,1)*P(i_Ke) + M(i_Ke+2,2)*P(i_Ke+1) + M(i_Ke+2,3)*P(i_Ke+2)
Do k = 3, Neib(i,2)+2
k_Ke = 3*(k-2)+1
j = Neib(i,k)
j_Ke = 3*j - 2
Sum1 = Sum1 + M(i_Ke,k_Ke)*P(j_Ke) + M(i_Ke,k_Ke+1)*P(j_Ke+1) + M(i_Ke,k_Ke+2)*P(j_Ke+2)
Sum2 = Sum2 + M(i_ke+1,k_Ke)*P(j_Ke) + M(i_ke+1,k_Ke+1)*P(j_Ke+1) + M(i_ke+1,k_Ke+2)*P(j_Ke+2)
Sum3 = Sum3 + M(i_ke+2,k_Ke)*P(j_Ke) + M(i_ke+2,k_Ke+1)*P(j_Ke+1) + M(i_ke+2,k_Ke+2)*P(j_Ke+2)
End Do
Q(i_Ke) = Sum1
Q(i_Ke+1) = Sum2
Q(i_Ke+2) = Sum3
End If
End Subroutine
Attributes(Global) Subroutine ScalarDivScalar(ScIn,ScOut,dotProd,nTot)
Implicit none
! Local Vars
Integer, Device:: i, j, k
Real(fp_kind), Device:: Sum
! PassedVars
Integer, Value:: nTot
Real(fp_kind), Device, Intent(IN):: dotProd, ScIn
Real(fp_kind), Device, Intent(INOUT):: ScOut
ScOut = ScIn/dotProd
End Subroutine
Attributes(Global) Subroutine DoubVecUpdate(x,R,P,Q,alpha,nTot)
Implicit none
! Local Vars
Integer, Device:: i, j, k
Real(fp_kind), Device:: Sum
! PassedVars
Integer, Value:: nTot
Real(fp_kind), Device, Intent(IN):: P(nTot), Q(nTot), alpha
Real(fp_kind), Device, Intent(INOUT):: x(nTot), R(nTot)
i = (blockIdx%x-1)*blockDim%x + threadIdx%x
If (i >= 1 .and. i <= nTot) Then
x(i) = x(i) + alpha*P(i)
R(i) = R(i) - alpha*Q(i)
End If
End Subroutine
Attributes(Global) Subroutine VecUpdate(R,P,beta,nTot)
Implicit none
! Local Vars
Integer, Device:: i, j, k
Real(fp_kind), Device:: Sum
! PassedVars
Integer, Value:: nTot
Real(fp_kind), Device, Intent(IN):: R(nTot), beta
Real(fp_kind), Device, Intent(INOUT):: P(nTot)
i = (blockIdx%x-1)*blockDim%x + threadIdx%x
If (i >= 1 .and. i <= nTot) Then
P(i) = R(i) + Beta*P(i)
End If
End Subroutine
Subroutine Initialize_CG_Vars(allocN)
Implicit None
Integer:: allocN
Character(60):: ArrayName
! Allocate Device variables for implicit
ArrayName = "Device CG Vectors"
Allocate(Pcg_d(allocN),R_d(allocN),Q_d(allocN),STAT = istat)
Call AllocateErrorCheck(istat,ArrayName)
Pcg_d = 0.0
R_d = 0.0
Q_d = 0.0
End Subroutine
Subroutine Deallocate_CG_Vars()
Implicit None
Deallocate(Pcg_d,R_d,Q_d)
End Subroutine
End Module