Hi,
I have been having trouble with a portion of code that I am trying to port to CUDA. The module is part of a smoothed particle hydrodynamics code for solid mechanics.
The subroutine is used to calculate an “artificial stress” that is used to stabilize particles that are in tension. The algorithm works very well on the CPU, but I just cant seem to get it to work properly on the GPU. On the CPU, the algorithm looks like this:
SUBROUTINE art_stress_simple(itimestep,ntotal,hsml,mass,niac,rho,pair_i,pair_j,w,dwdx,F_int,countiac,sig)
IMPLICIT NONE
INCLUDE 'param90.inc'
INTEGER:: ntotal, niac, pair_i(max_interaction),pair_j(max_interaction)
INTEGER:: i, j, k, d, n, m, countiac(maxn), itimestep
DOUBLE PRECISION:: hsml(maxn), mass(maxn), x(dim,maxn),vx(dim,maxn), &
rho(maxn), w(max_interaction), dvxdt(dim,maxn), &
dwdx(dim,max_interaction), w_deltap
DOUBLE PRECISION:: dx, epsilon_stress, r, dr, Ra(dim,dim,maxn), Rb(dim,dim,maxn), &
sig(dim,dim,maxn), rr, f_abi, F_abj
DOUBLE PRECISION:: F_int(dim,maxn), F1_int, F2_int, F3_int, deltap, n_stress
F_int = 0.0
deltap = 0.005
epsilon_stress = 0.3
n_stress = 6.0
CALL simple_kernel(deltap,hsml(1),w_deltap)
! Calculate SPH sum for artificial stress
DO k=1,niac
i = pair_i(k)
j = pair_j(k)
f_abi = w(k)/w_deltap
f_abi = f_abi**n_stress
f_abj = w(k)/w_deltap
f_abj = f_abj**n_stress
! For the ith particle in the pair
IF (sig(1,1,i) .gt. 0.0) THEN
Ra(1,1,i) = -epsilon_stress*sig(1,1,i)
ELSE
Ra(1,1,i) = 0.0
END IF
IF (sig(2,2,i) .gt. 0.0) THEN
Ra(2,2,i) = -epsilon_stress*sig(2,2,i)
ELSE
Ra(2,2,i) = 0.0
END IF
IF (sig(3,3,i) .gt. 0.0) THEN
Ra(3,3,i) = -epsilon_stress*sig(3,3,i)
ELSE
Ra(3,3,i) = 0.0
END IF
! For the jth particle in the pair
IF (sig(1,1,j) .gt. 0.0) THEN
Rb(1,1,j) = -epsilon_stress*sig(1,1,j)
ELSE
Rb(1,1,j) = 0.0
END IF
IF (sig(2,2,j) .gt. 0.0) THEN
Rb(2,2,j) = -epsilon_stress*sig(2,2,j)
ELSE
Rb(2,2,j) = 0.0
END IF
IF (sig(3,3,j) .gt. 0.0) THEN
Rb(3,3,j) = -epsilon_stress*sig(3,3,j)
ELSE
Rb(3,3,j) = 0.0
END IF
F1_int = (Ra(1,1,i)*f_abi/rho(i)**2.0 + Rb(1,1,j)*f_abj/rho(j)**2.0)*dwdx(1,k)
F2_int = (Ra(2,2,i)*f_abi/rho(i)**2.0 + Rb(2,2,j)*f_abj/rho(j)**2.0)*dwdx(2,k)
F3_int = (Ra(3,3,i)*f_abi/rho(i)**2.0 + Rb(3,3,j)*f_abj/rho(j)**2.0)*dwdx(3,k)
F_int(1,i) = F_int(1,i) + F1_int*mass(j)
F_int(1,j) = F_int(1,j) - F1_int*mass(i)
F_int(2,i) = F_int(2,i) + F2_int*mass(j)
F_int(2,j) = F_int(2,j) - F2_int*mass(i)
F_int(3,i) = F_int(3,i) + F3_int*mass(j)
F_int(3,j) = F_int(3,j) - F3_int*mass(i)
END DO
END SUBROUTINE art_stress_simple
SUBROUTINE SimpleKernel(r,hsml,w)
IMPLICIT NONE
INCLUDE 'param90.inc'
Real(fp_kind):: r, hsml, w, q, factor
q = r/hsml
w = 0.e0
factor = 3.e0/(2.e0*pi*hsml*hsml*hsml)
If (q.ge.0.and.q.le.1.e0) then
w = factor * (2./3. - q*q + q**3 / 2.)
Else If (q.gt.1.e0.and.q.le.2) then
w = factor * 1.e0/6.e0 * (2.-q)**3
Else
w = 0.0
End If
END SUBROUTINE SimpleKernel
The first thing that I figured needs to be different on the GPU is that Ra and Rb should be evaluated in one subroutine and then they should be used in an other routine. The GPU version looks like:
Module Module_Artificial_Stress
Use cudafor
Use Module_Precision
Use Module_Neighbors
Contains
Attributes(Global) Subroutine artificial_stress(sig,Ra1,Ra2,Ra3,nTotal)
Implicit None
Integer, Value:: nTotal
Integer, Device:: i
Real(fp_kind), Parameter:: AS_factor = 0.3
Real(fp_kind), Device:: sig(nTotal,Dim,Dim)
Real(fp_kind), Device:: Ra1(nTotal), Ra2(nTotal), Ra3(nTotal)
i = (blockIdx%x-1)*blockDim%x + threadIdx%x
If (i >= 1 .and. i <= nTotal) Then
If (sig(i,1,1) > 0.0) Then
Ra1(i) = -AS_factor*sig(i,1,1)
End If
If (sig(i,2,2) > 0.0) Then
Ra2(i) = -As_factor*sig(i,2,2)
End If
If (sig(i,3,3) > 0.0) Then
Ra3(i) = -AS_factor*sig(i,3,3)
End If
End If
End Subroutine artificial_stress
Attributes(Global) Subroutine artificial_stress_force(Ra1,Ra2,Ra3,w,dwdx,hsml,rho,mass,Neib,F_int,nTotal)
Implicit None
Include "param90.inc"
Integer, Value:: nTotal
Integer:: i, j, k, d
Integer, Device, Intent(IN):: Neib(nTotal,nNeib_max)
Real(fp_kind), Parameter:: n_stress = 6.0
Real(fp_kind), Parameter:: delta_p = 0.005
Real(fp_kind), Device:: fab, w_delta_p, R1, R2, R3
Real(fp_kind), Device:: w(nTotal,nNeib_max), dwdx(nTotal,nNeib_max,Dim), hsml(nTotal), rho(nTotal), mass(nTotal)
Real(fp_kind), Device:: Ra1(nTotal), Ra2(nTotal), Ra3(nTotal)
Real(fp_kind), Device:: F_int(nTotal,Dim)
i = (blockIdx%x-1)*blockDim%x + threadIdx%x
If (i >= 1 .and. i <= nTotal) Then
Call SimpleKernel(delta_p,hsml(i),w_delta_p)
Do k = 3, Neib(i,2) + 2
j = Neib(i,k)
fab = (w(i,k)/w_delta_p)**n_stress
R1 = Ra1(i)*fab/rho(i)**2.0 + Ra1(j)*fab/rho(j)**2.0
R2 = Ra2(i)*fab/rho(i)**2.0 + Ra2(j)*fab/rho(j)**2.0
R3 = Ra3(i)*fab/rho(i)**2.0 + Ra3(j)*fab/rho(j)**2.0
F_int(i,1) = F_int(i,1) + R1*mass(j)*dwdx(i,k,1)
F_int(i,2) = F_int(i,2) + R2*mass(j)*dwdx(i,k,2)
F_int(i,3) = F_int(i,3) + R3*mass(j)*dwdx(i,k,3)
End Do
End If
End Subroutine artificial_stress_force
Attributes(Device) SUBROUTINE SimpleKernel(r,hsml,w)
IMPLICIT NONE
INCLUDE 'param90.inc'
Real(fp_kind):: r, hsml, w, q, factor
q = r/hsml
w = 0.e0
factor = 3.e0/(2.e0*pi*hsml*hsml*hsml)
If (q.ge.0.and.q.le.1.e0) then
w = factor * (2./3. - q*q + q**3 / 2.)
Else If (q.gt.1.e0.and.q.le.2) then
w = factor * 1.e0/6.e0 * (2.-q)**3
Else
w = 0.0
End If
END SUBROUTINE SimpleKernel
End Module Module_Artificial_Stress
The problem is that on the GPU the results just do not line up with the expected results (the CPU algorithm gives the correct results). The results on the GPU are not consistent, the artificial stress is different each time the program is run (makes me think I am doing something wrong with reading and writing the same value across multiple threads).
Maybe I have done something fundamentally wrong in my GPU implementation? Does anything jump out as being wrong to anyone?
This is driving me a LITTLE crazy, I have been over the code a bunch of times, have re-written the artificial stress module from scratch a couple of times… very frustrated!
The rest of the code (if I do not calculate the artificial stress) works as it should on the GPU and the results fit perfectly with the CPU results.
Thank you for any help,
Kirk