Thank you for your reply; it clarifies a lot of the behaviour I have seen around the matmul intrinsic and the issues it has caused.
I am comparing the performance of the loop in the actual code, which is a parallel loop nested inside a sequential loop. The implicit data transfers explain the performance penalty, since the rest of the data is already transferred once, before the outer loop.
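For context, here is a minimal sketch of that loop structure, with hypothetical names (a, b, nsteps) rather than the real arrays: the !$acc data region sits outside the sequential outer loop, so the explicit transfers happen once, whereas any implicit transfer triggered inside the inner parallel loop would be paid on every outer iteration.

program data_region_sketch
  implicit none
  integer, parameter :: n = 1000, nsteps = 100
  double precision :: a(n), b(n)
  integer :: step, i
  a = 1.d0
  b = 0.d0
  !$acc data copyin(a) copy(b)        ! transferred once, before the outer loop
  do step = 1, nsteps                 ! sequential outer loop on the host
    !$acc parallel loop present(a, b)
    do i = 1, n                       ! parallel inner loop on the device
      b(i) = b(i) + a(i)
    enddo
  enddo
  !$acc end data                      ! copied back once, after the outer loop
  print *, 'sum(b) =', sum(b)
end program data_region_sketch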
I added the code below just for reference; it does not require inspection. The matvec subroutine (specialised for multiplying a 4x4 matrix with a size-4 vector) replaces the matmul intrinsic here, which in my case greatly improved performance.
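The replacement itself amounts to the sketch below (illustrative only; matvec4 is just a renamed copy of the matvec at the bottom of the listing): the intrinsic y = matmul(A, x) is swapped for a hand-written 4x4 routine marked !$acc routine seq so it can be called inside the device loop.

program matvec_sketch
  implicit none
  integer, parameter :: ncols = 8
  double precision :: A(4,4), x(4,ncols), y(4,ncols)
  integer :: col
  call random_number(A)
  call random_number(x)
  !$acc parallel loop copyin(A, x) copyout(y)
  do col = 1, ncols
    ! previously: y(:,col) = matmul(A, x(:,col))
    call matvec4(A, x(:,col), y(:,col))
  enddo
  print *, 'y(:,1) =', y(:,1)
contains
  pure subroutine matvec4(A, x, y)
    !$acc routine seq
    double precision, intent(in)  :: A(4,4), x(4)
    double precision, intent(out) :: y(4)
    integer :: i, j
    y = 0.d0
    do i = 1, 4
      do j = 1, 4
        y(i) = y(i) + A(i,j) * x(j)
      enddo
    enddo
  end subroutine matvec4
end program matvec_sketch

Nothing clever is happening there; in my case the fixed-size, hand-written loop was simply much faster inside the parallel region than the matmul intrinsic.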
program sweep_test
!use omp_lib
!use magma
!use nvtx
implicit none
! The basis is
! 1, 2x, 2y, 2z. The unit cell is (-1/2,+1/2). So the values are 1 on the bounds
double precision, parameter :: Sigma_t=0.1d0
integer, parameter :: ni=200
integer, parameter :: nj=200
integer, parameter :: nk=200
integer, parameter :: N = 100 ! N is the number of directional sweep vectors
integer :: O(N) !O denotes the ordinate belonging to a given vector omega
integer :: dir ! extra loop iterator introduced for looping over the different omega vectors
integer, parameter :: no_elem = ni * nj * nk
integer, dimension(no_elem) :: sweep_order
integer, parameter :: no_planes=ni+nj+nk-2
integer, dimension(no_elem) :: plane_number
integer, dimension(no_planes) :: start_index_in_plane
integer, dimension(no_planes) :: end_index_in_plane
double precision, dimension(3,N) :: Omega !Omega is now a matrix with N Omega vectors as its columns
double precision :: phi_in
integer :: i,j,k,elem,plane,index,sweep_elem,nghb,plane_next,elem_next
double precision, dimension(4 * no_elem,N) :: phi !phi is now a matrix with N phi vectors as its columns
double precision :: time1,time2
integer :: dim,nghb_start_index,start_index
!integer, dimension(4) :: ipiv
double precision, dimension(4,4) :: M,M_xp,M_yp,M_zp
double precision, dimension(4,4) :: M_xm,M_ym,M_zm
double precision, dimension(4,4,3) :: K_mat
double precision, dimension(4,4) :: elem_mat
double precision, dimension(4) :: elem_rhs,temp_vec,matvecprod
double precision, dimension(4) :: Edge_int_xm,Edge_int_ym,Edge_int_zm
integer :: qp_x,qp_y,qp_z
double precision, dimension(2) :: points,weights
double precision :: answer,x,y,z
integer :: cr,cm,rate,stime0,stimepredat,stime1,stime2,stime3,stime15
!call nvtxstartrange("init",color=1)
CALL system_clock(count_rate=cr)
CALL system_clock(count_max=cm)
rate = REAL(cr)
call system_clock(stime0)
points(1) = +0.5d0 / sqrt(3.d0)
points(2) = -0.5d0 / sqrt(3.d0)
weights(1) = 0.5d0
weights(2) = 0.5d0
! Fill the mass matrix M = \int_V h_i h_j
M = 0.d0
M(1,1) = 1.d0
M(2,2) = 1.d0 / 3.d0
M(3,3) = 1.d0 / 3.d0
M(4,4) = 1.d0 / 3.d0
! Fill the vol matrix K = \int_V grad h_i h_j
K_mat = 0.d0
K_mat(2,1,1) = 2.d0
K_mat(3,1,2) = 2.d0
K_mat(4,1,3) = 2.d0
! Fill M_xp
M_xp = 0.d0
M_xp(1,1) = 1.d0
M_xp(1,2) = 1.d0
M_xp(2,1) = 1.d0
M_xp(2,2) = 1.d0
M_xp(3,3) = 1.d0 / 3.d0
M_xp(4,4) = 1.d0 / 3.d0
! Fill M_yp
M_yp = 0.d0
M_yp(1,1) = 1.d0
M_yp(1,3) = 1.d0
M_yp(2,2) = 1.d0 / 3.d0
M_yp(3,1) = 1.d0
M_yp(3,3) = 1.d0
M_yp(4,4) = 1.d0 / 3.d0
! Fill M_zp
M_zp = 0.d0
M_zp(1,1) = 1.d0
M_zp(1,4) = 1.d0
M_zp(2,2) = 1.d0 / 3.d0
M_zp(3,3) = 1.d0 / 3.d0
M_zp(4,1) = 1.d0
M_zp(4,4) = 1.d0
! Fill M_xm
M_xm = 0.d0
M_xm(1,1) = 1.d0
M_xm(1,2) = 1.d0
M_xm(2,1) = - 1.d0
M_xm(2,2) = - 1.d0
M_xm(3,3) = 1.d0 / 3.d0
M_xm(4,4) = 1.d0 / 3.d0
! Fill M_ym
M_ym = 0.d0
M_ym(1,1) = 1.d0
M_ym(1,3) = 1.d0
M_ym(2,2) = 1.d0 / 3.d0
M_ym(3,1) = - 1.d0
M_ym(3,3) = - 1.d0
M_ym(4,4) = 1.d0 / 3.d0
! Fill M_zm
M_zm = 0.d0
M_zm(1,1) = 1.d0
M_zm(1,4) = 1.d0
M_zm(2,2) = 1.d0 / 3.d0
M_zm(3,3) = 1.d0 / 3.d0
M_zm(4,1) = - 1.d0
M_zm(4,4) = - 1.d0
! Fill Edge_int_xm
Edge_int_xm = 0.d0
Edge_int_xm(1) = 1.d0
Edge_int_xm(2) = - 1.d0
! Fill Edge_int_ym
Edge_int_ym = 0.d0
Edge_int_ym(1) = 1.d0
Edge_int_ym(3) = - 1.d0
! Fill Edge_int_zm
Edge_int_zm = 0.d0
Edge_int_zm(1) = 1.d0
Edge_int_zm(4) = - 1.d0
!call nvtxendrange()
!call nvtxstartrange("plane number",color=2)
! Find in which sweep plane elem lies
do k=1,nk
do j=1,nj
do i=1,ni
elem = get_elem_index(i,j,k)
plane_number(elem) = i+j+k-2
enddo
enddo
enddo
!call nvtxendrange
!call nvtxstartrange("sweep order",color=1)
! Now do a very poor implementation of getting the sweep order
index=1
do plane=1,no_planes
do elem=1,no_elem
if (plane_number(elem) == plane) then
sweep_order(index) = elem
index = index + 1
endif
enddo
enddo
!call nvtxendrange()
!call nvtxstartrange("indices in plane",color=2)
! Get the start and end indices for each plane
start_index_in_plane(1) = 1
do sweep_elem=1,no_elem-1
elem = sweep_order(sweep_elem)
elem_next = sweep_order(sweep_elem+1)
plane = plane_number(elem)
plane_next = plane_number(elem_next)
if (plane_next /= plane) then
end_index_in_plane(plane) = sweep_elem
start_index_in_plane(plane+1) = sweep_elem+1
endif
enddo
end_index_in_plane(no_planes) = no_elem
!call nvtxendrange()
! Set the Omega field to a constant
Omega = 1.d0 !to ensure the code doesn't break when larger N is tested
Omega(:,1) = (/ 1.d0, 1.d0, 1.d0 /) !Omega definition was adjusted to allow definition of N different vectors distributed across the 8 octants
Omega(:,2) = (/ 1.d0, -1.d0, 5.d0 /)
do dir=1,N !normalisation step adjusted to make sure each Omega vector has length one
Omega(:,dir) = Omega(:,dir) / sqrt(dot_product(Omega(:,dir),Omega(:,dir)))
enddo
! Perform sweep
! We assume the volume and areas to be unity
phi = 0.d0
call system_clock(stimepredat)
!call nvtxstartrange("copyin",color=3)
!$acc data copyin(sweep_order,start_index,start_index_in_plane,end_index_in_plane,M,Omega,K_mat,M_xp,M_yp,M_zp,M_xm,M_ym,M_zm,Edge_int_xm,Edge_int_ym,Edge_int_zm) copyout(phi,O) create(matvecprod,nghb,nghb_start_index,elem,i,j,k,elem_mat,elem_rhs,temp_vec,phi_in,sweep_elem,dir)
!data copy directive expanded to add the new variables
!call nvtxendrange()
call system_clock(stime1)
! $acc data create(elem_mat,sweep_elem,plane,elem_rhs,nghb,nghb_start_index,temp_vec,i,j,k)
! $acc routine seq
! $acc kernels
!call nvtxstartrange("getO",color=4)
!while we're already on the GPU, determine the octant of each omega and map omega back to the original (all-positive) range so as not to break the code already present
!$acc parallel loop gang num_gangs(N) private(dir) present(Omega) default(none)
do dir=1,N
O(dir) = getO(Omega(:,dir)) !assigns octant number to omega associated with current direction
!the following three lines mirror negative coordinates for each direction, because the original code is not equipped to handle omega vectors with negative components
Omega(1,dir) = abs(Omega(1,dir))
Omega(2,dir) = abs(Omega(2,dir))
Omega(3,dir) = abs(Omega(3,dir))
enddo
call system_clock(stime15)
call cpu_time(time1)
!call nvtxendrange()
do plane=1,no_planes
! $acc kernels private(elem_mat,sweep_elem,plane,elem_rhs,nghb,nghb_start_index,temp_vec)
!call nvtxstartrange("plane jump",color=6)
!$acc parallel loop gang vector collapse(2) default(none) private(start_index,nghb_start_index,elem,i,j,k,elem_mat,elem_rhs,nghb,phi_in,temp_vec,matvecprod,sweep_elem,dir) present(M_zm,M_ym,M_xm,M_zp,M_yp,M_xp,K_mat,Omega,M,sweep_order,phi,Edge_int_xm,Edge_int_ym,Edge_int_zm,end_index_in_plane,start_index_in_plane)
do dir=1,N ! This is where the magic happens. Note that this loop is nested inside the planes loop: only gang-level parallelism allows for private variables, which is required to generate correct results, and gang parallelism does not work for nested loop directives, so the two parallel loops are grouped together and collapsed instead.
do sweep_elem=start_index_in_plane(plane),end_index_in_plane(plane)
! !call nvtxstartrange("elem init and get ijk",color=4)
elem = sweep_order(sweep_elem)
call get_ijk(elem,i,j,k)
! !call nvtxendrange()
! Init matrix and rhs
! allocate(elem_mat(4,4))
! !call nvtxstartrange("init arrays",color=5)
elem_mat = 0.d0
elem_rhs = 0.d0
! Removal
elem_mat = Sigma_t * M
! Volumetric streaming term
!in the following lines, the slicing of Omega and phi was adapted to account for the new structure of these variables
elem_mat(1:4,1:4) = elem_mat(1:4,1:4) - Omega(1,dir) * K_mat(1:4,1:4,1)- Omega(2,dir) * K_mat(1:4,1:4,2)- Omega(3,dir) * K_mat(1:4,1:4,3)
! out x+
elem_mat = elem_mat + Omega(1,dir) * M_xp
! out y+
elem_mat = elem_mat + Omega(2,dir) * M_yp
! out z+
elem_mat = elem_mat + Omega(3,dir) * M_zp
! in x-
if (i > 1) then
nghb = get_elem_index(i-1,j,k)
nghb_start_index = (nghb-1) * 4 + 1
temp_vec = phi(nghb_start_index:nghb_start_index+4-1,dir)
call matvec(M_xm,temp_vec,matvecprod)
elem_rhs = elem_rhs + Omega(1,dir) * matvecprod
else
phi_in = 1.d0
elem_rhs = elem_rhs + phi_in * Omega(1,dir) * Edge_int_xm
endif
! in y-
if (j > 1) then
nghb = get_elem_index(i,j-1,k)
nghb_start_index = (nghb-1) * 4 + 1
temp_vec = phi(nghb_start_index:nghb_start_index+4-1,dir)
call matvec(M_ym,temp_vec,matvecprod)
elem_rhs = elem_rhs + Omega(2,dir) * matvecprod
else
phi_in = 1.d0
elem_rhs = elem_rhs + phi_in * Omega(2,dir) * Edge_int_ym
endif
! in z-
if (k > 1) then
nghb = get_elem_index(i,j,k-1)
nghb_start_index = (nghb-1) * 4 + 1
temp_vec = phi(nghb_start_index:nghb_start_index+4-1,dir)
call matvec(M_zm,temp_vec,matvecprod)
elem_rhs = elem_rhs + Omega(3,dir) * matvecprod
else
phi_in = 1.d0
elem_rhs = elem_rhs + phi_in * Omega(3,dir) * Edge_int_zm
endif
! !call nvtxendrange()
!!call nvtxstartrange("local solve",color=6)
! Local solve
! $acc host_data use_device(elem_mat,ipiv,elem_rhs)
! elem_rhs = RSOLVE( elem_mat, elem_rhs)
call dgesv_gpu(4,elem_mat,elem_rhs) ! magma_dgesv_gpu(4,1,elem_mat,4,ipiv,elem_rhs,4,info)
! if (info /= 0) STOP 'problem in dgesv'
!!call nvtxendrange()
!!call nvtxstartrange("assign phi",color=5)
start_index = (elem-1) * 4 + 1
phi(start_index:start_index+4-1,dir) = elem_rhs
! $acc end host_data
! !call nvtxendrange()
! deallocate(elem_mat)
enddo
enddo
!call nvtxendrange
enddo
! $acc end kernels
call system_clock(stime2)
!call nvtxstartrange("data out",color=5)
!$acc end data
!call nvtxendrange()
call cpu_time(time2)
call system_clock(stime3)
print *,'Sweep time',time2-time1
print *,'Sum of phinorms',sumnormphi(phi,N) ! the phi-norm calculation was adapted to give a single value combining all computed directions
print *,'octant of each vector omega',O(:)
!print *,'Norm phi2',sqrt(dot_product(phi(:,2),phi(:,2)))
print *,'----------------------------------' ! the system-time breakdown was adapted to account for the ordinate calculation time
print *,'system time:'
print *,'system clock rate',rate
print *,'prep time',real(stimepredat-stime0)/rate
print *,'copyin time',real(stime1-stimepredat)/rate
print *,'ordinate calculation time',real(stime15-stime1)/rate
print *,'sweep time',real(stime2-stime15)/rate
print *,'copyout time',real(stime3-stime2)/rate
contains
pure subroutine matvec(A,x,y)
!$acc routine seq
implicit none
double precision, intent(in) :: A(4,4),x(4)
double precision, intent(inout) :: y(4)
integer :: i,j
y=0.d0
do i=1,4
do j=1,4
y(i)=y(i)+A(i,j)*x(j)
end do
end do
end subroutine matvec
double precision function sumnormphi(phi,N) ! added to check correctness between CPU and GPU; replaces the norm of phi in the original output
!$acc routine seq
implicit none
double precision :: phi(:,:)
integer :: direc,N
sumnormphi = 0.d0
do direc=1,N
sumnormphi = sumnormphi+sqrt(dot_product(phi(:,direc),phi(:,direc)))
enddo
end function sumnormphi
pure integer function getO(Omega) ! added to calculate the octant of each omega vector, which is needed to keep track of the original sweep direction in a heterogeneous medium and of the orientation of the phi contribution corresponding to this omega
!$acc routine seq
implicit none
double precision,dimension(3),intent(in) :: Omega
! map the sign pattern of (z,y,x) to an octant number in 0..7
getO = int(2*(1-sign(1.d0,Omega(3)))+(1-sign(1.d0,Omega(2)))+0.5*(1-sign(1.d0,Omega(1))))
end function getO
integer function get_elem_index(i,j,k)
!$acc routine seq
implicit none
integer :: i,j,k
get_elem_index = (k-1) * (ni * nj) + (j-1) * (ni) + i
end function get_elem_index
subroutine get_ijk(elem,i,j,k)
!$acc routine seq
implicit none
integer :: elem,i,j,k
integer :: remainder
! invert get_elem_index: elem = (k-1)*ni*nj + (j-1)*ni + i, recovered via ceiling division
k = (elem - 1) / (ni * nj) + 1
remainder = elem - (k - 1) * (ni * nj)
j = (remainder - 1) / ni + 1
i = remainder - (j - 1) * ni
end subroutine get_ijk
double precision function fi(i,x,y,z)
!$acc routine seq
implicit none
integer :: i
double precision :: x,y,z
if (i==1) then
fi = 1.d0
endif
if (i==2) then
fi = 2.d0 * x
endif
if (i==3) then
fi = 2.d0 * y
endif
if (i==4) then
fi = 2.d0 * z
endif
end function fi
double precision function grad_fi(i,dim,x,y,z)
!$acc routine seq
implicit none
integer :: i,dim
double precision :: x,y,z
grad_fi = 0.d0
if (i==2) then
if (dim==1) grad_fi = 2.d0
endif
if (i==3) then
if (dim==2) grad_fi = 2.d0
endif
if (i==4) then
if (dim==3) grad_fi = 2.d0
endif
end function grad_fi
pure subroutine dgesv_gpu(n,A,b) ! in-place Gaussian elimination with partial pivoting; stands in for the commented-out magma_dgesv_gpu call so the solve can run as a seq device routine
!$acc routine seq
implicit none
integer, intent(in) :: n
double precision, intent(inout) :: A(4,4),b(4)
integer :: i,row,col,ind
double precision :: p,t
double precision :: maximum !added to replace maxloc
do row=1,n-1
! Partial pivoting (these lines were added because maxloc is not allowed)
maximum = abs(A(row,row))
ind=row
do i=row+1,n
if (abs(A(i,row)) > maximum) then
maximum = abs(A(i,row))
ind = i
endif
enddo
! Swap rows of A and b
if (ind /= row) then
do col=row,n
t = A(ind,col)
A(ind,col) = A(row,col)
A(row,col) = t
enddo
t = b(ind)
b(ind) = b(row)
b(row) = t
endif
! Eliminate by row subtraction
do i=row+1,n
p = A(i,row) / A(row,row)
A(i,row+1:n) = A(i,row+1:n) - p * A(row,row+1:n)
b(i) = b(i) - p * b(row)
enddo
enddo
! Back substitution
do row=n,1,-1
p = b(row)
do col=row+1,n
p = p - A(row,col) * b(col)
end do
b(row) = p / A(row,row)
enddo
end subroutine dgesv_gpu
end program sweep_test