Dear Mat,
Thanks for your replies. I have tried your suggestions to my Makefile, and now the compiler works but the compiling speed has not been improved. After adding the “-time“, I obtained some statistics of the compiling time for two SUBROUTINEs that contain the FUNCTION mentioned above.
And I am sorry to provide a complete code, because I cannot split it from a project. But I can share the part of KERNEL:
!$acc kernels async(11)
!$acc loop private(x_pos,y_pos,z_pos,idx_num,idx1_num,idx2_num,idx3_num,idx4_num,&
!$acc Forward_loop,Ratio_Rec_IP_1,Ratio_Rec_IP_2,Ratio_Rec_IP_3,&
!$acc Ratio_Rec_IP_4,Ratio_Rec_IP_5,TEMP_SIG,TEMP_IPM,TEMP_IPT,&
!$acc TEMP_IPC,DELX1,DELY1,DELZ1)
DO I=1,Model_Num
!! Calculate the first term of gradient
val_temp=0.0D0
!$acc loop seq
DO jj=1,4
idx_num = Field_Idx_Rec(I,jj)
idx1_num = Field_Idx1_Rec(I,jj)
idx2_num = Field_Idx2_Rec(I,jj)
idx3_num = Field_Idx3_Rec(I,jj)
idx4_num = Field_Idx4_Rec(I,jj)
x_pos = Field_Coord(idx_num)%Coordmesh_X
y_pos = Field_Coord(idx_num)%Coordmesh_Y
z_pos = Field_Coord(idx_num)%Coordmesh_Z
DELY1 = (CDELY(y_pos-1)+CDELY(y_pos))/2.0D0
DELZ1 = (CDELZ(z_pos-1)+CDELZ(z_pos))/2.0D0
TEMP_SIG = CCSIG(x_pos,y_pos-1,z_pos-1)*CDELY(y_pos-1)*CDELZ(z_pos-1)&
&+CCSIG(x_pos,y_pos-1,z_pos )*CDELY(y_pos-1)*CDELZ(z_pos )&
&+CCSIG(x_pos,y_pos ,z_pos-1)*CDELY(y_pos )*CDELZ(z_pos-1)&
&+CCSIG(x_pos,y_pos ,z_pos )*CDELY(y_pos )*CDELZ(z_pos )
TEMP_IPM = IPM(x_pos,y_pos-1,z_pos-1)*CDELY(y_pos-1)*CDELZ(z_pos-1)&
&+IPM(x_pos,y_pos-1,z_pos )*CDELY(y_pos-1)*CDELZ(z_pos )&
&+IPM(x_pos,y_pos ,z_pos-1)*CDELY(y_pos )*CDELZ(z_pos-1)&
&+IPM(x_pos,y_pos ,z_pos )*CDELY(y_pos )*CDELZ(z_pos )
TEMP_IPT = IPT(x_pos,y_pos-1,z_pos-1)*CDELY(y_pos-1)*CDELZ(z_pos-1)&
&+IPT(x_pos,y_pos-1,z_pos )*CDELY(y_pos-1)*CDELZ(z_pos )&
&+IPT(x_pos,y_pos ,z_pos-1)*CDELY(y_pos )*CDELZ(z_pos-1)&
&+IPT(x_pos,y_pos ,z_pos )*CDELY(y_pos )*CDELZ(z_pos )
TEMP_IPC = IPC(x_pos,y_pos-1,z_pos-1)*CDELY(y_pos-1)*CDELZ(z_pos-1)&
&+IPC(x_pos,y_pos-1,z_pos )*CDELY(y_pos-1)*CDELZ(z_pos )&
&+IPC(x_pos,y_pos ,z_pos-1)*CDELY(y_pos )*CDELZ(z_pos-1)&
&+IPC(x_pos,y_pos ,z_pos )*CDELY(y_pos )*CDELZ(z_pos )
TEMP_SIG = TEMP_SIG/(4.0D0*DELY1*DELZ1)
TEMP_IPM = TEMP_IPM/(4.0D0*DELY1*DELZ1)
TEMP_IPT = TEMP_IPT/(4.0D0*DELY1*DELZ1)
TEMP_IPC = TEMP_IPC/(4.0D0*DELY1*DELZ1)
Forward_loop = Loop-1
! Complex expressions for Ratio_Rec_IP_1 to Ratio_Rec_IP_5
! (involving GAMMA, digamma, log functions)
...
val_temp = val_temp + Ratio_Rec(I,jj)&
&*(Ratio_Rec_IP_1*Forward_fields(idx_num, Loop-Field_Rec_Idxstart+1)&
& -Ratio_Rec_IP_2*Forward_fields(idx1_num,Loop-Field_Rec_Idxstart+1-1)&
& -Ratio_Rec_IP_3*Forward_fields(idx2_num,Loop-Field_Rec_Idxstart+1-1)&
& +Ratio_Rec_IP_4*Forward_fields(idx3_num,Loop-Field_Rec_Idxstart+1-1)&
& +Ratio_Rec_IP_5*Forward_fields(idx4_num,Loop-Field_Rec_Idxstart+1-1))&
&*Field_False_Rec(idx_num)
ENDDO
!$acc loop seq
DO jj=5,8
! Similar structure with DELX1, DELZ1
...
ENDDO
ENDDO
!$acc end kernels
In my MODULE, there are six SUBROUTINEs. Each SUBROUTINE has such complex formula to call the FUNCTION.
The function “digamma“ is defined as
function digamma(c) result(psi)
implicit none
!$acc routine seq
real(8), intent(in) :: c
real(8) :: psi
real(8), parameter :: gamma_e = 0.5772156649015328606d0
integer :: i, N
real(8) :: x, sum
! if (c <= 0.d0 .or. c >= 1.d0) then
! write(*,*) "c must be in (0,1)"
! stop
! end if
x = c
N = 60
sum = 0.d0
do i = 1, N
sum = sum + x / ( dble(i) * (dble(i) + x) )
end do
psi = -gamma_e - 1.d0/x + sum
end function digamma