Hello everyone,
I’m dealing with the porting of a Fortran90 parallel MPI CFD code into GPU using OpenACC. I’m actually compiling the code with nvhpc-openmpi4/24.1 (wrapper mpif90). I have a problem trying to accelerate a simple (but long) subroutine using kernels. The kernel is the following:
!$acc kernels
do k = MINk(B),MAXk(B)
do j = MINj(B),MAXj(B)
do i= MINi(B),MAXi(B)
inv_rC = 2.D0 / (r(j) + r(j-1))
T_ijk = T(i,j,k) ; T2_ijk = T_ijk * T_ijk ;
ER_ijk = ER(i,j,k); Fr_ijk = Fr(i,j,k); Fz_ijk = Fz(i,j,k) ; Ft_ijk = Ft(i,j,k) ;
Sp = SIGMA_P(i,j,k)
cER_old= V_light * ER_old(i,j,k)
if (Sp.NE.0.D0) then
call ABS_EFF(T_ijk,ER_ijk,Fr_ijk,Fz_ijk,Ft_ijk,Sp,0,Se,Sf,SEv,Sfv)
SEcdt = Se * cdt
SFcdt = Sf * cdt
DivFR = (FLUX_FR_r(i,j,k) - FLUX_FR_r(i,j-1,k)) * Delta_r(j) + &
(FLUX_FR_z(i,j,k) - FLUX_FR_z(i-1,j,k)) * Delta_z(i) + &
(FLUX_FR_t(i,j,k) - FLUX_FR_t(i,j,k-1)) * Delta_a(k)
DivPR_r = (FLUX_PR_rr(i,j,k) - FLUX_PR_rr(i,j-1,k)) * Delta_r(j) + &
(FLUX_PR_rz(i,j,k) - FLUX_PR_rz(i-1,j,k)) * Delta_z(i) + &
(FLUX_PR_rt(i,j,k) - FLUX_PR_rt(i,j,k-1)) * Delta_a(k)
DivPR_z = (FLUX_PR_zr(i,j,k) - FLUX_PR_zr(i,j-1,k)) * Delta_r(j) + &
(FLUX_PR_zz(i,j,k) - FLUX_PR_zz(i-1,j,k)) * Delta_z(i) + &
(FLUX_PR_zt(i,j,k) - FLUX_PR_zt(i,j,k-1)) * Delta_a(k)
DivPR_t =((FLUX_PR_tr(i,j,k) - FLUX_PR_tr(i,j-1,k)) * Delta_r(j) + &
(FLUX_PR_tz(i,j,k) - FLUX_PR_tz(i-1,j,k)) * Delta_z(i) + &
(FLUX_PR_tt(i,j,k) - FLUX_PR_tt(i,j,k-1)) * Delta_a(k)) * const_2D
RK_ER(i,j,k) = ER_old(i,j,k)
RK_Fr(i,j,k) = Fr_old(i,j,k)
RK_Fz(i,j,k) = Fz_old(i,j,k)
RK_Ft(i,j,k) = Ft_old(i,j,k)
sm(1)=ER_old(i,j,k)
sm(2)=Fr_old(i,j,k)
sm(3)=Fz_old(i,j,k)
sm(4)=Ft_old(i,j,k)
dt_a_c = dtRAD*A_RK(runge,runge)*V_light
Tsumille = T_ijk/1000.d0
RHS_F(runge,1,1,i,j,k)= -DivFR + (2.99792458D+4*Sp*7.5657845d0*(Tsumille**4.d0))
RHS_F(runge,1,2,i,j,k)= - V_light*V_light*DivPR_r
RHS_F(runge,1,3,i,j,k)= - V_light*V_light*DivPR_z
RHS_F(runge,1,4,i,j,k)= - V_light*V_light*DivPR_t
RHS_F(runge,2,1,i,j,k)= 0.d0
RHS_F(runge,2,2,i,j,k)= 0.d0
RHS_F(runge,2,3,i,j,k)= 0.d0
RHS_F(runge,2,4,i,j,k)= 0.d0
do s=1,runge-1
sm(1) = sm(1) + dtRAD*(ABar_RK(runge,s)*RHS_F(s,1,1,i,j,k) + A_RK(runge,s)*RHS_F(s,2,1,i,j,k))
sm(2) = sm(2) + dtRAD*(ABar_RK(runge,s)*RHS_F(s,1,2,i,j,k) + A_RK(runge,s)*RHS_F(s,2,2,i,j,k))
sm(3) = sm(3) + dtRAD*(ABar_RK(runge,s)*RHS_F(s,1,3,i,j,k) + A_RK(runge,s)*RHS_F(s,2,3,i,j,k))
sm(4) = sm(4) + dtRAD*(ABar_RK(runge,s)*RHS_F(s,1,4,i,j,k) + A_RK(runge,s)*RHS_F(s,2,4,i,j,k))
enddo
!NEWTON-RAPHSON ITERATION
res=1.d0; it=0; Mat=0.d0;
nvec(1) = RK_ER(i,j,k); nvec(2) = RK_Fr(i,j,k); nvec(3) = RK_Fz(i,j,k); nvec(4) = RK_Ft(i,j,k);
do while (res>0.0001d0.and.it<10000)
it=it+1
vec(1) = nvec(1); vec(2) = nvec(2); vec(3) = nvec(3); vec(4) = nvec(4);
call ABS_EFF(T_ijk,vec(1),vec(2),vec(3),vec(4),Sp,1,Se,Sf,SEv,Sfv)
Mat(1,1) = 1.d0 + dt_a_c*(SEv*vec(1) + Se)
Mat(2,1) = dt_a_c*(Sfv*vec(2) )
Mat(3,1) = dt_a_c*(Sfv*vec(3) )
Mat(4,1) = dt_a_c*(Sfv*vec(4) )
call ABS_EFF(T_ijk,vec(1),vec(2),vec(3),vec(4),Sp,2,Se,Sf,SEv,Sfv)
Mat(1,2) = + dt_a_c*(SEv*vec(1) )
Mat(2,2) = 1.d0 + dt_a_c*(Sfv*vec(2) + Sf)
Mat(3,2) = + dt_a_c*(Sfv*vec(3) )
Mat(4,2) = + dt_a_c*(Sfv*vec(4) )
call ABS_EFF(T_ijk,vec(1),vec(2),vec(3),vec(4),Sp,3,Se,Sf,SEv,Sfv)
Mat(1,3) = + dt_a_c*(SEv*vec(1) )
Mat(2,3) = + dt_a_c*(Sfv*vec(2) )
Mat(3,3) = 1.d0 + dt_a_c*(Sfv*vec(3) + Sf)
Mat(4,3) = + dt_a_c*(Sfv*vec(4) )
call ABS_EFF(T_ijk,vec(1),vec(2),vec(3),vec(4),Sp,4,Se,Sf,SEv,Sfv)
Mat(1,4) = + dt_a_c*(SEv*vec(1) )
Mat(2,4) = + dt_a_c*(Sfv*vec(2) )
Mat(3,4) = + dt_a_c*(Sfv*vec(3) )
Mat(4,4) = 1.d0 + dt_a_c*(Sfv*vec(4) + Sf)
f(1) = - (vec(1) - sm(1) + dt_a_c*Se)
f(2) = - (vec(2) - sm(2) + dt_a_c*Sf)
f(3) = - (vec(3) - sm(3) + dt_a_c*Sf)
f(4) = - (vec(4) - sm(4) + dt_a_c*Sf)
call gaussj(Mat,4,4,f,1,1)
app_ER = vec(1) + f(1)
if (app_ER>0.d0) then
nvec(1) = app_ER
endif
nvec(2) = vec(2) + f(2)
nvec(3) = vec(3) + f(3)
nvec(4) = vec(4) + f(4)*const_2D
res = dabs((nvec(1)-vec(1))/(vec(1)+1.d-10)) + &
dabs((nvec(2)-vec(2))/(vec(2)+1.d-10)) + &
dabs((nvec(3)-vec(3))/(vec(3)+1.d-10)) + &
dabs((nvec(4)-vec(4))/(vec(4)+1.d-10))
enddo
if(it>9999) then
print*, "it>10000"
print*, "res",res
print*, i,j,k
stop
endif
RK_ER(i,j,k) = nvec(1); RK_Fr(i,j,k) = nvec(2); RK_Fz(i,j,k) = nvec(3); RK_Ft(i,j,k) =nvec(4);
call ABS_EFF(T_ijk,nvec(1),nvec(2),nvec(3),nvec(4),Sp,0,Se,Sf,SEv,Sfv)
RHS_F(runge,2,1,i,j,k) = - Se*V_light*nvec(1)
RHS_F(runge,2,2,i,j,k) = - Sf*V_light*nvec(2)
RHS_F(runge,2,3,i,j,k) = - Sf*V_light*nvec(3)
RHS_F(runge,2,4,i,j,k) = - Sf*V_light*nvec(4)
if(runge==IMEX_s)then
RK_ER(i,j,k) = ER_old(i,j,k)
RK_Fr(i,j,k) = Fr_old(i,j,k)
RK_Fz(i,j,k) = Fz_old(i,j,k)
RK_Ft(i,j,k) = Ft_old(i,j,k)
do s=1,runge
RK_ER(i,j,k) = RK_ER(i,j,k) + dtRAD*(bBar_RK(s)*RHS_F(s,1,1,i,j,k) + b_RK(s)*RHS_F(s,2,1,i,j,k))
RK_Fr(i,j,k) = RK_Fr(i,j,k) + dtRAD*(bBar_RK(s)*RHS_F(s,1,2,i,j,k) + b_RK(s)*RHS_F(s,2,2,i,j,k))
RK_Fz(i,j,k) = RK_Fz(i,j,k) + dtRAD*(bBar_RK(s)*RHS_F(s,1,3,i,j,k) + b_RK(s)*RHS_F(s,2,3,i,j,k))
RK_Ft(i,j,k) = RK_Ft(i,j,k) + dtRAD*(bBar_RK(s)*RHS_F(s,1,4,i,j,k) + b_RK(s)*RHS_F(s,2,4,i,j,k))
enddo
endif
IF (RK_ER(i,j,k).LE.0.D0) THEN
count_Err1 = count_Err1 + 1
RK_ER(i,j,k) = ER_ijk
print*, "RK_ER<0",i,j,k,RK_ER(i,j,k)
END IF
Nf = DSQRT(RK_Fr(i,j,k)**2.D0 + RK_Fz(i,j,k)**2.D0 + RK_Ft(i,j,k)**2.D0)/(V_light*RK_ER(i,j,k))
IF (Nf.GT.1.D0) THEN
count_Err2 = count_Err2 + 1
inv_Nf = 1.D0 / Nf
RK_Fr(i,j,k) = RK_Fr(i,j,k) * inv_Nf
RK_Fz(i,j,k) = RK_Fz(i,j,k) * inv_Nf
RK_Ft(i,j,k) = RK_Ft(i,j,k) * inv_Nf
ENDIF
else
PRINT*, "SIGMA_P = 0", my_id
endif
end do
end do
end do
!$acc end kernels
In this kernel I call two other subroutine (ABS_EFF and gaussj) that are located in the same file and that have !$acc routine
information. When I compile the code I obtain the following information:
rad_eddington:
1177, Generating implicit copyin(fz_old(:,:,:),a_rk(runge,:)) [if not already present]
Generating implicit copy(nvec(:)) [if not already present]
Generating implicit copyin(sigma_p(:,:,:)) [if not already present]
Generating implicit copy(ft_ijk,fz_ijk,fr_ijk,er_ijk) [if not already present]
Generating implicit copyin(flux_pr_rz(:,:,:),flux_pr_tt(:,:,:),flux_pr_tr(:,:,:),flux_pr_tz(:,:,:),flux_pr_zt(:,:,:),flux_pr_zr(:,:,:),flux_fr_z(:,:,:),flux_pr_rt(:,:,:),flux_pr_rr(:,:,:)) [if not already present]
Generating implicit copy(f(:)) [if not already present]
Generating implicit copyin(delta_a(:),flux_fr_t(:,:,:),delta_r(:),flux_fr_r(:,:,:)) [if not already present]
Generating implicit copy(count_err2,sf,se,sev,rk_fz(:,:,:),sm(:)) [if not already present]
Generating implicit copyin(ft(:,:,:),fz(:,:,:),fr(:,:,:),er(:,:,:),t(:,:,:)) [if not already present]
Generating implicit copy(sp) [if not already present]
Generating implicit copyin(delta_z(:),flux_pr_zz(:,:,:),ft_old(:,:,:),fr_old(:,:,:)) [if not already present]
Generating implicit copy(sfv) [if not already present]
Generating implicit copyin(minj(b),er_old(:,:,:)) [if not already present]
Generating implicit copy(t_ijk,vec(:),rhs_f(:,1:2,1:4,:,:,:)) [if not already present]
Generating implicit copyin(abar_rk(runge,:),b_rk(:)) [if not already present]
Generating implicit copy(rk_er(:,:,:),rk_ft(:,:,:),rk_fr(:,:,:),mat(:,:)) [if not already present]
Generating implicit copyin(maxj(b),maxi(b),mini(b),bbar_rk(:)) [if not already present]
Generating implicit copy(count_err1) [if not already present]
1178, Complex loop carried dependence of maxj,minj,maxi,mini,t,er,fr,fz,ft,sigma_p,delta_a,flux_fr_t,delta_z,flux_fr_z,delta_r,flux_fr_r,flux_pr_rt,flux_pr_rz,flux_pr_rr,flux_pr_zt,flux_pr_zz,flux_pr_zr,flux_pr_tt,flux_pr_tz,flux_pr_tr,er_old prevents parallelization
Loop carried dependence due to exposed use of nvec(:) prevents parallelization
Complex loop carried dependence of fr_old prevents parallelization
Loop carried dependence due to exposed use of rk_er(:,:,:) prevents parallelization
Complex loop carried dependence of fz_old prevents parallelization
Loop carried dependence due to exposed use of rk_ft(:,:,:) prevents parallelization
Complex loop carried dependence of ft_old prevents parallelization
Loop carried dependence due to exposed use of rk_fr(:,:,:),rk_fz(:,:,:) prevents parallelization
Complex loop carried dependence of rhs_f prevents parallelization
Loop carried dependence due to exposed use of mat(:,:),f(:),vec(:),sm(:) prevents parallelization
Generating NVIDIA GPU code
1178, !$acc loop seq
Generating implicit reduction(+:count_err1,count_err2)
1179, !$acc loop seq
Generating implicit reduction(+:count_err2,count_err1)
1180, !$acc loop seq
Generating implicit reduction(+:count_err2,count_err1)
1228, !$acc loop seq
1236, !$acc loop vector(32) collapse(2) ! threadidx%x
! threadidx%x auto-collapsed
1323, !$acc loop seq
1179, Complex loop carried dependence of maxi,mini,t,er,fr,fz,ft,sigma_p,delta_a,flux_fr_t,delta_z,flux_fr_z,delta_r,flux_fr_r,flux_pr_rt,flux_pr_rz,flux_pr_rr,flux_pr_zt,flux_pr_zz,flux_pr_zr,flux_pr_tt,flux_pr_tz,flux_pr_tr,er_old prevents parallelization
Loop carried dependence due to exposed use of nvec(:) prevents parallelization
Complex loop carried dependence of fr_old prevents parallelization
Loop carried dependence due to exposed use of rk_er(:,:,:) prevents parallelization
Complex loop carried dependence of fz_old prevents parallelization
Loop carried dependence due to exposed use of rk_ft(:,:,:) prevents parallelization
Complex loop carried dependence of ft_old prevents parallelization
Loop carried dependence due to exposed use of rk_fr(:,:,:),rk_fz(:,:,:) prevents parallelization
Complex loop carried dependence of rhs_f prevents parallelization
Parallelization requires privatization of rhs_f as well as last value
Loop carried dependence due to exposed use of mat(:,:),f(:),vec(:),sm(:) prevents parallelization
Complex loop carried dependence of mat prevents parallelization
1180, Complex loop carried dependence of t,er,fr,fz,ft,sigma_p,delta_a,flux_fr_t,delta_z,flux_fr_z,delta_r,flux_fr_r,flux_pr_rt,flux_pr_rz,flux_pr_rr,flux_pr_zt,flux_pr_zz,flux_pr_zr,flux_pr_tt,flux_pr_tz,flux_pr_tr,er_old,rk_er prevents parallelization
Loop carried dependence due to exposed use of nvec(:) prevents parallelization
Complex loop carried dependence of fr_old,rk_fr prevents parallelization
Loop carried dependence due to exposed use of rk_er(:,:,:) prevents parallelization
Complex loop carried dependence of fz_old,rk_fz prevents parallelization
Loop carried dependence due to exposed use of rk_ft(:,:,:) prevents parallelization
Complex loop carried dependence of ft_old,rk_ft prevents parallelization
Loop carried dependence due to exposed use of rk_fr(:,:,:) prevents parallelization
Complex loop carried dependence of sm prevents parallelization
Loop carried dependence due to exposed use of rk_fz(:,:,:) prevents parallelization
Complex loop carried dependence of rhs_f prevents parallelization
Parallelization requires privatization of rhs_f as well as last value
Complex loop carried dependence of nvec prevents parallelization
Loop carried dependence due to exposed use of mat(:,:),f(:),vec(:),sm(:) prevents parallelization
Generating implicit private(fz_ijk,it,inv_nf,nf,tsumille,t_ijk,divpr_z,divpr_r,divpr_t,divfr,res,fr_ijk,ft_ijk,er_ijk,dt_a_c,sp)
1189, Reference argument passing prevents parallelization: sfv
Reference argument passing prevents parallelization: sev
Reference argument passing prevents parallelization: sf
Reference argument passing prevents parallelization: se
Reference argument passing prevents parallelization: sp
Reference argument passing prevents parallelization: ft_ijk
Reference argument passing prevents parallelization: fz_ijk
Reference argument passing prevents parallelization: fr_ijk
Reference argument passing prevents parallelization: er_ijk
Reference argument passing prevents parallelization: t_ijk
1228, Complex loop carried dependence of sm prevents parallelization
Parallelization requires privatization of sm as well as last value
1236, Loop is parallelizable
1238, Accelerator restriction: induction variable live-out from loop: it
Complex loop carried dependence of nvec,vec prevents parallelization
Loop carried dependence due to exposed use of vec(:),f(:) prevents parallelization
Complex loop carried dependence of f prevents parallelization
Parallelization requires privatization of nvec as well as last value
Scalar last value needed after loop for res at line 1300
Parallelization would require privatization of array mat(:,:)
Generating implicit private(res,it,app_er)
1239, Accelerator restriction: induction variable live-out from loop: it
1242, Reference argument passing prevents parallelization: sfv
Reference argument passing prevents parallelization: sev
Reference argument passing prevents parallelization: sf
Reference argument passing prevents parallelization: se
Reference argument passing prevents parallelization: sp
Reference argument passing prevents parallelization: t_ijk
1248, Reference argument passing prevents parallelization: sfv
Reference argument passing prevents parallelization: sev
Reference argument passing prevents parallelization: sf
Reference argument passing prevents parallelization: se
Reference argument passing prevents parallelization: sp
Reference argument passing prevents parallelization: t_ijk
1253, Reference argument passing prevents parallelization: sfv
Reference argument passing prevents parallelization: sev
Reference argument passing prevents parallelization: sf
Reference argument passing prevents parallelization: se
Reference argument passing prevents parallelization: sp
Reference argument passing prevents parallelization: t_ijk
1258, Reference argument passing prevents parallelization: sfv
Reference argument passing prevents parallelization: sev
Reference argument passing prevents parallelization: sf
Reference argument passing prevents parallelization: se
Reference argument passing prevents parallelization: sp
Reference argument passing prevents parallelization: t_ijk
1299, Reference argument passing prevents parallelization:
1300, Reference argument passing prevents parallelization:
1301, Reference argument passing prevents parallelization:
1312, Reference argument passing prevents parallelization: sfv
Reference argument passing prevents parallelization: sev
Reference argument passing prevents parallelization: sf
Reference argument passing prevents parallelization: se
Reference argument passing prevents parallelization: sp
Reference argument passing prevents parallelization: t_ijk
1323, Loop carried dependence due to exposed use of rk_er(*,*,*) prevents parallelization
Complex loop carried dependence of rk_er prevents parallelization
Loop carried dependence due to exposed use of rk_fr(*,*,*),rk_er(:,:,:) prevents parallelization
Complex loop carried dependence of rk_fz,rk_ft prevents parallelization
Loop carried dependence due to exposed use of rk_fz(:,:,:),rk_fz(*,*,*),rk_ft(:,:,:) prevents parallelization
Complex loop carried dependence of rk_fr prevents parallelization
Loop carried dependence due to exposed use of rk_ft(*,*,*),rk_fr(:,:,:) prevents parallelization
1336, Reference argument passing prevents parallelization:
1347, Reference argument passing prevents parallelization:
gaussj:
2504, Generating acc routine seq
Generating NVIDIA GPU code
abs_eff:
2583, Generating acc routine seq
Generating NVIDIA GPU code
When I run the code I obtain this error:
FORTRAN STOP: 0: Block (1,1,1), Thread (1,1,1)
stop08.cu:28: pgf90_stop08a: block: [0,0,0], thread: [0,0,0] Assertion `FORTRAN_STOP_STATEMENT` failed.
Failing in Thread:1
Accelerator Fatal Error: call to cuStreamSynchronize returned error 710: Other
File: /afs/enea.it/por/user/cimini/SRC/GPU/HeaRT_NEC_nunzio_ACC_RADE/Xcrescogpu_O2/../SOLVER/RAD/Radiation_Eddington_PEER4S.f90
Function: rad_eddington:868
Line: 1178
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:
Process name: [[37517,1],0]
Exit code: 1
Anyone had the same problem? Any idea?
Thanks all in advance!