Accelerator Fatal Error: call to cuStreamSynchronize returned error 710: Other

Hello everyone,

I’m porting a Fortran 90 parallel MPI CFD code to GPU using OpenACC. I’m currently compiling the code with nvhpc-openmpi4/24.1 (wrapper mpif90). I have a problem trying to accelerate a simple (but long) subroutine using kernels. The kernel is the following:

  !$acc kernels
  do k = MINk(B),MAXk(B)
    do j = MINj(B),MAXj(B)
       do i= MINi(B),MAXi(B)

       inv_rC   = 2.D0 / (r(j) + r(j-1))
       T_ijk  = T(i,j,k) ; T2_ijk = T_ijk * T_ijk ;
       ER_ijk = ER(i,j,k); Fr_ijk = Fr(i,j,k); Fz_ijk = Fz(i,j,k) ; Ft_ijk = Ft(i,j,k) ;
       Sp  = SIGMA_P(i,j,k)
       cER_old= V_light * ER_old(i,j,k)

     if (Sp.NE.0.D0) then
       call ABS_EFF(T_ijk,ER_ijk,Fr_ijk,Fz_ijk,Ft_ijk,Sp,0,Se,Sf,SEv,Sfv)

       SEcdt  = Se * cdt
       SFcdt  = Sf * cdt  

       DivFR  = (FLUX_FR_r(i,j,k) - FLUX_FR_r(i,j-1,k)) * Delta_r(j) + &
                (FLUX_FR_z(i,j,k) - FLUX_FR_z(i-1,j,k)) * Delta_z(i) + &
                (FLUX_FR_t(i,j,k) - FLUX_FR_t(i,j,k-1)) * Delta_a(k)
       DivPR_r = (FLUX_PR_rr(i,j,k) - FLUX_PR_rr(i,j-1,k)) * Delta_r(j) + &
                 (FLUX_PR_rz(i,j,k) - FLUX_PR_rz(i-1,j,k)) * Delta_z(i) + &
                 (FLUX_PR_rt(i,j,k) - FLUX_PR_rt(i,j,k-1)) * Delta_a(k)
       DivPR_z = (FLUX_PR_zr(i,j,k) - FLUX_PR_zr(i,j-1,k)) * Delta_r(j) + &
                 (FLUX_PR_zz(i,j,k) - FLUX_PR_zz(i-1,j,k)) * Delta_z(i) + &
                 (FLUX_PR_zt(i,j,k) - FLUX_PR_zt(i,j,k-1)) * Delta_a(k)
       DivPR_t =((FLUX_PR_tr(i,j,k) - FLUX_PR_tr(i,j-1,k)) * Delta_r(j) + &
                 (FLUX_PR_tz(i,j,k) - FLUX_PR_tz(i-1,j,k)) * Delta_z(i) + &
                 (FLUX_PR_tt(i,j,k) - FLUX_PR_tt(i,j,k-1)) * Delta_a(k)) * const_2D

       RK_ER(i,j,k) = ER_old(i,j,k)
       RK_Fr(i,j,k) = Fr_old(i,j,k)
       RK_Fz(i,j,k) = Fz_old(i,j,k)
       RK_Ft(i,j,k) = Ft_old(i,j,k)
       sm(1)=ER_old(i,j,k)
       sm(2)=Fr_old(i,j,k)
       sm(3)=Fz_old(i,j,k)
       sm(4)=Ft_old(i,j,k)

       dt_a_c = dtRAD*A_RK(runge,runge)*V_light

       Tsumille = T_ijk/1000.d0
       RHS_F(runge,1,1,i,j,k)= -DivFR + (2.99792458D+4*Sp*7.5657845d0*(Tsumille**4.d0))
       RHS_F(runge,1,2,i,j,k)= - V_light*V_light*DivPR_r
       RHS_F(runge,1,3,i,j,k)= - V_light*V_light*DivPR_z
       RHS_F(runge,1,4,i,j,k)= - V_light*V_light*DivPR_t
       RHS_F(runge,2,1,i,j,k)= 0.d0
       RHS_F(runge,2,2,i,j,k)= 0.d0
       RHS_F(runge,2,3,i,j,k)= 0.d0
       RHS_F(runge,2,4,i,j,k)= 0.d0

       do s=1,runge-1
        sm(1) = sm(1) + dtRAD*(ABar_RK(runge,s)*RHS_F(s,1,1,i,j,k) + A_RK(runge,s)*RHS_F(s,2,1,i,j,k))
        sm(2) = sm(2) + dtRAD*(ABar_RK(runge,s)*RHS_F(s,1,2,i,j,k) + A_RK(runge,s)*RHS_F(s,2,2,i,j,k))
        sm(3) = sm(3) + dtRAD*(ABar_RK(runge,s)*RHS_F(s,1,3,i,j,k) + A_RK(runge,s)*RHS_F(s,2,3,i,j,k))
        sm(4) = sm(4) + dtRAD*(ABar_RK(runge,s)*RHS_F(s,1,4,i,j,k) + A_RK(runge,s)*RHS_F(s,2,4,i,j,k))
       enddo

       !NEWTON-RAPHSON ITERATION
       res=1.d0; it=0; Mat=0.d0;
       nvec(1) = RK_ER(i,j,k); nvec(2) = RK_Fr(i,j,k); nvec(3) = RK_Fz(i,j,k); nvec(4) = RK_Ft(i,j,k);
       do while (res>0.0001d0.and.it<10000)
        it=it+1

        vec(1) = nvec(1); vec(2) = nvec(2); vec(3) = nvec(3); vec(4) = nvec(4);
        call ABS_EFF(T_ijk,vec(1),vec(2),vec(3),vec(4),Sp,1,Se,Sf,SEv,Sfv)
        Mat(1,1) = 1.d0 + dt_a_c*(SEv*vec(1) + Se)
        Mat(2,1) =        dt_a_c*(Sfv*vec(2)        )
        Mat(3,1) =        dt_a_c*(Sfv*vec(3)        )
        Mat(4,1) =        dt_a_c*(Sfv*vec(4)        )

        call ABS_EFF(T_ijk,vec(1),vec(2),vec(3),vec(4),Sp,2,Se,Sf,SEv,Sfv)
        Mat(1,2) =      + dt_a_c*(SEv*vec(1)        )
        Mat(2,2) = 1.d0 + dt_a_c*(Sfv*vec(2) + Sf)
        Mat(3,2) =      + dt_a_c*(Sfv*vec(3)        )
        Mat(4,2) =      + dt_a_c*(Sfv*vec(4)        )
        call ABS_EFF(T_ijk,vec(1),vec(2),vec(3),vec(4),Sp,3,Se,Sf,SEv,Sfv)
        Mat(1,3) =      + dt_a_c*(SEv*vec(1)        )
        Mat(2,3) =      + dt_a_c*(Sfv*vec(2)        )
        Mat(3,3) = 1.d0 + dt_a_c*(Sfv*vec(3) + Sf)
        Mat(4,3) =      + dt_a_c*(Sfv*vec(4)        )
        call ABS_EFF(T_ijk,vec(1),vec(2),vec(3),vec(4),Sp,4,Se,Sf,SEv,Sfv)
        Mat(1,4) =      + dt_a_c*(SEv*vec(1)        )
        Mat(2,4) =      + dt_a_c*(Sfv*vec(2)        )
        Mat(3,4) =      + dt_a_c*(Sfv*vec(3)        )
        Mat(4,4) = 1.d0 + dt_a_c*(Sfv*vec(4) + Sf)
        f(1) = - (vec(1) - sm(1) + dt_a_c*Se)
        f(2) = - (vec(2) - sm(2) + dt_a_c*Sf)
        f(3) = - (vec(3) - sm(3) + dt_a_c*Sf)
        f(4) = - (vec(4) - sm(4) + dt_a_c*Sf)

        call gaussj(Mat,4,4,f,1,1)

        app_ER = vec(1) + f(1)
        if (app_ER>0.d0) then
         nvec(1) = app_ER
        endif

        nvec(2) = vec(2) + f(2)
        nvec(3) = vec(3) + f(3)
        nvec(4) = vec(4) + f(4)*const_2D

        res = dabs((nvec(1)-vec(1))/(vec(1)+1.d-10)) + &
              dabs((nvec(2)-vec(2))/(vec(2)+1.d-10)) + &
              dabs((nvec(3)-vec(3))/(vec(3)+1.d-10)) + &
              dabs((nvec(4)-vec(4))/(vec(4)+1.d-10))

       enddo

       if(it>9999) then
        print*, "it>10000"
        print*, "res",res
        print*, i,j,k
        stop
       endif

       RK_ER(i,j,k) = nvec(1); RK_Fr(i,j,k) = nvec(2); RK_Fz(i,j,k) = nvec(3); RK_Ft(i,j,k) =nvec(4);

       call ABS_EFF(T_ijk,nvec(1),nvec(2),nvec(3),nvec(4),Sp,0,Se,Sf,SEv,Sfv)
       RHS_F(runge,2,1,i,j,k) = - Se*V_light*nvec(1)
       RHS_F(runge,2,2,i,j,k) = - Sf*V_light*nvec(2)
       RHS_F(runge,2,3,i,j,k) = - Sf*V_light*nvec(3)
       RHS_F(runge,2,4,i,j,k) = - Sf*V_light*nvec(4)

       if(runge==IMEX_s)then
        RK_ER(i,j,k) = ER_old(i,j,k)
        RK_Fr(i,j,k) = Fr_old(i,j,k)
        RK_Fz(i,j,k) = Fz_old(i,j,k)
        RK_Ft(i,j,k) = Ft_old(i,j,k)
        do s=1,runge
         RK_ER(i,j,k) = RK_ER(i,j,k) + dtRAD*(bBar_RK(s)*RHS_F(s,1,1,i,j,k) + b_RK(s)*RHS_F(s,2,1,i,j,k))
         RK_Fr(i,j,k) = RK_Fr(i,j,k) + dtRAD*(bBar_RK(s)*RHS_F(s,1,2,i,j,k) + b_RK(s)*RHS_F(s,2,2,i,j,k))
         RK_Fz(i,j,k) = RK_Fz(i,j,k) + dtRAD*(bBar_RK(s)*RHS_F(s,1,3,i,j,k) + b_RK(s)*RHS_F(s,2,3,i,j,k))
         RK_Ft(i,j,k) = RK_Ft(i,j,k) + dtRAD*(bBar_RK(s)*RHS_F(s,1,4,i,j,k) + b_RK(s)*RHS_F(s,2,4,i,j,k))
        enddo

       endif


       IF (RK_ER(i,j,k).LE.0.D0) THEN
         count_Err1 = count_Err1 + 1
         RK_ER(i,j,k) = ER_ijk
         print*, "RK_ER<0",i,j,k,RK_ER(i,j,k)
       END IF
       Nf = DSQRT(RK_Fr(i,j,k)**2.D0 + RK_Fz(i,j,k)**2.D0 + RK_Ft(i,j,k)**2.D0)/(V_light*RK_ER(i,j,k))
       IF (Nf.GT.1.D0) THEN
         count_Err2 = count_Err2 + 1
         inv_Nf = 1.D0 / Nf
         RK_Fr(i,j,k) = RK_Fr(i,j,k) * inv_Nf
         RK_Fz(i,j,k) = RK_Fz(i,j,k) * inv_Nf
         RK_Ft(i,j,k) = RK_Ft(i,j,k) * inv_Nf
       ENDIF
     else
       PRINT*, "SIGMA_P = 0", my_id
     endif

     end do
   end do
  end do
  !$acc end kernels

In this kernel I call two other subroutines (ABS_EFF and gaussj) that are located in the same file and that carry !$acc routine directives. When I compile the code I obtain the following information:

rad_eddington:
   1177, Generating implicit copyin(fz_old(:,:,:),a_rk(runge,:)) [if not already present]
         Generating implicit copy(nvec(:)) [if not already present]
         Generating implicit copyin(sigma_p(:,:,:)) [if not already present]
         Generating implicit copy(ft_ijk,fz_ijk,fr_ijk,er_ijk) [if not already present]
         Generating implicit copyin(flux_pr_rz(:,:,:),flux_pr_tt(:,:,:),flux_pr_tr(:,:,:),flux_pr_tz(:,:,:),flux_pr_zt(:,:,:),flux_pr_zr(:,:,:),flux_fr_z(:,:,:),flux_pr_rt(:,:,:),flux_pr_rr(:,:,:)) [if not already present]
         Generating implicit copy(f(:)) [if not already present]
         Generating implicit copyin(delta_a(:),flux_fr_t(:,:,:),delta_r(:),flux_fr_r(:,:,:)) [if not already present]
         Generating implicit copy(count_err2,sf,se,sev,rk_fz(:,:,:),sm(:)) [if not already present]
         Generating implicit copyin(ft(:,:,:),fz(:,:,:),fr(:,:,:),er(:,:,:),t(:,:,:)) [if not already present]
         Generating implicit copy(sp) [if not already present]
         Generating implicit copyin(delta_z(:),flux_pr_zz(:,:,:),ft_old(:,:,:),fr_old(:,:,:)) [if not already present]
         Generating implicit copy(sfv) [if not already present]
         Generating implicit copyin(minj(b),er_old(:,:,:)) [if not already present]
         Generating implicit copy(t_ijk,vec(:),rhs_f(:,1:2,1:4,:,:,:)) [if not already present]
         Generating implicit copyin(abar_rk(runge,:),b_rk(:)) [if not already present]
         Generating implicit copy(rk_er(:,:,:),rk_ft(:,:,:),rk_fr(:,:,:),mat(:,:)) [if not already present]
         Generating implicit copyin(maxj(b),maxi(b),mini(b),bbar_rk(:)) [if not already present]
         Generating implicit copy(count_err1) [if not already present]
   1178, Complex loop carried dependence of maxj,minj,maxi,mini,t,er,fr,fz,ft,sigma_p,delta_a,flux_fr_t,delta_z,flux_fr_z,delta_r,flux_fr_r,flux_pr_rt,flux_pr_rz,flux_pr_rr,flux_pr_zt,flux_pr_zz,flux_pr_zr,flux_pr_tt,flux_pr_tz,flux_pr_tr,er_old prevents parallelization
         Loop carried dependence due to exposed use of nvec(:) prevents parallelization
         Complex loop carried dependence of fr_old prevents parallelization
         Loop carried dependence due to exposed use of rk_er(:,:,:) prevents parallelization
         Complex loop carried dependence of fz_old prevents parallelization
         Loop carried dependence due to exposed use of rk_ft(:,:,:) prevents parallelization
         Complex loop carried dependence of ft_old prevents parallelization
         Loop carried dependence due to exposed use of rk_fr(:,:,:),rk_fz(:,:,:) prevents parallelization
         Complex loop carried dependence of rhs_f prevents parallelization
         Loop carried dependence due to exposed use of mat(:,:),f(:),vec(:),sm(:) prevents parallelization
         Generating NVIDIA GPU code
       1178, !$acc loop seq
             Generating implicit reduction(+:count_err1,count_err2)
       1179, !$acc loop seq
             Generating implicit reduction(+:count_err2,count_err1)
       1180, !$acc loop seq
             Generating implicit reduction(+:count_err2,count_err1)
       1228, !$acc loop seq
       1236, !$acc loop vector(32) collapse(2) ! threadidx%x
               ! threadidx%x auto-collapsed
       1323, !$acc loop seq
   1179, Complex loop carried dependence of maxi,mini,t,er,fr,fz,ft,sigma_p,delta_a,flux_fr_t,delta_z,flux_fr_z,delta_r,flux_fr_r,flux_pr_rt,flux_pr_rz,flux_pr_rr,flux_pr_zt,flux_pr_zz,flux_pr_zr,flux_pr_tt,flux_pr_tz,flux_pr_tr,er_old prevents parallelization
         Loop carried dependence due to exposed use of nvec(:) prevents parallelization
         Complex loop carried dependence of fr_old prevents parallelization
         Loop carried dependence due to exposed use of rk_er(:,:,:) prevents parallelization
         Complex loop carried dependence of fz_old prevents parallelization
         Loop carried dependence due to exposed use of rk_ft(:,:,:) prevents parallelization
         Complex loop carried dependence of ft_old prevents parallelization
         Loop carried dependence due to exposed use of rk_fr(:,:,:),rk_fz(:,:,:) prevents parallelization
         Complex loop carried dependence of rhs_f prevents parallelization
         Parallelization requires privatization of rhs_f as well as last value
         Loop carried dependence due to exposed use of mat(:,:),f(:),vec(:),sm(:) prevents parallelization
         Complex loop carried dependence of mat prevents parallelization
   1180, Complex loop carried dependence of t,er,fr,fz,ft,sigma_p,delta_a,flux_fr_t,delta_z,flux_fr_z,delta_r,flux_fr_r,flux_pr_rt,flux_pr_rz,flux_pr_rr,flux_pr_zt,flux_pr_zz,flux_pr_zr,flux_pr_tt,flux_pr_tz,flux_pr_tr,er_old,rk_er prevents parallelization
         Loop carried dependence due to exposed use of nvec(:) prevents parallelization
         Complex loop carried dependence of fr_old,rk_fr prevents parallelization
         Loop carried dependence due to exposed use of rk_er(:,:,:) prevents parallelization
         Complex loop carried dependence of fz_old,rk_fz prevents parallelization
         Loop carried dependence due to exposed use of rk_ft(:,:,:) prevents parallelization
         Complex loop carried dependence of ft_old,rk_ft prevents parallelization
         Loop carried dependence due to exposed use of rk_fr(:,:,:) prevents parallelization
         Complex loop carried dependence of sm prevents parallelization
         Loop carried dependence due to exposed use of rk_fz(:,:,:) prevents parallelization
         Complex loop carried dependence of rhs_f prevents parallelization
         Parallelization requires privatization of rhs_f as well as last value
         Complex loop carried dependence of nvec prevents parallelization
         Loop carried dependence due to exposed use of mat(:,:),f(:),vec(:),sm(:) prevents parallelization
         Generating implicit private(fz_ijk,it,inv_nf,nf,tsumille,t_ijk,divpr_z,divpr_r,divpr_t,divfr,res,fr_ijk,ft_ijk,er_ijk,dt_a_c,sp)
   1189, Reference argument passing prevents parallelization: sfv
         Reference argument passing prevents parallelization: sev
         Reference argument passing prevents parallelization: sf
         Reference argument passing prevents parallelization: se
         Reference argument passing prevents parallelization: sp
         Reference argument passing prevents parallelization: ft_ijk
         Reference argument passing prevents parallelization: fz_ijk
         Reference argument passing prevents parallelization: fr_ijk
         Reference argument passing prevents parallelization: er_ijk
         Reference argument passing prevents parallelization: t_ijk
   1228, Complex loop carried dependence of sm prevents parallelization
         Parallelization requires privatization of sm as well as last value
   1236, Loop is parallelizable
   1238, Accelerator restriction: induction variable live-out from loop: it
         Complex loop carried dependence of nvec,vec prevents parallelization
         Loop carried dependence due to exposed use of vec(:),f(:) prevents parallelization
         Complex loop carried dependence of f prevents parallelization
         Parallelization requires privatization of nvec as well as last value
         Scalar last value needed after loop for res at line 1300
         Parallelization would require privatization of array mat(:,:)
         Generating implicit private(res,it,app_er)
   1239, Accelerator restriction: induction variable live-out from loop: it
   1242, Reference argument passing prevents parallelization: sfv
         Reference argument passing prevents parallelization: sev
         Reference argument passing prevents parallelization: sf
         Reference argument passing prevents parallelization: se
         Reference argument passing prevents parallelization: sp
         Reference argument passing prevents parallelization: t_ijk
   1248, Reference argument passing prevents parallelization: sfv
         Reference argument passing prevents parallelization: sev
         Reference argument passing prevents parallelization: sf
         Reference argument passing prevents parallelization: se
         Reference argument passing prevents parallelization: sp
         Reference argument passing prevents parallelization: t_ijk
   1253, Reference argument passing prevents parallelization: sfv
         Reference argument passing prevents parallelization: sev
         Reference argument passing prevents parallelization: sf
         Reference argument passing prevents parallelization: se
         Reference argument passing prevents parallelization: sp
         Reference argument passing prevents parallelization: t_ijk
   1258, Reference argument passing prevents parallelization: sfv
         Reference argument passing prevents parallelization: sev
         Reference argument passing prevents parallelization: sf
         Reference argument passing prevents parallelization: se
         Reference argument passing prevents parallelization: sp
         Reference argument passing prevents parallelization: t_ijk
   1299, Reference argument passing prevents parallelization: 
   1300, Reference argument passing prevents parallelization: 
   1301, Reference argument passing prevents parallelization: 
   1312, Reference argument passing prevents parallelization: sfv
         Reference argument passing prevents parallelization: sev
         Reference argument passing prevents parallelization: sf
         Reference argument passing prevents parallelization: se
         Reference argument passing prevents parallelization: sp
         Reference argument passing prevents parallelization: t_ijk
   1323, Loop carried dependence due to exposed use of rk_er(*,*,*) prevents parallelization
         Complex loop carried dependence of rk_er prevents parallelization
         Loop carried dependence due to exposed use of rk_fr(*,*,*),rk_er(:,:,:) prevents parallelization
         Complex loop carried dependence of rk_fz,rk_ft prevents parallelization
         Loop carried dependence due to exposed use of rk_fz(:,:,:),rk_fz(*,*,*),rk_ft(:,:,:) prevents parallelization
         Complex loop carried dependence of rk_fr prevents parallelization
         Loop carried dependence due to exposed use of rk_ft(*,*,*),rk_fr(:,:,:) prevents parallelization
   1336, Reference argument passing prevents parallelization: 
   1347, Reference argument passing prevents parallelization: 
gaussj:
   2504, Generating acc routine seq
         Generating NVIDIA GPU code
abs_eff:
   2583, Generating acc routine seq
         Generating NVIDIA GPU code

When I run the code I obtain this error:

FORTRAN STOP: 0: Block (1,1,1), Thread (1,1,1)
stop08.cu:28: pgf90_stop08a: block: [0,0,0], thread: [0,0,0] Assertion `FORTRAN_STOP_STATEMENT` failed.
Failing in Thread:1
Accelerator Fatal Error: call to cuStreamSynchronize returned error 710: Other
 File: /afs/enea.it/por/user/cimini/SRC/GPU/HeaRT_NEC_nunzio_ACC_RADE/Xcrescogpu_O2/../SOLVER/RAD/Radiation_Eddington_PEER4S.f90
 Function: rad_eddington:868
 Line: 1178

--------------------------------------------------------------------------
Primary job  terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:

  Process name: [[37517,1],0]
  Exit code:    1

Has anyone had the same problem? Any ideas?

Thanks all in advance!

Hi matteo.cimini1,

I believe the issue is that you’re trying to call “stop” in a kernels region executing in parallel on the GPU. Ideally, I think, instead of calling “stop” when you hit that exit condition, you would update a scalar value to serve as a “flag”. Then, after the kernel, you check the scalar’s value in a CPU region and, if it’s set to the stop condition, you stop there. You could still print the i,j,k and res values - you just wouldn’t stop inside the kernel.
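A minimal sketch of that flag approach (“stop_flag” is an illustrative name, not from your code):

  stop_flag = 0
  !$acc kernels copy(stop_flag)
  do k = MINk(B),MAXk(B)
    do j = MINj(B),MAXj(B)
      do i = MINi(B),MAXi(B)
        ! ... loop body as before ...
        if (it>9999) then
          print*, "it>10000", res, i, j, k
          !$acc atomic write
          stop_flag = 1      ! record the failure instead of stopping
        endif
      end do
    end do
  end do
  !$acc end kernels

  if (stop_flag /= 0) stop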

So a quick verification of that would be to just comment out the “stop” and see that the code continues execution.

However, I think it’s a compiler bug that we don’t catch this and warn you about it ahead of time - or force the loop to be a serial loop because of the existence of the stop. We shouldn’t let this happen, I think. I’m going to raise an issue with engineering about it.

Please let me know if that helps!

Seth is correct that the message means the kernel is encountering the STOP statement, though I don’t think there’s a compiler issue here. If I read the code correctly, the program is supposed to stop when the do while loop terminates by reaching the max iteration count (it .eq. 10000). So the question is why “res” is staying above 0.0001d0.

I can’t say for sure, but I do see that you have several scratch arrays (sm, nvec, vec, Mat, f). Arrays are shared by default, so each vector lane will be overwriting the others’ data. Try privatizing these arrays.

Note that you can’t put a private clause on a “kernels” directive, so you’ll need to add a “loop” clause. Something like:

!$acc kernels loop collapse(3) private(sm,nvec,vec,Mat,f)

If I missed any other scratch array, please add it to the private clause.

-Mat


And looking at your compiler feedback messages, I do see several scalars that may need to be privatized as well, like Sp, sfv, ft_ijk, fz_ijk, fr_ijk, er_ijk, sf, se, sev. Scalars are typically private by default, except when they are global (i.e. in a module) or passed by reference (the default in Fortran) to a subroutine.

count_Err1 and count_Err2 seem like you do want them to be shared, so to avoid a race condition, I’d explicitly copy them into the kernels region and add an atomic directive.

!$acc kernels loop collapse(3) private(sm,nvec,vec,Mat,f,Sp,sfv,ft_ijk,fz_ijk,fr_ijk,er_ijk,sf,se,sev) copy(count_Err1,count_Err2)
...
       IF (RK_ER(i,j,k).LE.0.D0) THEN
!$acc atomic update
         count_Err1 = count_Err1 + 1
         RK_ER(i,j,k) = ER_ijk
...
       IF (Nf.GT.1.D0) THEN
!$acc atomic update
         count_Err2 = count_Err2 + 1
         inv_Nf = 1.D0 / Nf

Of course, double check me by reviewing the compiler feedback messages for any remaining “implicit” copies.


Thank you for the suggestion. Actually this is just a check and the stop condition is never reached in this case. Anyway, I commented it out for the moment and the error disappeared.

Now I have other problems with the correctness of results :)

Thank you again!

Hi Mat!

Thank you for your help! I implemented all your suggestions and now the code is much better. Actually there was an error in the code, and I need sm, nvec, vec, Mat and f to also be functions of (i,j,k) for other reasons in the code. For this reason, I allocated them as, for example, sm(4,:,:,:).
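For example, the scratch accesses in the kernel now look like this (just a sketch of the indexing change):

  ! each (i,j,k) point now has its own scratch storage, so these
  ! arrays no longer need to be privatized
  sm(1,i,j,k) = ER_old(i,j,k)
  sm(2,i,j,k) = Fr_old(i,j,k)
  sm(3,i,j,k) = Fz_old(i,j,k)
  sm(4,i,j,k) = Ft_old(i,j,k)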

I also followed the suggestion about the atomic directive on count_Err and all the other scalars. Now the code has the following directives:

!$acc parallel loop gang vector collapse(3) &
!$acc& private(inv_rC,T_ijk,T2_ijk,ER_ijk,Fr_ijk,Fz_ijk,Ft_ijk,Sp,Se,Sf,SEv,Sfv) &
!$acc& private(SEcdt,SFcdT,cER_old) & 
!$acc& private(DivPR_r,DivPR_z,DivPR_t,DivFR) & 
!$acc& private(dt_a_c,Tsumille,s,res,it,app_ER) &
!$acc& private(Nf,inv_Nf) &
!$acc& copy(i,j,k,count_Err1,count_Err2)
do k = MINk(B),MAXk(B)
  do j = MINj(B),MAXj(B)
    do i = MINi(B),MAXi(B)

           ...Code...

  end do
 end do
end do
!$acc update self(RK_ER,RK_Fr,RK_Fz,RK_Ft)
!$acc update self(ER,Fr,Fz,Ft)
!$acc update self(count_Err1,count_Err2)

The outputs of this kernel are RK_ER, RK_Fr, RK_Fz, RK_Ft. I checked the results of the GPU code vs the CPU one at some i,j,k points and apparently I obtain correct results. However, after this kernel I do other operations and something goes wrong: I get a segmentation fault due to an index overrun. I inserted this check:

do k = MINk(B),MAXk(B)
  do j = MINj(B),MAXj(B)
    do i = MINi(B),MAXi(B)

       if ((ER(i,j,k).gt.1e+10).or.(Fr(i,j,k).gt.1e+10).or.(Fz(i,j,k).gt.1e+10).or.(Ft(i,j,k).gt.1e+10)) then
        print*,"ERROR!!",i,j,k,ER(i,j,k), Fr(i,j,k), Fz(i,j,k), Ft(i,j,k)
        STOP
       end if

   end do
 end do
end do

Indeed, sometimes I get Inf or NaN values at some points. This is strange to me since at other points I get the correct result. Do you have any suggestions?

Thank you again :)

I’d first ensure that the code gets correct results without enabling the GPU offload in case it’s an algorithmic error.

Next, compile targeting the multicore CPU (e.g. -acc=multicore). If it works, then it’s more likely a data issue; if it still fails, it could be a parallelization issue, such as a race condition. Though there are far fewer threads on a CPU, so if there is a race condition it may not be encountered as often and might only show up on the GPU. Another tell that it’s likely a race condition is if the bad data seems to be random.

Another possibility is uninitialized memory or an out-of-bounds read/write. For this, try running the utility “compute-sanitizer ./a.out” (where a.out is your binary).

If that doesn’t give clues, the next step would be to add more of your debugging code in the earlier kernels. Try to trace back to where the bad numbers get introduced. Hopefully once you know where they are coming from, it will give ideas on how to solve it.


Checked, all good from the CPU side.

Thanks for the suggestion. I forgot to mention that the error arises only with the collapse(3) directive. If I do not include it, the GPU code works but is slower than the CPU one.

Checked, all good from this aspect.

I think the bad numbers are introduced in the kernel I’m trying to accelerate, because the error comes from a variable that is an output of the kernel itself. I checked all the code again and found that I had missed updating some variables from CPU to GPU. Now, from my point of view, everything is in place, but the issue still persists. Is it possible that I’m missing some fundamental directive? The following are implemented:

  !$acc enter data create(T(:,:,:),ER(:,:,:),Fr(:,:,:),Fz(:,:,:),Ft(:,:,:)) 
  !$acc enter data copyin(SIGMA_P(:,:,:))
  !$acc enter data create(ER_old(:,:,:),Fr_old(:,:,:),Fz_old(:,:,:),Ft_old(:,:,:))
  !$acc enter data create(FLUX_FR_r(:,:,:),FLUX_FR_z(:,:,:),FLUX_FR_t(:,:,:))
  !$acc enter data create(FLUX_PR_rr(:,:,:),FLUX_PR_rz(:,:,:),FLUX_PR_rt(:,:,:))
  !$acc enter data create(FLUX_PR_zr(:,:,:),FLUX_PR_zz(:,:,:),FLUX_PR_zt(:,:,:))
  !$acc enter data create(FLUX_PR_tr(:,:,:),FLUX_PR_tz(:,:,:),FLUX_PR_tt(:,:,:))
  !$acc enter data create(Delta_r(:),Delta_z(:),Delta_a(:))
  !$acc enter data create(RHS_F(:,:,:,:,:,:))
  !$acc enter data create(r(:))
  !$acc enter data create(ABar_RK,A_RK,bBar_RK,b_RK)
  !$acc enter data create(runge,IMEX_s)

  !$acc update device(B,MINi,MAXi,MINj,MAXj,MINk,MAXk,V_light,my_gpu,const_2D,cdt,r(:),dtRAD)
  !$acc update device(T(:,:,:),ER(:,:,:),Fr(:,:,:),Fz(:,:,:),Ft(:,:,:))
  !$acc update device(SIGMA_P(:,:,:))
  !$acc update device(ER_old(:,:,:),Fr_old(:,:,:),Fz_old(:,:,:),Ft_old(:,:,:))
  !$acc update device(FLUX_FR_r(:,:,:),FLUX_FR_z(:,:,:),FLUX_FR_t(:,:,:))
  !$acc update device(FLUX_PR_rr(:,:,:),FLUX_PR_rz(:,:,:),FLUX_PR_rt(:,:,:))
  !$acc update device(FLUX_PR_zr(:,:,:),FLUX_PR_zz(:,:,:),FLUX_PR_zt(:,:,:))
  !$acc update device(FLUX_PR_tr(:,:,:),FLUX_PR_tz(:,:,:),FLUX_PR_tt(:,:,:))
  !$acc update device(Delta_r(:),Delta_z(:),Delta_a(:))
  !$acc update device(RK_ER(:,:,:), RK_Fr(:,:,:), RK_Fz(:,:,:), RK_Ft(:,:,:))
  !$acc update device(ABar_RK,A_RK,bBar_RK,b_RK)
  !$acc update device(runge,IMEX_s)

  !$acc parallel loop gang vector collapse(3) &
  !$acc& private(inv_rC,T_ijk,T2_ijk,ER_ijk,Fr_ijk,Fz_ijk,Ft_ijk,Sp,Se,Sf,SEv,Sfv) &
  !$acc& private(SEcdt,SFcdT,cER_old) & 
  !$acc& private(DivPR_r,DivPR_z,DivPR_t,DivFR) & 
  !$acc& private(dt_a_c,Tsumille,s,res,it,app_ER) &
  !$acc& private(Nf,inv_Nf,r(:)) &
  !$acc& copy(i,j,k,count_Err1,count_Err2,runge,IMEX_s)
  do k
    do j
      do i
         ...Code...
     end do
   end do
  end do
  !$acc update self(RK_ER,RK_Fr,RK_Fz,RK_Ft)
  !$acc update self(ER,Fr,Fz,Ft)
  !$acc update self(count_Err1,count_Err2)
  !$acc exit data delete(T(:,:,:),ER(:,:,:),Fr(:,:,:),Fz(:,:,:),Ft(:,:,:)) 
  !$acc exit data delete(SIGMA_P(:,:,:))
  !$acc exit data delete(ER_old(:,:,:),Fr_old(:,:,:),Fz_old(:,:,:),Ft_old(:,:,:))
  !$acc exit data delete(FLUX_FR_r(:,:,:),FLUX_FR_z(:,:,:),FLUX_FR_t(:,:,:))
  !$acc exit data delete(FLUX_PR_rr(:,:,:),FLUX_PR_rz(:,:,:),FLUX_PR_rt(:,:,:))
  !$acc exit data delete(FLUX_PR_zr(:,:,:),FLUX_PR_zz(:,:,:),FLUX_PR_zt(:,:,:))
  !$acc exit data delete(FLUX_PR_tr(:,:,:),FLUX_PR_tz(:,:,:),FLUX_PR_tt(:,:,:))
  !$acc exit data delete(Delta_r(:),Delta_z(:),Delta_a(:))
  !$acc exit data delete(RHS_F(:,:,:,:,:,:))
  !$acc exit data delete(r(:))
  !$acc exit data delete(ABar_RK,A_RK,bBar_RK,b_RK)
  !$acc exit data delete(RK_ER,RK_Fr,RK_Fz,RK_Ft)
  !$acc exit data delete(sm,Mat,vec,nvec,f)

and I get the following information from the compiler:

mpif90 -c -r8 -acc=gpu,noautopar -target=gpu -gpu=cc70 -Mpreprocess -Mfree -Mextend -Munixlogical -Mbyteswapio -traceback -Mchkstk -Mnostack_arrays -Mnofprelaxed -Mnofpapprox -Minfo=accel  -I../INCLUDE -I../STATISTICS -I../MODULE -I../MODULE/MPH -I../POST_HEART  ../SOLVER/RAD/Radiation_Eddington_PEER4S.f90
rad_eddington:
    901, Generating create(l,cc,i,j,k,level,b,c_p1,ll,m) [if not already present]
    904, Generating create(count_err1,count_err2) [if not already present]
    907, Generating create(divpr_z,er_ijk,ft_ijk,fr_ijk,divpr_t,divpr_r,fz_ijk) [if not already present]
    909, Generating create(sfv,sev,se_v,sf_v,se,sf,sp) [if not already present]
    911, Generating create(inv_nf,inv_r,no_div_0,inv_rc,sor,dt_local,tmp) [if not already present]
    915, Generating create(er_t,er_r,er_z) [if not already present]
    917, Generating create(t2_ijk,t_ijk,tsumille) [if not already present]
    923, Generating create(c2dt,nf,secdt,cdt,cer_old,sfcdt) [if not already present]
    926, Generating create(inv_mat(:,:)) [if not already present]
    933, Generating create(app_er,rhs_er,rhs_ft,rhs_fr,rhs_fz) [if not already present]
    937, Generating create(max_k,min_i,max_i,min_j,max_j,min_k) [if not already present]
    939, Generating create(iter_peer,s) [if not already present]
    951, Generating create(pd,krr,rd,it,kr,ss) [if not already present]
    955, Generating create(dt_a_c,sd(:),res,u_sol(:)) [if not already present]
    962, Generating copyin(my_gpu) [if not already present]
    963, Generating create(maxk(:),mini(:),maxi(:),minj(:),maxj(:),mink(:)) [if not already present]
    964, Generating create(v_light,dtrad,const_2d) [if not already present]
   1216, Generating enter data create(fr(:,:,:),ft(:,:,:),er(:,:,:),fz(:,:,:),t(:,:,:))
   1217, Generating enter data copyin(sigma_p(:,:,:))
   1218, Generating enter data create(fr_old(:,:,:),ft_old(:,:,:),er_old(:,:,:),fz_old(:,:,:))
   1219, Generating enter data create(flux_fr_r(:,:,:),flux_fr_t(:,:,:),flux_fr_z(:,:,:))
   1220, Generating enter data create(flux_pr_rr(:,:,:),flux_pr_rt(:,:,:),flux_pr_rz(:,:,:))
   1221, Generating enter data create(flux_pr_zr(:,:,:),flux_pr_zt(:,:,:),flux_pr_zz(:,:,:))
   1222, Generating enter data create(flux_pr_tr(:,:,:),flux_pr_tt(:,:,:),flux_pr_tz(:,:,:))
   1223, Generating enter data create(delta_r(:),delta_a(:),delta_z(:))
   1224, Generating enter data create(rhs_f(:,:,:,:,:,:))
   1225, Generating enter data create(r(:))
   1226, Generating enter data create(abar_rk(:,:),b_rk(:),a_rk(:,:),bbar_rk(:))
   1227, Generating enter data create(runge,imex_s)
   1229, Generating update device(const_2d,my_gpu,b,cdt,mink(:),r(:),maxj(:),minj(:),maxi(:),mini(:),dtrad,maxk(:),v_light)
   1230, Generating update device(fr(:,:,:),ft(:,:,:),er(:,:,:),fz(:,:,:),t(:,:,:))
   1231, Generating update device(sigma_p(:,:,:))
   1232, Generating update device(fr_old(:,:,:),ft_old(:,:,:),er_old(:,:,:),fz_old(:,:,:))
   1233, Generating update device(flux_fr_r(:,:,:),flux_fr_t(:,:,:),flux_fr_z(:,:,:))
   1234, Generating update device(flux_pr_rr(:,:,:),flux_pr_rt(:,:,:),flux_pr_rz(:,:,:))
   1235, Generating update device(flux_pr_zr(:,:,:),flux_pr_zt(:,:,:),flux_pr_zz(:,:,:))
   1236, Generating update device(flux_pr_tr(:,:,:),flux_pr_tt(:,:,:),flux_pr_tz(:,:,:))
   1237, Generating update device(delta_r(:),delta_a(:),delta_z(:))
   1238, Generating update device(rk_fr(:,:,:),rk_ft(:,:,:),rk_er(:,:,:),rk_fz(:,:,:))
   1239, Generating update device(abar_rk(:,:),b_rk(:),a_rk(:,:),bbar_rk(:))
   1240, Generating update device(runge,imex_s)
   1246, Generating implicit copyin(fz_old(:,:,:)) [if not already present]
         Generating copy(i,count_err1) [if not already present]
         Generating implicit copy(rhs_f(:,1:2,1:4,:,:,:)) [if not already present]
         Generating NVIDIA GPU code
       1254, !$acc loop gang, vector(128) collapse(3) ! blockidx%x threadidx%x
       1255,   ! blockidx%x threadidx%x collapsed
       1256,   ! blockidx%x threadidx%x collapsed
       1328, !$acc loop seq
       1347, !$acc loop seq
       1455, !$acc loop seq
   1246, Generating copy(j) [if not already present]
         Generating implicit copyin(ft(:,:,:),fz(:,:,:),fr(:,:,:),flux_pr_rz(:,:,:),flux_pr_tt(:,:,:),flux_pr_tr(:,:,:),flux_pr_tz(:,:,:),flux_pr_zt(:,:,:),flux_pr_zr(:,:,:),flux_fr_z(:,:,:),flux_pr_rt(:,:,:),flux_pr_rr(:,:,:),er_old(:,:,:),delta_a(:),flux_fr_t(:,:,:),delta_r(:),flux_fr_r(:,:,:)) [if not already present]
         Generating copy(count_err2) [if not already present]
         Generating implicit copyin(t(:,:,:),sigma_p(:,:,:),delta_z(:),flux_pr_zz(:,:,:),ft_old(:,:,:),fr_old(:,:,:)) [if not already present]
         Generating copy(runge) [if not already present]
         Generating implicit copyin(er(:,:,:)) [if not already present]
   1256, Generating implicit firstprivate(my_id)
   1328, Complex loop carried dependence of sm prevents parallelization
         Loop carried dependence of sm prevents parallelization
         Loop carried backward dependence of sm prevents vectorization
   1347, Loop is parallelizable
   1358, Loop carried scalar dependence for it at line 1359,1358
         Complex loop carried dependence of nvec,mat,vec prevents parallelization
         Loop carried dependence of vec,nvec prevents parallelization
         Loop carried backward dependence of vec,nvec prevents vectorization
         Loop carried dependence of mat prevents parallelization
         Loop carried backward dependence of mat prevents vectorization
         Complex loop carried dependence of f prevents parallelization
         Loop carried dependence of f prevents parallelization
         Loop carried backward dependence of f prevents vectorization
   1361, Reference argument passing prevents parallelization: sf
         Reference argument passing prevents parallelization: se
         Reference argument passing prevents parallelization: sev
         Reference argument passing prevents parallelization: sfv
         Reference argument passing prevents parallelization: sp
         Reference argument passing prevents parallelization: t_ijk
   1366, Reference argument passing prevents parallelization: sf
         Reference argument passing prevents parallelization: se
         Reference argument passing prevents parallelization: sev
         Reference argument passing prevents parallelization: sfv
         Reference argument passing prevents parallelization: sp
         Reference argument passing prevents parallelization: t_ijk
   1371, Reference argument passing prevents parallelization: sf
         Reference argument passing prevents parallelization: se
         Reference argument passing prevents parallelization: sev
         Reference argument passing prevents parallelization: sfv
         Reference argument passing prevents parallelization: sp
         Reference argument passing prevents parallelization: t_ijk
   1376, Reference argument passing prevents parallelization: sf
         Reference argument passing prevents parallelization: se
         Reference argument passing prevents parallelization: sev
         Reference argument passing prevents parallelization: sfv
         Reference argument passing prevents parallelization: t_ijk
         Reference argument passing prevents parallelization: sp
   1455, Complex loop carried dependence of rk_er prevents parallelization
         Loop carried dependence of rk_er prevents parallelization
         Loop carried backward dependence of rk_er prevents vectorization
         Complex loop carried dependence of rk_fz,rk_ft prevents parallelization
         Loop carried dependence of rk_fz,rk_ft prevents parallelization
         Loop carried backward dependence of rk_fz,rk_ft prevents vectorization
         Complex loop carried dependence of rk_fr prevents parallelization
         Loop carried dependence of rk_fr prevents parallelization
         Loop carried backward dependence of rk_fr prevents vectorization
   1492, Generating update self(rk_fr(:,:,:),rk_ft(:,:,:),rk_er(:,:,:),rk_fz(:,:,:))
   1493, Generating update self(fr(:,:,:),ft(:,:,:),er(:,:,:),fz(:,:,:))
   1494, Generating update self(count_err2,count_err1)
   1495, Generating exit data delete(fr(:,:,:),ft(:,:,:),er(:,:,:),fz(:,:,:),t(:,:,:))
   1496, Generating exit data delete(sigma_p(:,:,:))
   1497, Generating exit data delete(fr_old(:,:,:),ft_old(:,:,:),er_old(:,:,:),fz_old(:,:,:))
   1498, Generating exit data delete(flux_fr_r(:,:,:),flux_fr_t(:,:,:),flux_fr_z(:,:,:))
   1499, Generating exit data delete(flux_pr_rr(:,:,:),flux_pr_rt(:,:,:),flux_pr_rz(:,:,:))
   1500, Generating exit data delete(flux_pr_zr(:,:,:),flux_pr_zt(:,:,:),flux_pr_zz(:,:,:))
   1501, Generating exit data delete(flux_pr_tr(:,:,:),flux_pr_tt(:,:,:),flux_pr_tz(:,:,:))
   1502, Generating exit data delete(delta_r(:),delta_a(:),delta_z(:))
   1503, Generating exit data delete(rhs_f(:,:,:,:,:,:))
   1504, Generating exit data delete(r(:))
   1505, Generating exit data delete(abar_rk(:,:),b_rk(:),a_rk(:,:),bbar_rk(:))
   1558, Generating exit data delete(rk_fr(:,:,:),rk_ft(:,:,:),rk_er(:,:,:),rk_fz(:,:,:))
   1559, Generating exit data delete(mat(:,:,:,:,:),vec(:,:,:,:),sm(:,:,:,:),f(:,:,:,:),nvec(:,:,:,:))
emac:
   2694, Generating acc routine seq
         Generating NVIDIA GPU code
gaussj:
   2782, Generating acc routine seq
         Generating NVIDIA GPU code

To me it is strange that only collapse(3) induces the error.

Thanks again for all your support!!!

I would verify that the scalars are correct. For example, “Se” and “Sf”:

       call ABS_EFF(T_ijk,ER_ijk,Fr_ijk,Fz_ijk,Ft_ijk,Sp,0,Se,Sf,SEv,Sfv)
       SEcdt  = Se * cdt
       SFcdt  = Sf * cdt

From the feedback messages these are in a create clause, making them global but uninitialized.

909, Generating create(sfv,sev,se_v,sf_v,se,sf,sp) [if not already present]

Though you also have them in a private clause:

!$acc& private(inv_rC,T_ijk,T2_ijk,ER_ijk,Fr_ijk,Fz_ijk,Ft_ijk,Sp,Se,Sf,SEv,Sfv) &

Technically they can’t be both global and private. I think private is going to win out here, but it’s something to clean up.

I don’t know what’s done in ABS_EFF. If the routine sets Se and Sf, then using private is fine. Private creates a private copy of the variable but doesn’t initialize it. If ABS_EFF doesn’t set their values, then they are uninitialized on the device and problematic.

If they are read-only, then it’s fine to make them global, but using “create” only creates the device data; it does not set its value. For that, you either need a “copyin” clause or a following “update” directive to set the value.
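For example, either of these gives you an initialized, read-only global on the device (a sketch, using “cdt” as the stand-in):

!$acc enter data copyin(cdt)    ! allocates the device copy and sets its value

! ... or ...

!$acc enter data create(cdt)    ! allocates only; the value is undefined
!$acc update device(cdt)        ! must follow, to set the value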

A third option is the “firstprivate” clause. Like private, each thread gets its own private copy, but it also initializes each copy to the value from the host.
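To illustrate the difference, here’s a contrived sketch (not your code):

scale = 2.0d0
!$acc parallel loop private(tmp) firstprivate(scale)
do i = 1, n
   tmp  = scale * a(i)   ! "scale" is read before any write, so it needs firstprivate
   b(i) = tmp + 1.0d0    ! "tmp" is always written first, so private is fine
end do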

A second example is “cdt”. It appears in a “create” clause but I don’t see a corresponding “update”, hence it’s likely uninitialized.

There are too many variables for me to go through them all, and I don’t have the full source, so it’s difficult for me to highlight every potential issue. Instead, you should review each variable to ensure that the correct data clause is being applied.

If the scalar is a temp variable that gets assigned an intermediary value in the offloaded region, then it should be “private”. If the variable needs to be initialized before the first assignment, then use “firstprivate”.

If the scalar is a read-only variable, then use “firstprivate”, or it could be global (i.e. added to a data region), but ensure the device copy is initialized using “copyin” or an update directive. Though if it’s initialized on the device within another compute region, then “create” is fine.

Another issue is the loop indices i, j, and k. These must be private (no need to add them to a private clause, since the compiler must make them private), but you include them in a copy clause:

!$acc& copy(i,j,k,count_Err1,count_Err2,runge,IMEX_s)

Now the compiler is likely ignoring the global copies and using the private variables, but there’s room for confusion here. I’d remove i, j, and k from the copy clause.
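In other words, keep your private clauses as they are but trim the copy clause to something like:

!$acc& copy(count_Err1,count_Err2,runge,IMEX_s)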


Thank you Mat for all the suggestions!!! Sorry for the late reply, it took me some time to make all the changes.

I solved the issue by making the runge variable firstprivate.
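That is, moving it from the copy clause into firstprivate (a sketch; the remaining private/copy clauses stay as before):

! each thread now gets its own copy of runge, initialized from the host
!$acc parallel loop gang vector collapse(3) &
!$acc& firstprivate(runge) &
!$acc& copy(count_Err1,count_Err2)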

Have a nice day!!
