Hi, I am using PGI/18.5.
The openacc loop regime has some FORTRAN build-in and customized functions. It works fine for sequential program.
After check the minfo, it shows that all these functions requires implicit copyin. Eventually, the memory allocation becomes huge, exceeds the GPU’s capacity.
Would you please suggest how could I walk around that?
Thanks,
nt=1
!$acc data copyin(dt,Eop,Hop,Feihj,Feiej,Fhiej,Fhihj,Feihi,Feiei,FMJx,ncrosMJx,FMJy,ncrosMJy,E0,E,H,Fhiei,Fhihi,Me,Se) create(Fe,Fh,Erka,Erkb,Erkc,Erkd,Hrka,Hrkb,Hrkc,Hrkd,Enext,Hnext) copyout(Eneed1,Eneed2)
DO
tn=dble(nt)dt
E0=-exp(-4pi*(tn-t0)2/tm2)cos(omega(tn))
num_MJ1=0; num_MJ2=0
call initial() !initialize computation sequential
!$acc parallel loop independent present(dt,Eop,Hop,Feihj,Feiej,Fhiej,Fhihj,Feihi,Feiei,FMJx,ncrosMJx,FMJy,ncrosMJy,E0,E,H,Fhiei,Fhihi,Me,Se,Fe,Fh,Erka,Erkb,Erkc,Erkd,Hrka,Hrkb,Hrkc,Hrkd,Enext,Hnext)
!!$acc kernels
DO el=1,num_element
! Filed values at the neighboring element
Ej(:,:,el)=reshape((/Eop(1:4,1,el),Eop(1:4,3,el),Eop(1:4,1,el),Eop(1:4,4,el),Eop(1:4,1,el),Eop(1:4,5,el),Eop(1:4,2,el),Eop(1:4,3,el),Eop(1:4,2,el),Eop(1:4,4,el),&
Eop(1:4,2,el),Eop(1:4,5,el),Eop(1:4,3,el),Eop(1:4,5,el),Eop(1:4,3,el),Eop(1:4,4,el),Eop(1:4,4,el),Eop(1:4,5,el)/),(/8,9/))
Hj(:,:,el)=reshape((/Hop(1:4,1,el),Hop(1:4,3,el),Hop(1:4,1,el),Hop(1:4,4,el),Hop(1:4,1,el),Hop(1:4,5,el),Hop(1:4,2,el),Hop(1:4,3,el),Hop(1:4,2,el),Hop(1:4,4,el),&
Hop(1:4,2,el),Hop(1:4,5,el),Hop(1:4,3,el),Hop(1:4,5,el),Hop(1:4,3,el),Hop(1:4,4,el),Hop(1:4,4,el),Hop(1:4,5,el)/),(/8,9/))
! Flux value for the E-H marching
!!$acc loop
DO ii=1,9
Fe(ii,el)=dot_product(Feihj(ii,1:8,el),Hj(1:8,ii,el))+dot_product(Feiej(ii,1:8,el),Ej(1:8,ii,el))
Fh(ii,el)=-dot_product(Fhiej(ii,1:8,el),Ej(1:8,ii,el))+dot_product(Fhihj(ii,1:8,el),Hj(1:8,ii,el))
END DO
Fe(:,el)=Fe(:,el)-mmul(Feihi(:,:,el),H(:,el))-mmul(Feiei(:,:,el),E(:,el))-(FMJx(:,el)*ncrosMJx(el)+FMJy(:,el)*ncrosMJy(el))*E0
Fh(:,el)=Fh(:,el)+mmul(Fhiei(:,:,el),E(:,el))-mmul(Fhihi(:,:,el),H(:,el))-(FMJx(:,el)*MJx(el)+FMJy(:,el)*MJy(el))*E0
Erka(:,el)=1.d0/epsil(el)*(mmul(Me(:,:,el),mmul(Se(:,:,el),H(:,el))+Fe(:,el)))
Hrka(:,el)=-1.d0/mur*mmul(Me(:,:,el),mmul(Se(:,:,el),E(:,el))+Fh(:,el))
Erkb(:,el)=1.d0/epsil(el)*(mmul(Me(:,:,el),mmul(Se(:,:,el),H(:,el)+Hrka(:,el)*dt/2.d0)+Fe(:,el)))
Hrkb(:,el)=-1.d0/mur*mmul(Me(:,:,el),mmul(Se(:,:,el),E(:,el)+Erka(:,el)*dt/2.d0)+Fh(:,el))
Erkc(:,el)=1.d0/epsil(el)*(mmul(Me(:,:,el),mmul(Se(:,:,el),H(:,el)+Hrkb(:,el)*dt/2.d0)+Fe(:,el)))
Hrkc(:,el)=-1.d0/mur*mmul(Me(:,:,el),mmul(Se(:,:,el),E(:,el)+Erkb(:,el)*dt/2.d0)+Fh(:,el))
Erkd(:,el)=1.d0/epsil(el)*(mmul(Me(:,:,el),mmul(Se(:,:,el),H(:,el)+Hrkc(:,el)*dt)+Fe(:,el)))
Hrkd(:,el)=-1.d0/mur*mmul(Me(:,:,el),mmul(Se(:,:,el),E(:,el)+Erkc(:,el)*dt)+Fh(:,el))
Enext(1:9,el)=E(:,el)+(Erka(:,el)+2.d0*Erkb(:,el)+2.d0*Erkc(:,el)+Erkd(:,el))*dt/6.d0
Hnext(1:9,el)=H(:,el)+(Hrka(:,el)+2.d0*Hrkb(:,el)+2.d0*Hrkc(:,el)+Hrkd(:,el))*dt/6.d0
END DO
!!$acc end kernels
E=Enext
H=Hnext
Eneed1(1:3,nt+1,1:num_WP)=E(1:3,Eneed1num)
Eneed2(1:3,nt+1,1:num_WP)=E(1:3,Eneed2num)
if (MOD(nt,1000)==0)then
!print*,E0,Eneed2(1:3,nt+1,1)
!print*,nt
end if
nt=nt+1
if(nt==num_time)then
exit
end if
enddo
!$acc end data
And the compiler information goes:
Accelerator kernel generated
Generating Tesla code
105, !$acc loop gang ! blockidx%x
107, !$acc loop vector(32) ! threadidx%x
!$acc loop seq
109, !$acc loop seq
!$acc loop vector(32) ! threadidx%x
113, !$acc loop seq
114, !$acc loop vector(32) ! threadidx%x
Generating implicit reduction(+:feihj$r,feiej$r)
115, !$acc loop vector(32) ! threadidx%x
Generating implicit reduction(+:hj$r,ej$r)
117, !$acc loop vector(32) ! threadidx%x
118, !$acc loop vector(32) ! threadidx%x
120, !$acc loop vector(32) ! threadidx%x
121, !$acc loop vector(32) ! threadidx%x
123, !$acc loop vector(32) ! threadidx%x
!$acc loop seq
124, !$acc loop vector(32) ! threadidx%x
!$acc loop seq
126, !$acc loop vector(32) ! threadidx%x
!$acc loop seq
127, !$acc loop vector(32) ! threadidx%x
!$acc loop seq
129, !$acc loop vector(32) ! threadidx%x
!$acc loop seq
130, !$acc loop vector(32) ! threadidx%x
!$acc loop seq
132, !$acc loop vector(32) ! threadidx%x
103, Generating implicit copy(e$a23(:),h$a22(:),e$a(:),h$a(:))
Generating present(dt,fmjy(:,:),eop(:,:,:))
Generating implicit copyin(mjx(:))
Generating present(me(:,:,:))
Generating implicit copyin(z_c_1(:),z_c_0(:))
Generating present(ncrosmjy(:),e0)
Generating implicit copy(z_a_11(:),z_a_12(:),z_a_13(:),z_a_14(:),z_a_15(:),z_a_16(:),z_a_17(:))
Generating present(se(:,:,:))
Generating implicit copy(z_a_10(:),z_a_1(:),z_a_8(:),z_a_7(:),z_a_6(:),z_a_5(:),z_a_4(:),z_a_3(:),z_a_2(:),z_a_18(:),z_a_0(:),z_a_9(:))
Generating present(h(:,:))
Generating implicit copyin(reshap$r(1:8,1:9),mjy(:))
Generating implicit copy(mmul19(:),mmul20(:),mmul3(:),mmul5$a(:),mmul4(:),mmul5(:),mmul7$a(:),mmul6(:),mmul7(:),mmul9$a(:),mmul8(:),mmul1(:),mmul11$a(:),mmul10(:),mmul11(:),mmul13$a(:),mmul12(:),mmul13(:),mmul15$a(:),mmul14(:),mmul15(:),mmul17$a(:),mmul16(:),mmul17(:),mmul19$a(:),mmul18(:),mmul2(:))
107, Accelerator restriction: induction variable live-out from loop: z_i_0
Loop is parallelizable
Loop carried reuse of z_a_0 prevents parallelization
Loop is parallelizable
Loop carried reuse of z_a_0 prevents parallelization
Loop is parallelizable
109, Accelerator restriction: induction variable live-out from loop: z_i_0
Loop carried dependence of z_a_0 prevents parallelization
Loop carried backward dependence of z_a_0 prevents vectorization
Loop is parallelizable
Loop carried dependence of z_a_0 prevents parallelization
Loop is parallelizable
113, Loop is parallelizable
114, Loop is parallelizable
115, Loop is parallelizable
117, Loop is parallelizable
118, Loop is parallelizable
120, Loop is parallelizable
121, Loop is parallelizable
123, Loop carried dependence of h$a prevents parallelization
Loop carried backward dependence of h$a prevents vectorization
Loop is parallelizable
124, Loop carried dependence of e$a prevents parallelization
Loop carried backward dependence of e$a prevents vectorization
Loop is parallelizable
126, Loop carried dependence of h$a22 prevents parallelization
Loop carried backward dependence of h$a22 prevents vectorization
Loop is parallelizable
127, Loop carried dependence of e$a23 prevents parallelization
Loop carried backward dependence of e$a23 prevents vectorization
Loop is parallelizable
129, Loop carried dependence of h$a24 prevents parallelization
Loop carried backward dependence of h$a24 prevents vectorization
Loop is parallelizable
130, Loop carried dependence of e$a25 prevents parallelization
Loop carried backward dependence of e$a25 prevents vectorization
Loop is parallelizable
132, Loop is parallelizable