pgfortran compiler stop there

I wrote a fortran cuda code include a lot of device subroutines. When I tried to compile it, the compiler runs very slow and stop there forever. How can I deal with this? Thank you.


     module test 
      integer,device :: nlayer,nfield,nsource
      real,device   :: xfreq,mur(6),xloc(6),tolerance1,tolerance2,xpi,xmu0,xepsilon0,xomega,k_max
      complex,device   :: cer(6),ck(6),ci
      logical,device  :: pec_first,pec_last
      real xx(20),xy(20),xz(20)
      parameter(nlayer=6)
      parameter(nfield=1)
      parameter(nsource=5)
      parameter(pec_first=.false.)
      parameter(pec_last=.false.)
      parameter(xfreq=4.739336e+14)
      parameter(tolerance1=1.0e-5)
      parameter(tolerance2=1.0e-3)
      
      parameter(xpi=acos(-1.0))
      parameter(xepsilon0=8.854187817e-12)
      parameter(xmu0=4.0e-7*xpi)
      parameter(ci=(0,1))
      parameter(k_max=3.1410636E+07)
      parameter(xomega=2.0*xpi*xfreq)

           
     contains
     
     attributes(global) subroutine GreensFunction_CompEJ(Test_Green,&
     &	xx,xy,xz,xxs,xys,xzs,Nxx,Nyy,Nxz)

       implicit none
       
       
 

       real krhomax
       parameter(krhomax=1.0e32)
       

      
      integer N,N1
   
      real xL1,wrho
      real dk,dk1,kpmax,dk2
      

   

      integer Nxx,Nyy,Nxz,Nrect,i_temp
      logical Test_Green
      real xxs,xys,xzs
      real xx(20),xy(20),xz(20)

      integer i,j,k,counter,s
      real theta1, theta2
      real xr,xconst1,xconst2,xconst3,xrho,zzp,yzb,rt1,rt2
      complex ca(200),cb(200),cG(9),prim,tyu2,zyu,yyu
      complex ki,tyu,xyu,cons
      complex Gxxpm,Gxzpm,Gzxpm,Gxypm,Gzzpm,Gyypm
      complex Gyxpm,Gyzpm,Gzypm
      complex xxu,zzu,ki2,cons2,zyu2,prim2
      complex Gxx_minus,Gxy_minus,Gxz_minus,Gzy_minus
      complex Gyy_minus,Gzx_minus,Gzz_minus,Gyz_minus

!      open(93,file='GEJxx.dat',status='unknown')
!      open(94,file='GEJzx.dat',status='unknown')
!      open(95,file='GEJyy.dat',status='unknown')
!      open(97,file='GEJxz.dat',status='unknown')
!      open(98,file='GEJxy.dat',status='unknown')
!      open(99,file='GEJzz.dat',status='unknown')
!      open(91,file='GEJyx.dat',status='unknown')
!      open(92,file='GEJyz.dat',status='unknown')
!      open(90,file='GEJzy.dat',status='unknown')
      

      counter=0
 
      if (Test_Green .eqv. .true.) then
         do 201 i=1,Nxx
            xconst1=(xx(i)-xxs)**2
            do 202 j=1,Nyy
               xconst2=(xy(j)-xys)**2

               xrho=sqrt((xx(i)-xxs)**2+(xy(j)-xys)**2)

                if (xrho .eq. 0.0) then
                 theta1=0.0
                 theta2=0.0
               else
                  theta1=acos(((xx(i)-xxs))/xrho)
                  theta2=asin(((xy(j)-xys))/xrho)
               end if

               rt1=sin(theta2)*sin(theta2)
               rt2=cos(theta1)*cos(theta1)
               
               do 203 k=1,Nxz
                 
                  xconst3=(xz(k)-xzs)**2
                  xr=sqrt(xconst1+xconst2+xconst3)
                  
                  call LayerNo2(xzs,nsource,xloc,nlayer)
                  call LayerNo2(xz(k),nfield,xloc,nlayer)            
              
      cons=1.0/(xomega**2*xmu0*xepsilon0*cer(nsource))
      cons2=1.0/(xomega**2*xmu0*xepsilon0*cer(nfield))      
    
      
                  
                    s=-1
                    if (xz(k) .lt. xzs) s=1
              
                 if (nsource .eq. nfield) then
                    yzb=xconst2+(xz(k)-xzs)**2
                    zzp=abs(xz(k)-xzs)
                    ki=ck(nsource)
                    ki2=ck(nfield)                    
                    prim=exp(-ci*ki*xr)/(4*xpi*xr)*cons
                    prim2=exp(-ci*ki*xr)/(4*xpi*xr)*cons2

                    zyu=(3+3*ci*ki2*xr-(ki2*xr)**2)/xr**4
                    zyu2=-(1.0+ci*ki2*xr)/xr**2  
                    tyu=(3+3*ci*ki*xr-(ki*xr)**2)/xr**4
                    tyu2=-(1.0+ci*ki*xr)/xr**2 
                    xxu=xconst1*tyu+tyu2
                    yyu=xconst2*tyu+tyu2
                    xyu=(xx(i)-xxs)*(xy(j)-xys)*tyu          
                    zzu=xconst3*tyu+tyu2 
                    
                    Gxx_minus=(ki**2+xxu)*prim
                    Gxy_minus= xyu*prim
                    Gxz_minus=-s*(xx(i)-xxs)*zzp*prim*tyu

                    Gyy_minus=(ki**2+yyu)*prim
                    Gyz_minus=-s*(xy(j)-xys)*zzp*prim*tyu

                    Gzx_minus=-s*(xx(i)-xxs)*zzp*prim*tyu
		    Gzy_minus=-s*(xy(j)-xys)*zzp*prim*tyu
                    Gzz_minus=(ki**2+zzu)*prim2
                  else
                    Gxx_minus=0.0
                    Gyy_minus=0.0
                    Gxy_minus=0.0
                    Gxz_minus=0.0
                    Gzx_minus=0.0
                    Gzy_minus=0.0
                    Gzz_minus=0.0
                    Gyz_minus=0.0
                  end if
!                print*,'xzs,xz(k)',xzs,xz(k)
                call LayerNo2(xzs,nsource,xloc,nlayer)
!                print*,'nsource',nsource
                call LayerNo2(xz(k),nfield,xloc,nlayer)
               
                
                
                
                
                  kpmax=1.2*real(k_max)

      wrho=xrho

      if (xrho .lt. 3.0e6/xfreq) wrho=3.0e6/xfreq      

      dk=0.1*xpi/wrho
      dk2=xpi/wrho

     
      if (dk .gt. kpmax) then
           dk1=kpmax/2
           N1=1
      else
          N1=(1+int(kpmax/dk))
          
          dk1=0.5*kpmax/N1
      endif

       xL1=0.6*k_max

      N=200

      ca(1)=0.0
      if (N1 .eq. 1) then
         cb(1)=dk1+ci*xL1
         cb(2)=kpmax
         ca(2)=cb(1)
         do i_temp=2,N
           ca(i_temp+1)=cb(i_temp)
           cb(i_temp+1)=cb(i_temp)+dk
         end do
      else
         do i_temp=1,N1
            cb(i_temp)=i_temp*(dk1+ci*xl1/N1)
            ca(i_temp+1)=cb(i_temp)
            cb(N1+i_temp)=kpmax/2+ci*xL1+i_temp*(dk1-ci*xl1/N1)
	  ca(N1+i_temp+1)=cb(N1+i_temp)
         end do
         cb(2*N1+1)=real(cb(2*N1)+dk)
         do i_temp=2*N1,N
           ca(i_temp+1)=real(cb(i_temp))
           cb(i_temp+1)=real(cb(i_temp)+dk2)
         end do
      end if
      Nrect=N1+3
                

                  
                 
                 
                 
                 
                 
                 
                 
                 
                 
                 
                 
                 
                 
                 
                 
                 

                  call GreenOnePointEJ(xrho,xz(k),xzs,ca,cb,cG,Nrect)
                  counter=counter+1
   

      
       Gxxpm=(Gxx_minus+rt1*(cG(1)+cG(6))+rt2*(cG(2)+cG(7)))
       Gxzpm=(Gxz_minus+cG(3)*cos(theta1))
       Gyzpm=(Gyz_minus+cG(3)*sin(theta2))
       Gzxpm=(Gzx_minus-cG(4)*cos(theta1))
       Gzypm=(Gzy_minus-cG(4)*sin(theta2))
       Gxypm=(Gxy_minus+(cG(9)-cG(8))*sin(theta2)*cos(theta1))
       Gyxpm=Gxypm
       Gzzpm=(Gzz_minus+cG(5))
       Gyypm=(Gyy_minus+rt2*(cG(1)+cG(6))+rt1*(cG(2)+cG(7)))

!                  write(90,'(e16.7,1x,e16.7)')Gzypm/mur(nsource)
!                  write(93,'(e16.7,1x,e16.7)')Gxxpm/mur(nsource)
!                  write(94,'(e16.7,1x,e16.7)')Gzxpm/mur(nsource)
!                  write(95,'(e16.7,1x,e16.7)')Gyypm/mur(nsource)
!                  write(97,'(e16.7,1x,e16.7)')Gxzpm/mur(nsource)
!                  write(98,'(e16.7,1x,e16.7)')Gxypm/mur(nsource)
!                  write(99,'(e16.7,1x,e16.7)')Gzzpm/mur(nsource)
!                  write(91,'(e16.7,1x,e16.7)')Gyxpm/mur(nsource)
!                  write(92,'(e16.7,1x,e16.7)')Gyzpm/mur(nsource)

                  
203     	     continue
202     	  continue
201 	 continue
      end if
!      close(90)
!      close(91)
!      close(92)
!      close(93)
!      close(94)
!      close(95)
!      close(96)
!      close(97)
!      close(98)
!      close(99)
      
      return
      end


     !**************************Spectral*****************************************************************     
      attributes(device) subroutine Spectral(xdrho,ckrho,xz,xzs,ns,nf,&
     &     iitem,iitep,ivtem,ivtep,vitem,vitep,vvtem,vvtep,&
     &     iitmm,iitmp,ivtmm,ivtmp,vitmm,vitmp,vvtmm,vvtmp)


      implicit none
      integer nlayermax,nsegmax,nmax,nAJ,nEJ,nHJ,nIntPoint
      parameter(nlayermax=12,nsegmax=20000,nmax=2000)
      parameter(nAJ=6,nEJ=9,nHJ=8,nIntPoint=32)
      integer :: gejtest,ghjtest,gatest,nlayer,nfield,nsource,intno,xaxis,xzdir, gf_form
      real   :: xfreq,xxsp,xysp,xzsp,xxfp,xyfp,xzfp,intstart,intend,mur(12),xsig(12),xloc(12),tolerance1,tolerance2,xpi,xmu0,xepsilon0,xomega,theta,k_max
      complex   :: cer(12),ck(12),ci
      logical   :: pec_first,pec_last
      
      

      integer ns,nf,sign2
      real xz,xzs,xzcall,sign,xdrho
      complex ckrho,ckrho2,kz_s,kz_f
      complex iitem,iitep,ivtem,ivtep,vitem,vitep,vvtem,vvtep
      complex iitmm,iitmp,ivtmm,ivtmp,vitmm,vitmp,vvtmm,vvtmp
      
      complex cR_leftTE,cR_leftTM,cR_rightTE,cR_rightTM
      complex cRTTMV,cRTTMI,cRTTEV,cRTTEI
      complex cLTTEV,cLTTMV,cLTTMI,cLTTEI
      complex cRRTE,cRRTM,cD0TE,cD0TM,cfphi0
      
      complex tev,tei,tmv,tmi
      complex fn0A,fn1A,fn2A,fn3A,fn4A,fn5A
      complex ft0, ft1,ft2,ft3,ft4,ft5,pt0

      complex rte1,rte2,rte3,rte4
      complex rtm1,rtm2,rtm3,rtm4
      complex ztes, ztms,ztef, ztmf
      
      complex kzsf, epssf, musf
      
      ckrho2=ckrho**2
      kz_s=csqrt(ck(ns)**2-ckrho2)
      call process_ckz(kz_s)
      
      if (ns .eq. nf) then
          kz_f=kz_s
      else
         kz_f=csqrt(ck(nf)**2-ckrho2)
         call process_ckz(kz_f)
      end if

      
      ztmf=kz_f/(xomega*xepsilon0*cer(nf))
      ztms=kz_s/(xomega*xepsilon0*cer(ns))
      ztef=xomega*xmu0*mur(nf)/kz_f
      ztes=xomega*xmu0*mur(ns)/kz_s
      
      kzsf=kz_s/kz_f
      epssf=cer(ns)/cer(nf)
      musf=mur(ns)/mur(nf)

      
       ckrho2=ckrho**2
      if (nf .gt. ns) then 
         xzcall=xloc(ns)
      else if (nf .lt. ns) then
         xzcall=xloc(ns-1)
      else
         xzcall=xz
      end if
      

      sign=-1.0
      if (xz .gt.xzs) sign=1.0
      sign2=1
      if (xzcall .lt. xzs) sign2=-1
      
      pt0=exp(-ckrho*abs(xz-xzs))
      
      call fun0A(kz_s,xz,xzs,ci,fn0A)
      ft0=fn0A
      call fun1A(kz_s,xzcall,xzs,sign2,ci,fn1A)
      ft1=fn1A
      call fun2A(kz_s,xzcall,xzs,ns,sign2,xloc,ci,fn2A)
      ft2=fn2A
      call fun3A(kz_s,xzcall,xzs,ns,sign2,xloc,ci,fn3A)
      ft3=fn3A
      call fun4A(kz_s,xzcall,xzs,ns,sign2,xloc,ci,fn4A)
      ft4=fn4A
      call fun5A(kz_s,xzcall,xzs,ns,sign2,xloc,ci,fn5A)
      ft5=fn5A
         call R_right(ckrho2,xz,ns,nf,&
     &        cR_rightTE,cR_rightTM,cRTTEV,cRTTMV,cRTTEI,cRTTMI)
         call R_left(ckrho2,xz,ns,nf,&
     &        cR_leftTE,cR_leftTM,cLTTEV,cLTTMV,cLTTEI,cLTTMI)


         if (ns .eq. 1) then
!============================================================================
!The source is in the first layer
!============================================================================
           ft2=0.0
           ft4=0.0
           ft5=0.0
           cfphi0=0.0
           cR_leftTE=0.0
           cR_leftTM=0.0
           
         else if (ns .eq. nlayer) then
!============================================================================
!The source is in the last layer
!============================================================================        
           ft3=0.0
           ft4=0.0
           ft5=0.0
           cfphi0=0.0
           cR_rightTE=0.0
           cR_rightTM=0.0

         else
!============================================================================
!The source is neither in the first layer nor in the last layer
!============================================================================
           cfphi0=exp(-ci*2*kz_s*(xloc(ns)-xloc(ns-1)))
         end if



         cRRTE=cR_rightTE*cR_leftTE
         cD0TE=1-cRRTE*cfphi0

         cRRTM=cR_rightTM*cR_leftTM
         cD0TM=1-cRRTM*cfphi0

         rtm1=cR_leftTM*ft2/cD0TM         
         rtm2=cR_rightTM*ft3/cD0TM
         rtm3=cRRTM*ft4/cD0TM
         rtm4=cRRTM*ft5/cD0TM
         rte1=cR_leftTE*ft2/cD0TE         
         rte2=cR_rightTE*ft3/cD0TE
         rte3=cRRTE*ft4/cD0TE
         rte4=cRRTE*ft5/cD0TE

         vitep=(rte1+rte2)*ztes/2
         vitmp=(rtm1+rtm2)*ztms/2
         vitem=(rte3+rte4)*ztes/2
         vitmm=(rtm3+rtm4)*ztms/2

         vvtep=(rte1-rte2)/2
         vvtmp=(rtm1-rtm2)/2
         vvtem=(rte3-rte4)/2
         vvtmm=(rtm3-rtm4)/2

         iitep=-(rte1-rte2)/2
         iitmp=-(rtm1-rtm2)/2
         iitem=(rte3-rte4)/2
         iitmm=(rtm3-rtm4)/2

         ivtep=(-rte1-rte2)/ztef/2
         ivtmp=(-rtm1-rtm2)/ztmf/2
         ivtem=(rte3+rte4)/ztef/2
         ivtmm=(rtm3+rtm4)/ztmf/2

!========================================================================
! the source and observation points are not in the same layer
!========================================================================
      if (ns .ne. nf) then
      if (ns .lt. nf) then
            tev=cRTTEV
            tei=cRTTEI
            tmv=cRTTMV
            tmi=cRTTMI

            ivtem=(tei*(ft1-rte3+rte4))/ztef/2
            ivtmm=(tmi*(ft1-rtm3+rtm4))/ztmf/2
            ivtep=-tei*(rte1-rte2)/ztef/2
            ivtmp=-tmi*(rtm1-rtm2)/ztmf/2
           
            vitep=tev*(rte1+rte2)*ztes/2
            vitmp=tmv*(rtm1+rtm2)*ztms/2

            vitem=(tev*(ft1+rte3+rte4))*ztes/2
            vitmm=(tmv*(ft1+rtm3+rtm4))*ztms/2
            
            iitem=-tei*(ft1+rte3+rte4)*ztes/ztef/2
            iitmm=-tmi*(ft1+rtm3+rtm4)*ztms/ztmf/2
            iitep=-tei*(rte1+rte2)*ztes/ztef/2
            iitmp=-tmi*(rtm1+rtm2)*ztms/ztmf/2

            vvtem=(tev*(-ft1+rte3-rte4))/2
            vvtmm=(tmv*(-ft1+rtm3-rtm4))/2
            vvtep=tev*(rte1-rte2)/2
            vvtmp=tmv*(rtm1-rtm2)/2


         else 
            tev=CLTTEV
            tei=CLTTEI
            tmv=CLTTMV
            tmi=CLTTMI

            vvtep=tev*(rte1-rte2)/2
            vvtmp=tmv*(rtm1-rtm2)/2
            vvtem=(tev*(ft1+rte3-rte4))/2
            vvtmm=(tmv*(ft1+rtm3-rtm4))/2

            ivtem=(tei*(ft1+rte3-rte4))/ztef/2
            ivtmm=(tmi*(ft1+rtm3-rtm4))/ztmf/2
            ivtep=tei*(rte1-rte2)/ztef/2
            ivtmp=tmi*(rtm1-rtm2)/ztmf/2


            vitem=(tev*(ft1+rte3+rte4))*ztes/2
            vitmm=(tmv*(ft1+rtm3+rtm4))*ztms/2
            vitep=tev*(rte1+rte2)*ztes/2
            vitmp=tmv*(rtm1+rtm2)*ztms/2
           
            iitem=tei*(ft1+rte3+rte4)*ztes/ztef/2
            iitmm=tmi*(ft1+rtm3+rtm4)*ztms/ztmf/2
            iitep=tei*(rte1+rte2)*ztes/ztef/2
            iitmp=tmi*(rtm1+rtm2)*ztms/ztmf/2

         end if
      end if

      return
      end

!*************************************************
!     LayerNo2
!*************************************************

      attributes(device) subroutine LayerNo2(pt,N,xloc,nlayer)

      implicit none

      integer N,index,nlayer
      real pt,xloc(12)
      if (pt .lt. xloc(1)) then
         N=1
      else if (pt .ge. xloc(nlayer-1)) then
         N=nlayer
      else
        do 228 index=2, nlayer-1
          if (pt .ge. xloc(index-1) .and. pt .lt. xloc(index)) then
            N=index
          end if
228     continue
      end if
      return
      end


!*************************************************
!     32 PO GAUSS QUADRATURE
!*************************************************
      attributes(device) SUBROUTINE GQ32A (FCT,Y)
      COMPLEX FCT,Y
      DIMENSION FCT(32)
      Y=.35093050047350483E-2*(FCT(32)+FCT(1))
      Y=Y+.8137197365452835E-2*(FCT(31)+FCT(2))
      Y=Y+.12696032654631030E-1*(FCT(30)+FCT(3))
      Y=Y+.17136931456510717E-1*(FCT(29)+FCT(4))
      Y=Y+.21417949011113340E-1*(FCT(28)+FCT(5))
      Y=Y+.25499029631188088E-1*(FCT(27)+FCT(6))
      Y=Y+.29342046739267774E-1*(FCT(26)+FCT(7))
      Y=Y+.32911111388180923E-1*(FCT(25)+FCT(8))
      Y=Y+.36172897054424253E-1*(FCT(24)+FCT(9))
      Y=Y+.39096947893535153E-1*(FCT(23)+FCT(10))
      Y=Y+.41655962113473378E-1*(FCT(22)+FCT(11))
      Y=Y+.43826046502201906E-1*(FCT(21)+FCT(12))
      Y=Y+.45586939347881942E-1*(FCT(20)+FCT(13))
      Y=Y+.46922199540402283E-1*(FCT(19)+FCT(14))
      Y=Y+.47819360039637430E-1*(FCT(18)+FCT(15))
      Y=Y+.48270044257363900E-1*(FCT(17)+FCT(16))
      RETURN
      END


!========================================================================
! the direct term
!========================================================================

      attributes(device) subroutine fun0A(kz_s,xz,xzs,ci,fn0A)
      implicit none
 

      real xz,xzs
      complex kz_s,fn0A,ci

      fn0A=exp(-ci*kz_s*abs(xz-xzs))
      
      return
      end

!========================================================================
! the direct term
!========================================================================
      attributes(device) subroutine fun1A(kz_s,xzcall,xzs,sign2,ci,fn1A)
      implicit none


      real xzcall,xzs
      complex kz_s,fn1A,ci
      integer sign2      

      fn1A=exp(-ci*kz_s*sign2*(xzcall-xzs))
      
      return
      end

!========================================================================
! function the reflection from the left(lower) interface
!========================================================================
      attributes(device) subroutine fun2A(kz_s,xzcall,xzs,ns,sign2,xloc,ci,fn2A)

      implicit none

      integer ns,sign2
      real xzcall,xzs,xloc(12)
      
      complex kz_s,fn2A,ci
      fn2A=exp(-ci*kz_s*abs((xzcall+xzs)-2.0*xloc(ns-1)))
      return
      end

!========================================================================
! function the reflection from the right(upper) interface
!========================================================================
      attributes(device) subroutine fun3A(kz_s,xzcall,xzs,ns,sign2,xloc,ci,fn3A)

      implicit none


      integer ns,sign2
      real xzcall, xzs,xloc(12)
      complex kz_s,fn3A,ci


      fn3A=exp(-ci*kz_s*(2.0*xloc(ns)-(xzcall+xzs)))

      return
      end

!========================================================================
! function the reflection from the right interface--the third term
!========================================================================
      attributes(device) subroutine fun4A(kz_s,xzcall,xzs,ns,sign2,xloc,ci,fn4A)

      implicit none
  
      integer ns,sign2
      real xzcall, xzs,xloc(12)
      complex kz_s,fn4A,cphi,ci

      cphi=kz_s*(xloc(ns)-xloc(ns-1))
      fn4A=exp(-2*ci*cphi+ci*kz_s*(xzcall-xzs))
      return
      end
!========================================================================
! function the reflection from the right interface--the third term
!========================================================================
      attributes(device) subroutine fun5A(kz_s,xzcall,xzs,ns,sign2,xloc,ci,fn5A)

      implicit none

      integer ns,sign2
      real xzcall, xzs,xloc(12)
      complex kz_s,fn5A, cphi,ci

      cphi=kz_s*(xloc(ns)-xloc(ns-1))
      fn5A=exp(-2*ci*cphi-ci*kz_s*(xzcall-xzs))
      return
      end



!c******************************************************************
!c calculate Green's function  for a point (xx, xy, xz),
! The source point is located at (xxs, xys, xzs)
!*******************************************************************
      attributes(device) subroutine GreenOnePointEJ(xrho,xzp,xzs,ca,cb,cG,Nrct)
      implicit none
      
      

     





      integer i,n, Nf,qN,Nrct
      real xerr
      real xrho,xzp,xzs
      complex ca(200),cb(200),cG(9),cr(9)


      qN=200-2
      Nf=9
      do 207 i=1,Nf
         cG(i)=0
207 	continue
!     print*,'Nrct=',Nrct
      do 208 i=1,Nrct
       
         call lineinteEJ(Nf,xrho,xzp,xzs,ca(i),cb(i),cr)
         do 209 n=1,Nf
            cG(n)=cG(n)+cr(n)
209	 continue
208 	continue
      xerr=1.0
      i=Nrct+1
      DO WHILE (xerr .eq. 1.0 .and. i .le. qN)
   
400   call lineinteEJ(Nf,xrho,xzp,xzs,ca(i),cb(i),cr)
         xerr=0.0
         i=i+1
         cG(1)=cG(1)+cr(1)
         if (cabs(cr(1)) .gt. tolerance1*cabs(cG(1))) xerr=1.0
         do 210 n=2,Nf
            cG(n)=cG(n)+cr(n)
            if (cabs(cr(n)) .gt. tolerance1*cabs(cG(n))) then
                 xerr=1.0
            end if
210	continue
       
      END DO
      return
      end



!***********************************************************************
      attributes(device) subroutine lineinteEJ(Nf,xtrho,xzp,xzs,caK,cbK,cr1)
      implicit none
       


       
       

      real XU(32)
      integer Nf
      real xzp,xzs,xtrho
      complex caK,cbK,cr1(9)
      integer m,j,kk,Nseg
      real xerr,xerr1,xerr3,xerr5,xerr7,xerr9,xerr11,xinv2pi,wmu
      complex cr0(9),cf(9,32), ctf(32)
      complex wer4,wer0,zxcons,zzcons,xzcons
      complex ckrho,ckrho2,ctr,clen,cnewa,wer1,wer2,wer3,ert1,ert2
      complex*16 ctx,cbj(-2:2),ch(-2:2),cjp(-2:2),chp(-2:2)
      complex iiitem,iiitep,iivtem,iivtep,ivitem,ivitep,ivvtem,ivvtep
      complex iiitmm,iiitmp,iivtmm,iivtmp,ivitmm,ivitmp,ivvtmm,ivvtmp

      wmu=1/(xomega*xmu0)
      zxcons=1/(xomega*xepsilon0*cer(nfield))
      xzcons=-1/(xomega*xepsilon0*cer(nsource))
      XU(1)=0.136806908E-02
       XU(2)=0.719424423E-02
       XU(3)=0.176188722E-01
       XU(4)=0.325469620E-01
       XU(5)=0.518394221E-01
       XU(6)=0.753161931E-01
       XU(7)=0.102758102E+00
       XU(8)=0.133908941E+00
       XU(9)=0.168477867E+00
       XU(10)=0.206142121E+00
       XU(11)=0.246550046E+00
       XU(12)=0.289324362E+00
       XU(13)=0.334065699E+00
       XU(14)=0.380356319E+00
       XU(15)=0.427764019E+00
       XU(16)=0.475846167E+00
       XU(17)=0.524153833E+00
       XU(18)=0.572235981E+00
       XU(19)=0.619643681E+00
       XU(20)=0.665934301E+00
       XU(21)=0.710675638E+00
       XU(22)=0.753449954E+00
       XU(23)=0.793857879E+00
       XU(24)=0.831522133E+00
       XU(25)=0.866091059E+00
       XU(26)=0.897241898E+00
       XU(27)=0.924683807E+00
       XU(28)=0.948160578E+00
       XU(29)=0.967453038E+00
       XU(30)=0.982381128E+00
       XU(31)=0.992805756E+00
       XU(32)=0.998631931E+00
       xinv2pi=1.0/(2.0*xpi)
      clen=cbK-caK

      do 211 j=1,32
         ckrho=clen*XU(j)+caK
         ckrho2=ckrho**2
      zzcons=ckrho2/(xomega**2*xepsilon0**2*cer(nfield)*cer(nsource))/ci
         call Spectral(xtrho,ckrho,xzp,xzs,nsource,nfield, &
     &     iiitem,iiitep,iivtem,iivtep,ivitem,ivitep,ivvtem,ivvtep,&
     &     iiitmm,iiitmp,iivtmm,iivtmp,ivitmm,ivitmp,ivvtmm,ivvtmp)

         ctx = ckrho*xtrho
         call bslsdr(2,ctx,cbj,ch,cjp,chp,2)

         if ( xtrho .eq. 0.0) then
            wer0=(cbj(0)*ckrho-ckrho)*xinv2pi
            wer2=cbj(1)*ckrho2*xinv2pi
            wer3=cbj(0)*ckrho*xinv2pi
            wer1=-ci*(cbj(0)*ckrho-ckrho/2)*xinv2pi
            wer4=-ci*ckrho*xinv2pi/2
         else
           wer0=(cbj(0)*ckrho-2*cbj(1)/xtrho)*xinv2pi
           wer2=cbj(1)*ckrho2*xinv2pi
           wer3=cbj(0)*ckrho*xinv2pi
           wer1=-ci*(cbj(0)*ckrho-cbj(1)/xtrho)*xinv2pi
           wer4=-ci*cbj(1)*xinv2pi/xtrho
         
         end if

         cf(1,j) = (ivitep+ivitem)*wer1*wmu
         cf(2,j) = (ivitmp+ivitmm)*wer1*wmu
         cf(3,j) = (ivvtmm+ivvtmp)*wer2*wmu*xzcons
         cf(4,j) = (iiitmm+iiitmp)*wer2*wmu*zxcons
         cf(5,j) = (iivtmm+iivtmp)*wer3*wmu*zzcons
         cf(6,j) = (ivitmp+ivitmm)*wer4*wmu
         cf(7,j) = (ivitep+ivitem)*wer4*wmu
         cf(8,j) = -ci*(ivitep+ivitem)*wer0*wmu
         cf(9,j) = -ci*(ivitmp+ivitmm)*wer0*wmu

211     continue
      do 212 m=1,Nf
         do j=1,32
            ctf(j)=cf(m,j)
         end do
         call GQ32A(ctf,ctr)
         cr0(m)=ctr*clen
212    continue
 
      Nseg=1
 99   Nseg=Nseg*2
      clen=(cbK-caK)/float(Nseg)
      do m=1,Nf
         cr1(m)=0
      end do
      do kk=1,Nseg
         cnewa=caK+float(kk-1)*clen
      do  j=1,32
            ckrho=clen*XU(j)+cnewa
            ckrho2=ckrho**2
      zzcons=ckrho2/(xomega**2*xepsilon0**2*cer(nfield)*cer(nsource))/ci
         call Spectral(xtrho,ckrho,xzp,xzs,nsource,nfield, &
     &     iiitem,iiitep,iivtem,iivtep,ivitem,ivitep,ivvtem,ivvtep,&
     &     iiitmm,iiitmp,iivtmm,iivtmp,ivitmm,ivitmp,ivvtmm,ivvtmp)

         ctx = ckrho*xtrho
         call bslsdr(2,ctx,cbj,ch,cjp,chp,2)

         if ( xtrho .eq. 0.0) then
            wer0=(cbj(0)*ckrho-ckrho)*xinv2pi
            wer2=cbj(1)*ckrho2*xinv2pi
            wer3=cbj(0)*ckrho*xinv2pi
            wer1=-ci*(cbj(0)*ckrho-ckrho/2)*xinv2pi
            wer4=-ci*ckrho*xinv2pi/2
         else
           wer0=(cbj(0)*ckrho-2*cbj(1)/xtrho)*xinv2pi
           wer2=cbj(1)*ckrho2*xinv2pi
           wer3=cbj(0)*ckrho*xinv2pi
           wer1=-ci*(cbj(0)*ckrho-cbj(1)/xtrho)*xinv2pi
           wer4=-ci*cbj(1)*xinv2pi/xtrho
         
         end if

         cf(1,j) = (ivitep+ivitem)*wer1*wmu
         cf(2,j) = (ivitmp+ivitmm)*wer1*wmu
         cf(3,j) = (ivvtmm+ivvtmp)*wer2*wmu*xzcons
         cf(4,j) = (iiitmm+iiitmp)*wer2*wmu*zxcons
         cf(5,j) = (iivtmm+iivtmp)*wer3*wmu*zzcons
         cf(6,j) = (ivitmp+ivitmm)*wer4*wmu
         cf(7,j) = (ivitep+ivitem)*wer4*wmu
         cf(8,j) = -ci*(ivitep+ivitem)*wer0*wmu
         cf(9,j) = -ci*(ivitmp+ivitmm)*wer0*wmu


       end do

         do m=1,Nf
            do j=1,32
               ctf(j)=cf(m,j)
            end do
            call GQ32A(ctf,ctr)
            cr1(m)=cr1(m)+ctr*clen
         end do
      end do


      xerr1=0.0
      xerr3=0.0
      xerr5=0.0
      xerr7=0.0
      xerr9=0.0


      if (cabs(cr1(1)-cr0(1)) .gt. tolerance2*cabs(cr1(1))) xerr1=1.0
      if (cabs(cr1(3)-cr0(3)) .gt. tolerance2*cabs(cr1(3))) xerr3=1.0
      if (cabs(cr1(5)-cr0(5)) .gt. tolerance2*cabs(cr1(5))) xerr5=1.0
      if (cabs(cr1(7)-cr0(7)) .gt. tolerance2*cabs(cr1(7))) xerr7=1.0
      if (cabs(cr1(9)-cr0(9)) .gt. tolerance2*cabs(cr1(9))) xerr9=1.0

      xerr=max(xerr1,xerr3,xerr5,xerr7,xerr9)

      DO WHILE (xerr.eq. 1.0 .and. Nseg .le. 513)
         do  m=1,Nf
            cr0(m)=cr1(m)
         end do
         
         Nseg=Nseg*2
      clen=(cbK-caK)/float(Nseg)
      do m=1,Nf
         cr1(m)=0
      end do
      do kk=1,Nseg
         cnewa=caK+float(kk-1)*clen
      do  j=1,32
            ckrho=clen*XU(j)+cnewa
            ckrho2=ckrho**2
      zzcons=ckrho2/(xomega**2*xepsilon0**2*cer(nfield)*cer(nsource))/ci
         call Spectral(xtrho,ckrho,xzp,xzs,nsource,nfield, &
     &     iiitem,iiitep,iivtem,iivtep,ivitem,ivitep,ivvtem,ivvtep,&
     &     iiitmm,iiitmp,iivtmm,iivtmp,ivitmm,ivitmp,ivvtmm,ivvtmp)

         ctx = ckrho*xtrho
         call bslsdr(2,ctx,cbj,ch,cjp,chp,2)

         if ( xtrho .eq. 0.0) then
            wer0=(cbj(0)*ckrho-ckrho)*xinv2pi
            wer2=cbj(1)*ckrho2*xinv2pi
            wer3=cbj(0)*ckrho*xinv2pi
            wer1=-ci*(cbj(0)*ckrho-ckrho/2)*xinv2pi
            wer4=-ci*ckrho*xinv2pi/2
         else
           wer0=(cbj(0)*ckrho-2*cbj(1)/xtrho)*xinv2pi
           wer2=cbj(1)*ckrho2*xinv2pi
           wer3=cbj(0)*ckrho*xinv2pi
           wer1=-ci*(cbj(0)*ckrho-cbj(1)/xtrho)*xinv2pi
           wer4=-ci*cbj(1)*xinv2pi/xtrho
         
         end if

         cf(1,j) = (ivitep+ivitem)*wer1*wmu
         cf(2,j) = (ivitmp+ivitmm)*wer1*wmu
         cf(3,j) = (ivvtmm+ivvtmp)*wer2*wmu*xzcons
         cf(4,j) = (iiitmm+iiitmp)*wer2*wmu*zxcons
         cf(5,j) = (iivtmm+iivtmp)*wer3*wmu*zzcons
         cf(6,j) = (ivitmp+ivitmm)*wer4*wmu
         cf(7,j) = (ivitep+ivitem)*wer4*wmu
         cf(8,j) = -ci*(ivitep+ivitem)*wer0*wmu
         cf(9,j) = -ci*(ivitmp+ivitmm)*wer0*wmu


       end do

         do m=1,Nf
            do j=1,32
               ctf(j)=cf(m,j)
            end do
            call GQ32A(ctf,ctr)
            cr1(m)=cr1(m)+ctr*clen
         end do
      end do


      xerr1=0.0
      xerr3=0.0
      xerr5=0.0
      xerr7=0.0
      xerr9=0.0


      if (cabs(cr1(1)-cr0(1)) .gt. tolerance2*cabs(cr1(1))) xerr1=1.0
      if (cabs(cr1(3)-cr0(3)) .gt. tolerance2*cabs(cr1(3))) xerr3=1.0
      if (cabs(cr1(5)-cr0(5)) .gt. tolerance2*cabs(cr1(5))) xerr5=1.0
      if (cabs(cr1(7)-cr0(7)) .gt. tolerance2*cabs(cr1(7))) xerr7=1.0
      if (cabs(cr1(9)-cr0(9)) .gt. tolerance2*cabs(cr1(9))) xerr9=1.0

      xerr=max(xerr1,xerr3,xerr5,xerr7,xerr9)
         
         
         
         
         
      END DO
      return
      end








     attributes(device) subroutine R_left(ckrho2,xz,ns,nf,cR_leftTE,cR_leftTM,&
     &     cLTTEV,cLTTMV,cLTTEI,cLTTMI)
      implicit none
      
      integer nlayermax,nsegmax,nmax,nAJ,nEJ,nHJ,nIntPoint
      parameter(nlayermax=12,nsegmax=20000,nmax=2000)
      parameter(nAJ=6,nEJ=9,nHJ=8,nIntPoint=32)
      integer :: gejtest,ghjtest,gatest,nlayer,nfield,nsource,intno,xaxis,xzdir, gf_form
      real   :: xfreq,xxsp,xysp,xzsp,xxfp,xyfp,xzfp,intstart,intend,mur(12),xsig(12),xloc(12),tolerance1,tolerance2,xpi,xmu0,xepsilon0,xomega,theta,k_max
      complex   :: cer(12),ck(12),ci
      logical   :: pec_first,pec_last
      
      
      
      
      integer n,ns,nf
      real xz
      complex ckrho2,cR_leftTE,cR_leftTM
      complex cZTE,cZTM,ctant,cphi
      complex ckz(1:nlayermax),cRTE(1:nlayermax-1),cRTM(1:nlayermax-1)
      complex cpsi(1:nlayermax-1)
      complex cLTTEV,cLTTMI,cLTTMV,cLTTEI
      complex const1,const2

      cLTTEV=1.0
      cLTTMI=1.0
      cLTTEI=1.0
      cLTTMV=1.0
      cR_leftTE=0.0
      cR_leftTM=0.0
      cRTE(ns)=0.0
      cRTM(ns)=0.0

      if (pec_first .eqv. .true.) then
         if (ns .ne. 2) then
            ckz(2)=csqrt(ck(2)**2-ckrho2)
            call process_ckz(ckz(2))
            cpsi(2)=exp(-ci*ckz(2)*(xloc(2)-xloc(1)))
            cphi=cpsi(2)**2
            ctant=(1-cphi)/(1+cphi)
            cZTE=mur(2)*ctant/ckz(2)
            cZTM=ckz(2)/cer(2)*ctant
            cRTE(2)=-1
            cRTM(2)=-1
            do 250 n=3,ns
               ckz(n)=csqrt(ck(n)**2-ckrho2)
               call process_ckz(ckz(n))
               call InputImpedance(ckz(n),n,cZTE,cZTM,&
     &			CRTE(n),cRTM(n),cpsi(n),mur,cer,nlayer,xloc,ci)

250         continue
            cR_leftTE=cRTE(ns)
            cR_leftTM=cRTM(ns)
         else
            CR_leftTE=-1
            cR_leftTM=-1
         end if
      else
         ckz(1)=csqrt(ck(1)**2-ckrho2)
         call process_ckz(ckz(1))
         cZTE=mur(1)/ckz(1)
         cZTM=ckz(1)/(cer(1))

         do 251 n=2,ns
            ckz(n)=csqrt(ck(n)**2-ckrho2)
            call process_ckz(ckz(n))

            call InputImpedance(ckz(n),n,cZTE,cZTM,&
     &			CRTE(n),cRTM(n),cpsi(n),mur,cer,nlayer,xloc,ci)
251      continue
         cR_leftTE=cRTE(ns)
         cR_leftTM=cRTM(ns)
      end if

      if (nf .lt. ns) then
         const1=exp(-ci*ckz(nf)*(xloc(nf)-xz))
         if (nf .eq. 1) then
            cLTTEV=const1
            cLTTMI=const1
            cLTTEI=const1
            cLTTMV=const1
         else
            const2=exp(-ci*2*ckz(nf)*(xz-xloc(nf-1)))
            cLTTEV=const1*(1+cRTE(nf)*const2)/(1+cRTE(nf)*cpsi(nf)**2)
            cLTTMV=const1*(1+cRTM(nf)*const2)/(1+cRTM(nf)*cpsi(nf)**2)
            cLTTMI=const1*(1-cRTM(nf)*const2)/(1+cRTM(nf)*cpsi(nf)**2)
            cLTTEI=const1*(1-cRTE(nf)*const2)/(1+cRTE(nf)*cpsi(nf)**2)
         end if

         do 252 n=nf+1,ns-1
            cLTTEV=cLTTEV*(1+cRTE(n))*cpsi(n)/(1+cRTE(n)*cpsi(n)**2)
            cLTTMV=cLTTMV*(1+cRTM(n))*cpsi(n)/(1+cRTM(n)*cpsi(n)**2)
            cLTTMI=cLTTMI*(1+cRTM(n))*cpsi(n)/(1+cRTM(n)*cpsi(n)**2)
            cLTTEI=cLTTEI*(1+cRTE(n))*cpsi(n)/(1+cRTE(n)*cpsi(n)**2)
252      continue 
      end if

      return
      end 





!*****************************************************************
!    ------------
!
!          x     field
!    
!    ------------
!
!
!
!    ------------
!
!         x     source
!    ------------
!
!
!

      attributes(device) subroutine R_right(ckrho2,xz,ns,nf,cR_rightTE,cR_rightTM,&
     &     cRTTEV,cRTTMV,cRTTEI,cRTTMI)
      implicit none
      
      
      
      integer nlayermax,nsegmax,nmax,nAJ,nEJ,nHJ,nIntPoint
      parameter(nlayermax=12,nsegmax=20000,nmax=2000)
      parameter(nAJ=6,nEJ=9,nHJ=8,nIntPoint=32)
      integer :: gejtest,ghjtest,gatest,nlayer,nfield,nsource,intno,xaxis,xzdir, gf_form
      real   :: xfreq,xxsp,xysp,xzsp,xxfp,xyfp,xzfp,intstart,intend,mur(12),xsig(12),xloc(12),tolerance1,tolerance2,xpi,xmu0,xepsilon0,xomega,theta,k_max
      complex   :: cer(12),ck(12),ci
      logical   :: pec_first,pec_last
      integer n,ns,nf,nb
      real xz
      complex ckrho2,cR_rightTE,cR_rightTM
      complex cZTE,cZTM,ctant,cphi
      complex ckz(1:nlayermax),cRTE(1:nlayermax-1),cRTM(1:nlayermax-1)
      complex cpsi(1:nlayermax-1)
      complex cRTTEV,cRTTMI,cRTTMV,cRTTEI
      complex const1,const2

      cRTTEV=1.0
      cRTTMI=1.0
      cRTTEI=1.0
      cRTTMV=1.0
      cR_rightTE=0.0
      cR_rightTM=0.0
      nb=nlayer-1
      cRTE(ns)=0.0
      cRTM(ns)=0.0

      if (pec_last .eqv. .true.) then
         if (ns .ne. nb) then
            ckz(nb)=csqrt(ck(nb)**2-ckrho2)
            call process_ckz(ckz(nb))
            cpsi(nb)=exp(-ci*ckz(nb)*(xloc(nb)-xloc(nb-1)))
            cphi=cpsi(nb)**2
            ctant=(1-cphi)/(1+cphi)
            cZTE=mur(nb)*ctant/ckz(nb)
            cZTM=ckz(nb)/cer(nb)*ctant
            cRTE(nb)=-1
            cRTM(nb)=-1
            do 256 n=nb-1,ns,-1
               ckz(n)=csqrt(ck(n)**2-ckrho2)
               call process_ckz(ckz(n))

               call InputImpedance(ckz(n),n,cZTE,cZTM,&
     &			CRTE(n),cRTM(n),cpsi(n),mur,cer,nlayer,xloc,ci)
256         continue
            cR_rightTE=cRTE(ns)
            cR_rightTM=cRTM(ns)
         else
            CR_rightTE=-1
            cR_rightTM=-1
         end if
      else
         ckz(nlayer)=csqrt(ck(nlayer)**2-ckrho2)
         call process_ckz(ckz(nlayer))
         cZTE=mur(nlayer)/ckz(nlayer)
         cZTM=ckz(nlayer)/(cer(nlayer))
         do 257 n=nlayer-1,ns,-1
            ckz(n)=csqrt(ck(n)**2-ckrho2)
            call process_ckz(ckz(n))

            call InputImpedance(ckz(n),n,cZTE,cZTM,&
     &			CRTE(n),cRTM(n),cpsi(n),mur,cer,nlayer,xloc,ci)
257      continue
         cR_rightTE=cRTE(ns)
         cR_rightTM=cRTM(ns)
      end if
      if (nf .gt. ns) then
         const1=exp(-ci*ckz(nf)*(xz-xloc(nf-1)))
         if (nf .eq. nlayer) then
            cRTTEV=const1
            cRTTMI=const1
            cRTTEI=const1
            cRTTMV=const1
         else
            const2=exp(-ci*2*ckz(nf)*(xloc(nf)-xz))
            cRTTEV=const1*(1+cRTE(nf)*const2)/(1+cRTE(nf)*cpsi(nf)**2)
            cRTTMV=const1*(1+cRTM(nf)*const2)/(1+cRTM(nf)*cpsi(nf)**2)
            cRTTMI=const1*(1-cRTM(nf)*const2)/(1+cRTM(nf)*cpsi(nf)**2)
            cRTTEI=const1*(1-cRTE(nf)*const2)/(1+cRTE(nf)*cpsi(nf)**2)
         end if
         do 258 n=ns+1,nf-1
            cRTTEV=cRTTEV*(1+cRTE(n))*cpsi(n)/(1+cRTE(n)*cpsi(n)**2)
            cRTTMV=cRTTMV*(1+cRTM(n))*cpsi(n)/(1+cRTM(n)*cpsi(n)**2)
            cRTTMI=cRTTMI*(1+cRTM(n))*cpsi(n)/(1+cRTM(n)*cpsi(n)**2)
            cRTTEI=cRTTEI*(1+cRTE(n))*cpsi(n)/(1+cRTE(n)*cpsi(n)**2)
258       continue
      end if
      return
      end 


!***************************************************************************
      attributes(device) subroutine InputImpedance(ckz,n,cZTE,cZTM,cRTE,cRTM,cpsi,mur,cer,nlayer,xloc,ci)
      implicit none
      real mur(12),xloc(12)
      complex cer(12),ci
      integer n,nlayer
      complex cphi,ckz,ctant,cZTE,cZTM,cZ0TE,cZ0TM
      complex cRTE,cRTM, cpsi,reflr

      cZ0TE=mur(n)/ckz
      cZ0TM=ckz/cer(n)
      call refl(cZTE,cZ0TE,reflr)
      cRTE=reflr
      call refl(cZTM,cZ0TM,reflr)
      cRTM=reflr
      cpsi=0
      if (n .gt. 1 .and. n .lt. nlayer) then
         cpsi=exp(-ci*ckz*(xloc(n)-xloc(n-1)))
         cphi=cpsi**2
         ctant=(1-cphi)/(1+cphi)
         cZTE=cZ0TE*(cZTE+cZ0TE*ctant)/(cZ0TE+cZTE*ctant)
         cZTM=cZ0TM*(cZTM+cZ0TM*ctant)/(cZ0TM+cZTM*ctant)
      end if      
      return
      end

!***************************************************************************
! Calculate the reflection coefficient
!**************************************************************************
      attributes(device) subroutine refl(cZ, cZ0,reflr) 
      implicit none
      complex reflr,cZ,cZ0
      reflr=(cZ-cZ0)/(cZ+cZ0)
      return 
      end




!====================================================================
!===========================process_ckzA=============================
!====================================================================

      attributes(device) subroutine process_ckz(ckz)

      implicit none

      real xrkz, xikz
      complex ckz,ci

      ci=(0,1)
      xrkz=real(ckz)
      xikz=imag(ckz)
      if (xrkz.lt.0) then
         xrkz=-xrkz
      end if
      if (xikz.gt.0) then
         xikz=-xikz
      end if
      ckz=xrkz+ci*xikz
      return
      end




        attributes(device) subroutine bslsdr(n,x,cj,ch,cjp,chp,m)
	parameter(nmax=36)
        integer n,i,m
	complex *16 cjd,chd,xd, jpd,hpd
	complex *16 jd(-36:36),hd(-36:36)
	complex *16 cj(-m:m), cjp(-m:m),ci
	complex *16 ch(-m:m), chp(-m:m),x
	double precision rn


	ci=(0.0,1.0)
	do i=-n,n
	   cj(i) = 0.0
	   cjp(i) = 0.0
	   ch(i) = 0.0
	   chp(i) = 0.0
	end do
	xd = x
	call bes(xd,0,cjd,chd,0)
	jd(0) = cjd
	hd(0) = chd
	
	ir=1
	do i=1,n+1
	   call bes(xd,i,cjd,chd,0)
	   jd(i) = cjd
	   hd(i) = chd
	   jd(-i) = jd(i)
	   hd(-i) = hd(i)
	   if(ir.gt.0) jd(-i) = -jd(i)
	   if(ir.gt.0) hd(-i) = -hd(i)
	   ir = -ir
	end do
	do i=-n,n
	   rn=float(i)
	   jpd = rn*jd(i)-xd*jd(i+1)
	   hpd = rn*hd(i)-xd*hd(i+1)
	   cj(i) = jd(i)
	   ch(i) = hd(i)
	   cjp(i) = jpd
	   chp(i) = hpd
	end do
	return
	end
!*******************************************************************
! This is the code to calculate the Bessel Function of any order   *
!*******************************************************************
	attributes(device)  subroutine bes(x,i,bej,beh,iex)
!...   x: the input variable.
!...   i: order.
!... bej: the first order Bessel Function J0.
!... beh: the second order Bessel Function H0.
!... iex: the exponential index (exp(ix*iex). Suggested value: 0   
!---------------------------------------------------------------                    
       	 real*8 pi,gr,gi,xi,ansr,ansi,anskr,anski !8 bytes means double precision
         complex *16 ci,g,x,bej,beh                !16 bytes means both the real and 
                                                  !the imaginary parts are all in 
                                                  !double precision
	                        
	 ci=(0.d0,1.d0)
	 pi=acos(-1.0)
	 
	 xi=dimag(x)
	 g=x
	 if(xi.lt.0.d0)g=dconjg(x)
       	 g=-ci*g
	 gr=dreal(g)
	 gi=dimag(g)
  	 call bessel(gr,gi,i,ansr,ansi,anskr,anski)
	 bej=ci**i*dcmplx(ansr,ansi)
	 beh=2.d0*(-ci)**(i+1)*dcmplx(anskr,anski)/pi
	 if(xi.ge.0.d0)return
	 if(iex.eq.1)beh=beh*dexp(-2.d0*gr)
	 beh=2.d0*bej-beh
	 bej=dconjg(bej)
	 beh=dconjg(beh)
!-----------------------------------------------------                    
	 return
	 end
!*********************************************************
         attributes(device) subroutine bessel (zr,zi,n,ansr,ansi,anskr,anski)
         implicit double precision (a-h,o-z)
!---------------------------------------------------------                    
         
         real pi
	
         real gam
	 gam=.5772156649015328606d0
	 pi=3.14159265358979323846264338d0
          sr=dsqrt(zr*zr+zi*zi)
         if (sr-(14.+n/2.))300,310,310
  300    continue
         ffn=n
         zdr=dabs(zr)
         zdi=dabs(zi)
         iq=1
         if (zi.lt.0) then
             iq=3
             if (zr.ge.0.) iq=4
         else
         if (zr.le.0.) iq=2
         end if

  123    nm=n-1
         cer1=zdr*.5d0
         cei=zdi*.5d0
         wsumr=0.
         wsumi=0.
         wwr=0.
         wwi=0.
         ucr=1.d0
         uci=0.
         if (n.ne.0) then
         ucr=cer1
         uci=cei
         if (n.ne.1) then
         br=ucr
         bi=uci
         do i=1,nm
            temp1=ucr*br-uci*bi
            uci=ucr*bi+uci*br
            ucr=temp1
        end do
        end if
        end if
  210    on=1.d0
         if (n.ne.0) on=(-1.d0)**n
         ucir=ucr
         ucii=uci
         ucr=ucr*on
         uci=uci*on
         vcr=ucr*.5d0
         vci=uci*.5d0
         pn=0.
         fn=1.d0
         if (n.ne.0) then
         do  i=1,n
            fi=i
            pn=pn+1.d0/fi
         fn=fn*fi
         end do
         end if
   10    uur=1.d0/fn
         uui=0.
         vvr=pn/fn
         vvi=0.
         usumr=uur
         usumi=0.
         vsumr=vvr
         vsumi=0.
         temp1=cer1*cer1
         temp2=cei*cei
         cesr=temp1-temp2
         cesi=2.d0*cer1*cei
         if (n.ne.0) then
         temp1=1.d0/(temp1+temp2)
         wcr=cer1*temp1
         wci=-cei*temp1
         if (nm.ne.0) then
         br=wcr
         bi=wci
         do i=1,nm
            temp1=wcr*br-wci*bi
            wci=wcr*bi+wci*br
            wcr=temp1
         end do
        end if
  221    wcr=wcr*.5d0
         wci=wci*.5d0
         fnm=fn/ffn
         wsumr=fnm
         wsumi=0.
         if (nm.ne.0) then
         wwr=fnm
         wwi=0.
         do k=1,nm
            fk=k
            ft=1.d0/(fk*(ffn-fk))
            wwr=-wwr*ft
            wwi=-wwi*ft
            temp1=wwr*cesr-wwi*cesi
            wwi=wwr*cesi+wwi*cesr
            wwr=temp1
            wsumr=wsumr+wwr
            wsumi=wsumi+wwi
        end do
        end if 
   35    temp1=wsumr*wcr-wsumi*wci
         wsumi=wsumr*wci+wsumi*wcr
         wsumr=temp1
         end if
  100    pk=0.
         do 20 k=1,33
            fk=k
            fnk=fk+ffn
            d=fk*fnk
            tr=cesr/d
            ti=cesi/d
            temp1=uur*tr-uui*ti
            uui=uur*ti+uui*tr
            uur=temp1
            usumr=usumr+uur
            usumi=usumi+uui
            pk=pk+1.d0/fk
            pn=pn+1.d0/fnk
            cr=pn+pk
            vvr=uur*cr
            vvi=uui*cr
            vsumr=vsumr+vvr
            vsumi=vsumi+vvi
            temp1=usumr
            if (temp1.eq.0.) temp1=1.d0
            temp2=usumi
            if (temp2.eq.0.) temp2=1.d0
            temp1=uur/temp1
            temp2=uui/temp2
            if (dabs(temp1).ge..1e-15) cycle
            if (dabs(temp2).ge..1e-15) cycle
            temp1=vsumr
            if (temp1.eq.0.) temp1=1.d0
            temp2=vsumi
            if (temp2.eq.0.) temp2=1.d0
            temp1=vvr/temp1
            temp2=vvi/temp2
            if (dabs(temp1).ge..1e-15) cycle
            if (dabs(temp2).ge..1e-15) cycle
            exit
   20    continue
   15    temp1=vsumr*vcr-vsumi*vci
         vsumi=vsumr*vci+vsumi*vcr
         vsumr=temp1
         celr=.5d0*dlog(cer1*cer1+cei*cei)
         celi=datan2(cei,cer1)
         ar=gam+celr
         ai=celi
         temp1=ar*usumr-ai*usumi
         ai=ar*usumi+ai*usumr
         ar=ucr*temp1-uci*ai
         ai=ucr*ai+uci*temp1
         anskr=wsumr-ar+vsumr
         anski=wsumi-ai+vsumi
         if (sr.ge.5.+n/4.) call zk24 (zdr,zdi,n,anskr,anski)
         ansr=usumr*ucir-usumi*ucii
         ansi=usumr*ucii+usumi*ucir
         if (iq.ne.1) then 
         if (iq.eq.4) then
         
   60    ansi=-ansi
         anski=-anski
         else
   70    cesr=1+2*(2*(n/2)-n)
         if (iq.ne.3) then
         usumr=ansr
         usumi=-ansi
         br=-pi*usumi
         bi=pi*usumr
         anski=-anski
         anskr=cesr*anskr-br
         anski=cesr*anski-bi
         ansr=usumr*cesr
         ansi=usumi*cesr
         else
   80    cr=-pi*ansi
         ci1=pi*ansr
         anskr=anskr*cesr+cr
         anski=anski*cesr+ci1
         ansr=ansr*cesr
         ansi=ansi*cesr
         end if
         end if
         end if
  500    if (iexpck.eq.0) return
         zrxp=dexp(zr)
         anskr=anskr*zrxp
       	 anski=anski*zrxp
         ansr=ansr/zrxp
         ansi=ansi/zrxp
         return
  310    call iln (zr,zi,n,ansr,ansi,anskr,anski)
!-------------------------------------------------------                    
         return
         end
!*******************************************************
         attributes(device) subroutine iln (zr,zi,n,ansr,ansi,anskr,anski)
         implicit double precision (a-h,o-z)
!-------------------------------------------------------                    
         
         real pi
	 pi=3.14159265358979323846264338d0
        argr=2.d0*pi*zr
         argi=2.d0*pi*zi

        if (argr.ge.0.) then
             pr=dsqrt((dsqrt(argr*argr+argi*argi)+argr)*.5d0)
             pii=argi/(2.d0*pr)
        else     
        if (dabs(argi/argr).lt.4.d-7) then    
       
  102    axi=dsqrt(dabs(argr))
         if (argr.lt.0) then 
  103    pr=0.
         pii=axi
         if (argi.lt.0.) pii=-pii
         else
  104    pr=axi
         pii=0.
         end if
         end if
         end if
  112    fn=n
         fn2=4.d0*(fn**2)
         ar=1.d0
         if (iexpck.eq.0) ar=dexp(zr)
         br=dcos(zi)
         bi=dsin(zi)
         ai=1.d0/(pr*pr+pii*pii)
         cor=ar*(br*pr+bi*pii)*ai
         coi=ar*(-br*pii+bi*pr)*ai
         c2r=(br*pr-bi*pii)*ai/ar
         c2i=(-br*pii-bi*pr)*ai/ar
         ai=1+2*(2*(n/2)-n)
         if (zi.lt.0.) ai=-ai
         co2r=-ai*c2i
         co2i=ai*c2r
         if (iexpck.ne.0) then
         tes0=dexp(-2.d0*zr)
         co2r=co2r*tes0
         co2i=co2i*tes0
         end if
   12    tes0=8.d0*dsqrt(zr*zr+zi*zi)
         z8r=zr*8.d0
         z8i=zi*8.d0
         nk=37
         psr=1.d0
         psi=0.
         pkr=1.d0
         pki=0.
         qsr=1.d0
         qsi=0.
         l=1
         do 10 k=1,nk
            l=-l
            qk=l
            kk=k-1
            ts=(2*kk+1)**2
            tt=k
            tes1=tt*tes0
            tes2=ts-fn2
            if (tes2.ge.tes1) cycle
            ar=pkr*tes2
            ai=pki*tes2
            br=tt*z8r
            bi=tt*z8i
            tem1=br*br+bi*bi
            tem2=ai*br-ar*bi
            pkr=(ar*br+ai*bi)/tem1
            pki=tem2/tem1
            psr=psr+pkr
            psi=psi+pki
            qsr=qsr+qk*pkr
            qsi=qsi+qk*pki
   10    continue
   15    ar=cor*psr-coi*psi
         ai=cor*psi+coi*psr
         ansr=ar+co2r*qsr-co2i*qsi
         ansi=ai+co2r*qsi+co2i*qsr
         if (zi.eq.0.) ansi=0.
         anskr=pi*(qsr*c2r-qsi*c2i)
         anski=pi*(qsr*c2i+qsi*c2r)
!------------------------------------------                    
         return
         end
!**********************************************
      attributes(device) subroutine zk24(zr,zi,n,zk1r,zk1i)
      implicit double precision (a-h,o-z)
!----------------------------------------------                    
      real x(12),c(12)
       x( 1)= .361913603606156014d+02
       x( 2)= .276611087798460900d+02
       x( 3)= .213967559361661098d+02
       x( 4)= .164321950876753130d+02
       x( 5)= .123904479638094713d+02
       x( 6)= .907543423096120287d+01
       x( 7)= .636997538803063507d+01
       x( 8)= .419841564487841315d+01
       x( 9)= .250984809723212788d+01
       x(10)= .126958994010396155d+01
       x(11)= .454506681563780283d+00
       x(12)= .503618891172939534d-01
       c( 1)= .332873699297821780d-15
       c( 2)= .131692404861563402d-11
       c( 3)= .609250853997510780d-09
       c( 4)= .803794234988285940d-07
       c( 5)= .431649140980466720d-05
       c( 6)= .113773832728087596d-03
       c( 7)= .164738496537683500d-02
       c( 8)= .140967116201453420d-01
       c( 9)= .748909410064614920d-01
       c(10)= .255479243569118320d+00
       c(11)= .572359070692886040d+00
       c(12)= .853862327737398500d+00
      if (n.ge.7) then
          call zk32(zr,zi,n,zk1r,zk1i)
          return
      end if
      denr=1.d0
      deni=0.
      deni1=0.
      sz=dsin(-zi)
      cz=dcos(-zi)
      if (n.ne.0) then
      do  i=1,n
         temp=denr*zr-deni*zi
         deni=denr*zi+deni*zr
         denr=temp
      end do
      denr1=denr/(denr*denr+deni*deni)
      if (zi.ne.0.) deni1=deni/(deni*deni+denr*denr)
      end if
      yr0=0.
      yi0=0.
      yr1=0.
      yi1=0.
      zrt=zr+zr
      zit=zi+zi
      do 10 i=1,12
         tr=zrt+x(i)
         temp=dsqrt(tr*tr+zit*zit)
         ar=dsqrt((tr+temp)*.5d0)
         if (ar.lt.0) ai=zit/(2.d0*ar)
         if (ar.gt.0) ai=zit/(2.d0*ar)
    
        if (ar.eq.o)  ai=dsqrt(temp)
    5    if (n.eq.0) then
         cc=ar*ar+ai*ai
         br0=ar/cc
         bi0=-ai/cc
         yr0=yr0+br0*c(i)
         yi0=yi0+bi0*c(i)
         else
    6    br1=ar*x(i)
         bi1=ai*x(i)
         if (n.ne.1) then
         br2=tr*x(i)
         bi2=zit*x(i)
         do  k=2,n
            temp=br1*br2-bi1*bi2
            tkm=2*k-1
            bi1=(br1*bi2+bi1*br2)/tkm
            br1=temp/tkm
         end do
         end if
    9    yr1=yr1+br1*c(i)
         yi1=yi1+bi1*c(i)
         end if
   10 continue
      et=dexp(-zr)
      if (n.eq.0) then
      zk1r=(yr0*cz-yi0*sz)*et
      zk1i=(yi0*cz+yr0*sz)*et
      else
   12 zk2r=yr1*denr1+yi1*deni1
      zk2i=yi1*denr1-yr1*deni1
      zk1r=(zk2r*cz-zk2i*sz)*et
      zk1i=(zk2r*sz+zk2i*cz)*et
      end if  
   20 continue

   
!------------------------------------------                    
      return
      end
!******************************************
      attributes(device) subroutine zk32(zr,zi,n,zk1r,zk1i)
      implicit double precision (a-h,o-z)
!------------------------------------------                    
      real x(16),c(16)
       x( 1)= .507772238775370808d+02
       x( 2)= .410816665254912019d+02
       x( 3)= .337819704882261658d+02
       x( 4)= .278314382113286757d+02
       x( 5)= .228213006935252084d+02
       x( 6)= .185377431786066933d+02
       x( 7)= .148514313418012497d+02
       x( 8)= .116770336739759564d+02
       x( 9)= .895500133772339017d+01
       x(10)= .664221517974144425d+01
       x(11)= .470672670766758733d+01
       x(12)= .312460105070214431d+01
       x(13)= .187793150769607417d+01
       x(14)= .953553155390865424d+00
       x(15)= .342200156010947678d+00
       x(16)= .379629145753134561d-01
       c( 1)= .146213528547683240d-21
       c( 2)= .184634730730365840d-17
       c( 3)= .239468803418569740d-14
       c( 4)= .843002042265289520d-12
       c( 5)= .118665829267932772d-09
       c( 6)= .819766432954179320d-08
       c( 7)= .314833558509118800d-06
       c( 8)= .730117025912475220d-05
       c( 9)= .108331681236399652d-03
       c(10)= .107253673105594410d-02
       c(11)= .730978065330885620d-02
       c(12)= .351068576631468600d-01
       c(13)= .120916261911825228d+00
       c(14)= .302539468153284960d+00
       c(15)= .554916284605059800d+00
       c(16)= .750476705185604780d+00
      denr=1.d0
      deni=0.
      deni1=0.
      sz=dsin(-zi)
      cz=dcos(-zi)
      if (n.ne.0) then
      do  i=1,n
         temp=denr*zr-deni*zi
         deni=denr*zi+deni*zr
         denr=temp
      end do
      denr1=denr/(denr*denr+deni*deni)
      if (zi.ne.0.) deni1=deni/(deni*deni+denr*denr)
      end if   
    4 yr0=0.
      yi0=0.
      yr1=0.
      yi1=0.
      zrt=zr+zr
      zit=zi+zi
      do 10 i=1,16
         tr=zrt+x(i)
         temp=dsqrt(tr*tr+zit*zit)
         ar=dsqrt((tr+temp)*.5d0)
         if (ar.lt.0) ai=zit/(2.d0*ar)
         if (ar.eq.0) ai=dsqrt(temp)
         if (ar.gt.0) ai=zit/(2.d0*ar)
    5    if (n.eq.0) then
         cc=ar*ar+ai*ai
         br0=ar/cc
         bi0=-ai/cc
         yr0=yr0+br0*c(i)
         yi0=yi0+bi0*c(i)
         else
    6    br1=ar
         bi1=ai
         p=x(i)
         if (n.ne.1) then
         br2=tr
         bi2=zit
         do 8 k=2,n
            p=p*x(i)
            temp=br1*br2-bi1*bi2
            tkm=2*k-1
            bi1=(br1*bi2+bi1*br2)/tkm
            br1=temp/tkm
    8    continue
        end if
    9    yr1=yr1+br1*(c(i)*p)
         yi1=yi1+bi1*(c(i)*p)
        end if
   10 continue
      et=dexp(-zr)
      if (n.eq.0) then
      zk1r=(yr0*cz-yi0*sz)*et
      zk1i=(yi0*cz+yr0*sz)*et
      else
   12 zk2r=yr1*denr1+yi1*deni1
      zk2i=yi1*denr1-yr1*deni1
      zk1r=(zk2r*cz-zk2i*sz)*et
      zk1i=(zk2r*sz+zk2i*cz)*et
      end if
   20 continue
!------------------------------------------                    
      return
      end




end module test

Hi leonado2004,

Your kernel is too large and is causing the NVIDIA assembler (ptxas) to take a very long time to assemble the code.

Keep in mind that the GPU doesn’t support true function calling. Instead, all routines must me inlined at compilation. Hence, when you have the main kernel call GreenOnePointEJ, which calls linintEF twice, which calls Spectral three times, the kernel can grow quite large. Even if you are willing to wait for ptxas to finish, the very large kernel size will most likely result in poor performance.

The best thing to do here is rewrite the code to use several smaller kernels.

  • Mat