problems in accelerating code

Hello,
I’m sending the following code (with a minor correction).
I tried implementing the code on the GPU, till now unsuccessfully.
(memory problems). Could anyone help me in fixing this code?
Best regards,
Barak


IMPLICIT NONE
integer L
!real*8, device , dimension(:,:), allocatable  :: e_f_1r
!real*8, device , dimension(:,:), allocatable  :: e_f_1i
!real*8, device , dimension(:,:), allocatable  :: tempr,tempi
real*8,  dimension(:,:), allocatable  :: e_f_1r
real*8,  dimension(:,:), allocatable  :: e_f_1i
real*8,  dimension(:,:), allocatable  :: tempr,tempi
integer isize

integer ipb(10),ipe(10)
integer plt(0:10),plp(0:10)
          plp(0)=1285
          plt(0)=1477

          ipb(1)=2
          ipe(1)=5
          plp(1)=645
          plt(1)=741

          ipb(2)=6
          ipe(2)=21
          plp(2)=325
          plt(2)=373

          ipb(3)=22
          ipe(3)=85
          plp(3)=165
          plt(3)=189

          ipb(4)=86
          ipe(4)=341
          plp(4)=85
          plt(4)=97

          ipb(5)=342
          ipe(5)=1365
          plp(5)=45                
           plt(5)=51   
          
          isize=9000000/2
          ALLOCATE(tempr(3,isize))
          ALLOCATE(tempi(3,isize))
          ALLOCATE(e_f_1r(3,isize))
          ALLOCATE(e_f_1i(3,isize))

        do L=5,1,-1
        call suba(l,ipb,ipe,plp,plt,e_f_1r,e_f_1i,tempr,tempi)
        enddo
        
      stop
      end

        
        subroutine suba(l,ipb,ipe,plp,plt,e_f_1r,e_f_1i,tempr,tempi)

IMPLICIT NONE
integer p1,p
integer ipb(10),ipe(10)
integer plt(0:10),plp(0:10)

integer i1,i2
integer :: ip,io,ioo,ii,i,L
real*8, device :: tempr(3,ipb(l):ipe(L),plp(l),plt(l))   ! in device
real*8, device :: tempi(3,ipb(l):ipe(L),plp(l),plt(l))   ! in device
!real*8:: tempr(3,ipb(l):ipe(L),plp(l),plt(l))   ! in device
!real*8 :: tempi(3,ipb(l):ipe(L),plp(l),plt(l))   ! in device

real*8, device ::  e_f_1r(3,plp(l-1),plt(l-1),ipb(l):ipe(l)) !in device
real*8, device ::  e_f_1i(3,plp(l-1),plt(l-1),ipb(l):ipe(l)) !in device
!real*8  ::  e_f_1r(3,plp(l-1),plt(l-1),ipb(l):ipe(l)) !in device
!real*8 ::  e_f_1i(3,plp(l-1),plt(l-1),ipb(l):ipe(l)) !in device
real*8, dimension(3,(-P+1):PLp(0)+P,PLt(0)) :: E_Fr
real*8, dimension(3,(-P+1):PLp(0)+P,PLt(0)) :: E_Fi
real*8 tempr1(3,1600,1600)
real*8 tempi1(3,1600,1600)
integer :: mP
parameter (p1=1)
parameter (p=2)
real*8 cc(p1:p)
data cc/-6.25E-002,0.5625,0.5625,-6.250E-002/
       
        

        print*,' l=',l
        
        print*,' ipb,ipe=',ipb(l),ipe(l)
        print*,' plp,plt=',plp(l),plt(l)
        print*,'p,p1=',p,p1
        print*,cc

        tempr1=1
        tempi1=1
        e_f_1r=0
        e_f_1i=0
       do 2110 ip=ipb(L),ipe(L)
          do  ii=1,PLp(L)
          do  i=1,PLt(L)
          tempr1(:,ii,i)=tempr(:,ip,ii,i)
          tempi1(:,ii,i)=tempi(:,ip,ii,i)
         enddo
          enddo
          do 221 ii=1,PLp(L)
             i1=(ii-P1)*2-1
             do 27 i=(P1+1),(PLt(L)-P1)
              i2=(i-P1)*2-1
              E_Fr(:,i1,i2)=tempr1(:,ii,i)
              E_Fi(:,i1,i2)=tempi1(:,ii,i)
       27    continue

         do 28 i=(P1+1),(PLt(L)-P1-1)
          i2=(i-P1)*2
          E_Fr(:,i1,i2)=0
          E_Fi(:,i1,i2)=0
          do 29 mP=-P1,P
            E_Fr(:,i1,i2)=E_Fr(:,i1,i2)+CC(mP)*tempr1(:,ii,i+mp)
            E_Fi(:,i1,i2)=E_Fi(:,i1,i2)+CC(mP)*tempi1(:,ii,i+mp)
   29     continue
   28    continue

   221   continue

        do 34 i=1,PLt(L-1)
         do 35 ii=1,(PLp(L-1)-1),2
          E_Fr(:,ii+1,i)=0
          E_Fi(:,ii+1,i)=0
          do 32 mP=-P1,P
           E_Fr(:,ii+1,i)=E_Fr(:,ii+1,i)+CC(mP)*E_Fr(:,ii+2*mP,i)
           E_Fi(:,ii+1,i)=E_Fi(:,ii+1,i)+CC(mP)*E_Fi(:,ii+2*mP,i)
   32     continue
   35    continue
   34   continue

            do i=1,PLt(L-1)
           do  ii=1,PLp(L-1)
           E_F_1r(:,ii,i,ip)=E_Fr(:,ii,i)
           E_F_1i(:,ii,i,ip)=E_Fi(:,ii,i)
           enddo
           enddo

   2110   continue

        return
        end

Hi Barak,

The problem here is that you’re trying to use device arrays on the host.

Are you wanting to create a CUDA Fortran program? If so, I would suggests taking a step back and review some of our training material on CUDA Fortran, http://www.pgroup.com/resources/cudafortran.htm, and/or get a copy of the book “CUDA Fortran for Scientists and Engineers” http://store.elsevier.com/CUDA-Fortran-for-Scientists-and-Engineers/Gregory-Ruetsch/isbn-9780124169708/.

Another way is take your host program and then add OpenACC directives to offload your code to the GPU. Please see our tutorial articles for more information (http://www.pgroup.com/resources/accel.htm) or those on OpenACC.org (http://www.openacc.org/node/1).

Hope this helps,
Mat