Reflected with subroutines within CONTAINS

Hi,

I have code in which some subroutines are defined inside another subroutine, after its CONTAINS statement. Here is an example:

module computation
  implicit none
  ! Test case: accelerator regions located in subroutines contained in organize.
CONTAINS
! organize: update "a" in place with compute_1 (option==1) or compute_2 (otherwise).
!---------------------------------------------
subroutine organize(nvec,nlev,a,option)
  real, intent(inout) :: a(:,:)            ! array modified in place by the contained routines
  integer, intent(in) :: nvec,nlev,option  ! nvec/nlev: upper loop bounds on dims 1/2 of a
  integer :: i,k                           ! not referenced in organize's own body
  !$acc reflected(a)  
  ! NOTE: "reflected" is intended to declare that the caller already has "a" on the device.
  IF (option==1) THEN
     call compute_1
  ELSE
     call compute_2
  END IF
  ! The contained routines take no arguments: they reach a, nvec and nlev
  ! through host association with organize.
CONTAINS
! compute_1: a(i,k) = a(i,k)*a(i,k-1) for i = 1..nvec, k = 2..nlev.
subroutine compute_1
  integer :: i,k   ! loop indices local to this routine
  !$acc region do kernel 
     do i=1,nvec       
        do k=2,nlev   ! starts at 2 because a(i,k-1) is read
           a(i,k)=a(i,k)*a(i,k-1)   ! uses the value written for k-1, so k must run in order
        end do
     end do
     !$acc end region
end subroutine compute_1
! compute_2: same sweep as compute_1 with an extra factor of 2.
subroutine compute_2
  integer :: i,k   ! loop indices local to this routine
       !$acc region do kernel 
     do i=1,nvec       
        do k=2,nlev   ! starts at 2 because a(i,k-1) is read
           a(i,k)=2*a(i,k)*a(i,k-1)   ! uses the value written for k-1, so k must run in order
        end do
     end do
     !$acc end region
end subroutine compute_2

end subroutine organize
!-----------------------------
end module computation
  
program main   ! driver: fill a, send it to the device, run organize, print the sum
  USE computation
  implicit none
  real, allocatable :: a(:,:)   ! host array; the directive below requests a mirrored device copy
  !$acc mirror(a)
  integer, parameter :: n1=10000, nlev=60   ! extents used to allocate a
  integer, parameter :: option=0            ! not 1, so organize takes the compute_2 branch

  ! Allocate a with shape (n1,nlev).
  allocate(a(n1,nlev))
  ! Set every element of a to 0.1 on the host.
  a=0.1
  !$acc update device(a) 
  ! The update above sends the host values of a to the device before the call.
  call organize(n1,nlev,a,option)
  ! Bring the device values of a back to the host before summing.
  !$acc update host(a)
  print*, sum(a)   ! sum over all elements of a

end program main

When I compile, I get the following message:

compute_1:
     27, Generating allocate(a(:,:))
         Generating copyin(a(1:nvec,1:nlev))
     ...

which indicates that the compiler does not see that “a” is already on the device (as it should be from the reflected statement in the organize subroutine).

What am I doing wrong here?

Thanks,

Xavier

Hi Xavier,

I’ve passed this on to our engineers since I think it should work (TPR#18076). My guess is that they just missed this case. I’ll let you know.

As a workaround, you can move “a” into the module, and the “mirror” will then be visible across all routines in the module.

  • Mat
% cat reflect.f90 
module computation
  implicit none
  real, allocatable :: a(:,:)   ! module-level array shared by every routine below
  !$acc mirror(a)
  ! Workaround variant: a is a mirrored module variable instead of a reflected dummy argument.
CONTAINS
! organize: update module array a with compute_1 (option==1) or compute_2 (otherwise).
!---------------------------------------------
subroutine organize(nvec,nlev,option)
  integer, intent(in) :: nvec,nlev,option  ! nvec/nlev: upper loop bounds on dims 1/2 of a
  integer :: i,k                           ! not referenced in organize's own body
  ! Dispatch on option; the contained routines do the actual work.
  IF (option==1) THEN
     call compute_1
  ELSE
     call compute_2
  END IF
  ! The contained routines take no arguments: nvec and nlev come from organize
  ! by host association, and a is the module variable.
CONTAINS
! compute_1: a(i,k) = a(i,k)*a(i,k-1) for i = 1..nvec, k = 2..nlev.
subroutine compute_1
  integer :: i,k   ! loop indices local to this routine
  !$acc region do kernel
     do i=1,nvec       
        do k=2,nlev   ! starts at 2 because a(i,k-1) is read
           a(i,k)=a(i,k)*a(i,k-1)   ! uses the value written for k-1, so k must run in order
        end do
     end do
     !$acc end region
end subroutine compute_1
! compute_2: same sweep as compute_1 with an extra factor of 2.
subroutine compute_2
  integer :: i,k   ! loop indices local to this routine
       !$acc region do kernel
     do i=1,nvec       
        do k=2,nlev   ! starts at 2 because a(i,k-1) is read
           a(i,k)=2*a(i,k)*a(i,k-1)   ! uses the value written for k-1, so k must run in order
        end do
     end do
     !$acc end region
end subroutine compute_2

end subroutine organize
!-----------------------------
end module computation
 
program main   ! driver: fill module array a, send it to the device, run organize, print the sum
  USE computation
  implicit none
  integer, parameter :: n1=10000, nlev=60   ! extents used to allocate a
  integer, parameter :: option=0            ! not 1, so organize takes the compute_2 branch
  ! a is not declared here; it is the allocatable array made available by USE computation.
  ! Allocate it with shape (n1,nlev).
  allocate(a(n1,nlev))
  ! Set every element of a to 0.1 on the host.
  a=0.1
  !$acc update device(a)
  ! The update above sends the host values of a to the device before the call.
  call organize(n1,nlev,option)
  ! Bring the device values of a back to the host before summing.
  !$acc update host(a)
  print*, sum(a)   ! sum over all elements of a

end program main

danger3:/home/colgrove/tmp% pgf90 -ta=nvidia -Minfo=accel reflect.f90
compute_1:
     24, Generating compute capability 1.0 binary
         Generating compute capability 1.3 binary
         Generating compute capability 2.0 binary
     25, Loop is parallelizable
         Accelerator kernel generated
         25, !$acc do parallel, vector(256) ! blockidx%x threadidx%x
             CC 1.0 : 13 registers; 68 shared, 8 constant, 0 local memory bytes; 66% occupancy
             CC 1.3 : 13 registers; 68 shared, 8 constant, 0 local memory bytes; 100% occupancy
             CC 2.0 : 20 registers; 8 shared, 76 constant, 0 local memory bytes; 100% occupancy
     26, Loop carried dependence of 'a' prevents parallelization
         Loop carried backward dependence of 'a' prevents vectorization
compute_2:
     35, Generating compute capability 1.0 binary
         Generating compute capability 1.3 binary
         Generating compute capability 2.0 binary
     36, Loop is parallelizable
         Accelerator kernel generated
         36, !$acc do parallel, vector(256) ! blockidx%x threadidx%x
             CC 1.0 : 13 registers; 68 shared, 8 constant, 0 local memory bytes; 66% occupancy
             CC 1.3 : 13 registers; 68 shared, 8 constant, 0 local memory bytes; 100% occupancy
             CC 2.0 : 20 registers; 8 shared, 76 constant, 0 local memory bytes; 100% occupancy
     37, Loop carried dependence of 'a' prevents parallelization
         Loop carried backward dependence of 'a' prevents vectorization
main:
     58, Generating !$acc update device(a(:,:))
     62, Generating !$acc update host(a(:,:))

TPR 18076 - ACC: reflected variable copied in when region is located in contained subroutine

regards,
dave