call to 'pgf90_copy_f77_argl' with no acc routine info

Hi,

When building the below subroutine, I am getting the following error. From the compile messages I am not able to understand which variable it is trying to copy and why it fails. Is there a workaround for this issue?

Thanks in advance.

PGF90-S-0155-Procedures called in a compute region must have acc routine information: pgf90_copy_f77_argl (psy/psykal_lite.F90: 280)
PGF90-S-0155-Accelerator region ignored; see -Minfo messages (psy/psykal_lite.F90: 267)
invoke_calc_exner_kernel:
263, Generating copyin(chi_proxy%data(1:undf_w0),theta_proxy%data(1:undf_w0),basis_func(z_b_9_1:z_b_8_1,1:ndf,1:nqp_h,1:nqp_v),basis_f_w0(z_b_22:z_b_21,1:ndf_w0,1:nqp_h,1:nqp_v),dbasis_w0(z_b_35:z_b_34,1:ndf_w0,1:nqp_h,1:nqp_v))
267, Accelerator region ignored
280, Accelerator restriction: call to ‘pgf90_copy_f77_argl’ with no acc routine information
0 inform, 0 warnings, 2 severes, 0 fatal for invoke_calc_exner_kernel



!-------------------------------------------------------------------------------
!> Invoke_calc_exner_kernel: Invoke the calculation of exner pressure
!>
!> Collects the field proxies, quadrature data and basis functions on the host,
!> then calls calc_exner_code once per cell from an OpenACC parallel loop.
!>
!> @param[in] exner  Field whose proxy supplies the sizes ndf, undf_w3, nlayers
!>                   and ncells, the dofmap copied into map_w3, and the data
!>                   array passed to the kernel as its exner argument.
!> @param[in] rho    Field whose proxy data array is passed to the kernel.
!> @param[in] theta  Field whose proxy data array is passed to the kernel and
!>                   whose dofmap is copied into map_w0.
!> @param[in] chi    Three fields; chi(1) supplies ndf_w0, undf_w0 and the
!>                   (differential) basis functions, and all three data arrays
!>                   are passed to the kernel.
!> @param[in] qr     Quadrature object supplying nqp_h, nqp_v, the quadrature
!>                   points (xp, zp) and the weights (wh, wv).
  subroutine invoke_calc_exner_kernel( exner, rho, theta, chi, qr )

    use calc_exner_kernel_mod, only : calc_exner_code
    use log_mod,                 only : log_event,         &
                                      log_set_level,     &
                                      log_scratch_space, &
                                      LOG_LEVEL_ERROR,   &
                                      LOG_LEVEL_INFO,    &
                                      LOG_LEVEL_DEBUG,   &
                                      LOG_LEVEL_TRACE

    type( field_type ), intent( in ) :: exner, rho, theta
    type( field_type ), intent( in ) :: chi(3)
    type( quadrature_type), intent( in ) :: qr

    integer                 :: cell, ndf, dim, ndf_w0,dim_w0,diff_dim
    integer                 :: undf_w0, undf_w3, df1
    integer                 :: nqp_v, nqp_h, nlayers, ncells
    ! Per-cell copies of the dofmap columns.  They are allocatable scratch
    ! arrays (filled element by element inside the cell loop) rather than
    ! pointers into the dofmap.
    integer, allocatable        :: map_w3(:), map_w0(:)

    type( field_proxy_type )        :: exner_proxy, rho_proxy, theta_proxy
    type( field_proxy_type )        :: chi_proxy(3)

    real(kind=r_def),allocatable         :: basis_func(:,:,:,:), &
                                            basis_f_w0(:,:,:,:), &
                                            dbasis_w0(:,:,:,:)
    ! No "=> null()" initialisers: on a local they imply SAVE, and in
    ! "wh(:), wv(:) => null()" only wv was initialised.  All four pointers are
    ! pointer-assigned below before they are used.
    real(kind=r_def), pointer :: xp(:,:)
    real(kind=r_def), pointer :: zp(:)
    real(kind=r_def), pointer :: wh(:), wv(:)

    exner_proxy  = exner%get_proxy()
    rho_proxy    = rho%get_proxy()
    theta_proxy  = theta%get_proxy()
    chi_proxy(1) = chi(1)%get_proxy()
    chi_proxy(2) = chi(2)%get_proxy()
    chi_proxy(3) = chi(3)%get_proxy()

    ! Sizes of the exner function space and of the quadrature rule.
    ndf = exner_proxy%vspace%get_ndf( )
    dim = exner_proxy%vspace%get_dim_space( )
    nqp_h=qr%get_nqp_h()
    nqp_v=qr%get_nqp_v()
    zp=>qr%get_xqp_v()
    xp=>qr%get_xqp_h()
    wh=>qr%get_wqp_h()
    wv=>qr%get_wqp_v()

    allocate(basis_func(dim,ndf,nqp_h,nqp_v))

    ! Sizes of the function space shared by chi (and used for theta's data).
    ndf_w0 = chi_proxy(1)%vspace%get_ndf( )
    dim_w0 = chi_proxy(1)%vspace%get_dim_space( )
    diff_dim = chi_proxy(1)%vspace%get_dim_space_diff( )
    allocate(basis_f_w0(dim_w0,ndf_w0,nqp_h,nqp_v))
    allocate(dbasis_w0(diff_dim,ndf_w0,nqp_h,nqp_v) )
    undf_w0 = chi_proxy(1)%vspace%get_undf()
    undf_w3 = exner_proxy%vspace%get_undf()
    nlayers = exner_proxy%vspace%get_nlayers()
    ncells = exner_proxy%vspace%get_ncell()

    allocate(map_w3(ndf))
    allocate(map_w0(ndf_w0))
    call exner_proxy%vspace%compute_basis_function(basis_func,ndf,nqp_h,nqp_v,xp,zp)
    call chi_proxy(1)%vspace%compute_basis_function(basis_f_w0,ndf_w0,nqp_h,nqp_v,xp,zp)
    call chi_proxy(1)%vspace%compute_diff_basis_function(dbasis_w0,ndf_w0,nqp_h,nqp_v,xp,zp)

    ! Data region: psykal_lite.F90 line 263 in the reported build.
!$acc data copyin(chi_proxy(1)%data(1:undf_w0), chi_proxy(2)%data(1:undf_w0), &
!$acc chi_proxy(3)%data(1:undf_w0), theta_proxy%data(1:undf_w0), &
!$acc basis_func(:,1:ndf,1:nqp_h,1:nqp_v), basis_f_w0(:,1:ndf_w0,1:nqp_h,1:nqp_v), dbasis_w0(:,1:ndf_w0,1:nqp_h,1:nqp_v)       )
    ! The explicit copyout of the result was disabled by the original author;
    ! exner_proxy%data and rho_proxy%data appear in no data clause.
!!$acc copyout( exner_proxy%data(1:undf_w3) )
    ! Compute region: psykal_lite.F90 line 267 in the reported build.
    ! map_w3/map_w0 are rewritten for every cell, so each gang needs its own
    ! copy; without the private clause concurrent cells overwrite each other's
    ! dof maps.
!$acc parallel loop gang private(map_w3, map_w0)
    do cell = 1,ncells

     do df1 = 1, ndf
       map_w3(df1) = exner_proxy%vspace%dofmap(df1,cell)
     end do

     do df1 = 1, ndf_w0
        map_w0(df1) = theta_proxy%vspace%dofmap(df1,cell)
     end do

       ! Kernel call: psykal_lite.F90 line 280 in the reported build, the line
       ! the "pgf90_copy_f77_argl" diagnostic points at.
       ! NOTE(review): several actual arguments here are pointer arrays (wh, wv
       ! and presumably the proxy %data components); passing them to
       ! explicit-shape dummies is the likely reason the compiler inserts the
       ! copy-check call -- confirm against the callee's interface.
       call calc_exner_code( nlayers, &
                             ndf, &
                             undf_w3, &
                             map_w3, &
                             basis_func, &
                             exner_proxy%data, &
                             rho_proxy%data, &
                             ndf_w0, &
                             undf_w0, &
                             map_w0, &
                             basis_f_w0, &
                             theta_proxy%data, &
                             dbasis_w0, &
                             chi_proxy(1)%data, &
                             chi_proxy(2)%data, &
                             chi_proxy(3)%data,  &
                             nqp_h, &
                             nqp_v, &
                             wh, wv )

!   call exner%log_field( LOG_LEVEL_DEBUG, LOG_LEVEL_INFO, 'exner' )
    end do
!$acc end parallel loop
!$acc end data
    deallocate(basis_func,basis_f_w0,dbasis_w0)
    deallocate(map_w3,map_w0)
  end subroutine invoke_calc_exner_kernel

Hi Karthee,

“pgf90_copy_f77_argl” is a runtime routine which checks if it needs to create a temp array that can be passed between a F90 array in the caller and F77 array in the callee. My guess is that you have an assumed size array in “calc_exner_code”.

We didn’t port this routine to the device since it could create a temp array, and having every thread create a temp array would be very poor for performance.

Can you post the interface to calc_exner_code?

  • Mat

Hi Mat,

Please find below the interface of calc_exner_code:

subroutine calc_exner_code(nlayers,ndf_w3,undf_w3, & ! integers
                           map_w3, w3_basis, &  ! arrays
                           exner, rho, & !data
                           ndf_w0, undf_w0, & !integers
                           map_w0,w0_basis, & ! arrays
                           theta, & ! data
                           w0_diff_basis, & ! arrays
                           chi_1, chi_2, chi_3,  & ! data
                           nqp_h, nqp_v, & !integers
                           wqp_h, wqp_v ) ! quadrature weights
  use coordinate_jacobian_mod, only: coordinate_jacobian
  use reference_profile_mod,   only: reference_profile

  !Arguments
  integer, intent(in) :: nlayers, ndf_w0, ndf_w3, undf_w3, undf_w0, nqp_h, nqp_v
  integer, intent(in) :: map_w0(ndf_w0), map_w3(ndf_w3)
  real(kind=r_def), intent(in), dimension(1,ndf_w3,nqp_h,nqp_v) :: w3_basis
  real(kind=r_def), intent(in), dimension(1,ndf_w0,nqp_h,nqp_v) :: w0_basis
  real(kind=r_def), intent(in), dimension(3,ndf_w0,nqp_h,nqp_v) :: w0_diff_basis
  real(kind=r_def), dimension(undf_w3), intent(inout) :: exner, rho
  real(kind=r_def), dimension(undf_w0), intent(in)    :: theta, &
                                                         chi_1, chi_2, chi_3
  real(kind=r_def), dimension(nqp_h), intent(in)      ::  wqp_h
  real(kind=r_def), dimension(nqp_v), intent(in)      ::  wqp_v

  !Internal variables
  integer               :: df1, df2, k
  integer               :: qp1, qp2

  real(kind=r_def), dimension(ndf_w3) :: exner_e, rho_e, rhs_e
  real(kind=r_def), dimension(ndf_w0) :: theta_e
  real(kind=r_def), dimension(ndf_w0) :: chi_1_e, chi_2_e, chi_3_e
  real(kind=r_def), dimension(nqp_h,nqp_v)     :: dj
  real(kind=r_def), dimension(3,3,nqp_h,nqp_v) :: jac
  real(kind=r_def), dimension(ndf_w3,ndf_w3) :: mass_matrix_w3, inv_mass_matrix_w3
  real(kind=r_def) :: rho_at_quad, rho_s_at_quad,                                 &
                   theta_at_quad, theta_s_at_quad,                             &
                  exner_s_at_quad
  real(kind=r_def) :: rhs_eos, x_at_quad(3)

What version of PGI compilers are you using?

It looks like you want to call this routine from a compute kernel. I don’t see the routine directive. Is this meant to run sequentially from all the threads?

Hi,

I am using the following compiler version.
pgfortran 15.5-0 64-bit target on Apple OS/X -tp haswell

I am using !$acc routine vector for the calc_exner_code routine and it seems to work fine. The idea is to run the routine on an SM and run the calculation of different cells in parallel on different SMs. Please note that !$acc routine worker also works.

calc_exner_code:
86, Generating acc routine vector
131, !$acc loop vector ! threadidx%x
136, !$acc loop vector ! threadidx%x
151, !$acc loop vector ! threadidx%x
160, !$acc loop vector ! threadidx%x
164, !$acc loop vector ! threadidx%x
177, !$acc loop vector ! threadidx%x
193, !$acc loop vector ! threadidx%x
86, Generating Tesla code
reduction in routine disables compute capability 2.x kernel
128, Loop is parallelizable
131, Loop is parallelizable
136, Loop is parallelizable
147, Loop carried dependence of x_at_quad prevents parallelization
Loop carried dependence of x_at_quad prevents vectorization
Loop carried backward dependence of x_at_quad prevents vectorization
149, Loop carried reuse of rhs_e prevents parallelization
Loop carried dependence of x_at_quad prevents parallelization
Loop carried dependence of x_at_quad prevents vectorization
Loop carried backward dependence of x_at_quad prevents vectorization
150, Complex loop carried dependence of rhs_e prevents parallelization
Loop carried dependence of rhs_e prevents parallelization
Loop carried backward dependence of rhs_e prevents vectorization
Loop carried dependence of x_at_quad prevents parallelization
Loop carried backward dependence of x_at_quad prevents vectorization
151, Loop is parallelizable
152, Complex loop carried dependence of x_at_quad prevents parallelization
Loop carried reuse of x_at_quad prevents parallelization
160, Loop is parallelizable
164, Loop is parallelizable
176, Loop is parallelizable
177, Loop is parallelizable
179, Loop carried reuse of mass_matrix_w3 prevents parallelization
180, Complex loop carried dependence of mass_matrix_w3 prevents parallelization
Loop carried dependence of mass_matrix_w3 prevents parallelization
Loop carried backward dependence of mass_matrix_w3 prevents vectorization
193, Loop is parallelizable
195, Complex loop carried dependence of exner_e prevents parallelization
Loop carried reuse of exner_e prevents parallelization
201, Loop carried reuse of exner prevents parallelization

!> Compute exner for one column of cells, layer by layer.
!>
!> For each layer k = 0 .. nlayers-1 the routine
!>   1. gathers the element values of rho, theta and chi via the dof maps,
!>   2. calls coordinate_jacobian to obtain jac and dj at the quadrature points,
!>   3. integrates the right-hand side rhs_e over the cell by quadrature,
!>   4. assembles the w3 mass matrix, inverts it with matrix_invert and
!>      multiplies the inverse by rhs_e,
!>   5. scatters the result into exner( map_w3(:) + k ).
!>
!> Compiled as an OpenACC "routine vector" so it can be called from a device
!> compute region.
!>
!> @param[in]    nlayers        Number of layers looped over (k = 0..nlayers-1)
!> @param[in]    ndf_w3         Extent of map_w3 and of the w3 element arrays
!> @param[in]    undf_w3        Extent of exner and rho
!> @param[in]    map_w3         Base indices into exner/rho; k is added per layer
!> @param[in]    w3_basis       Basis values, indexed (1, df, qp_h, qp_v)
!> @param[inout] exner          Result array; entries map_w3(:)+k are overwritten
!> @param[inout] rho            Only read by this routine
!> @param[in]    ndf_w0         Extent of map_w0 and of the w0 element arrays
!> @param[in]    undf_w0        Extent of theta and chi_1..chi_3
!> @param[in]    map_w0         Base indices into theta/chi; k is added per layer
!> @param[in]    w0_basis       Basis values, indexed (1, df, qp_h, qp_v)
!> @param[in]    theta          Read at map_w0(:)+k
!> @param[in]    w0_diff_basis  Passed through to coordinate_jacobian
!> @param[in]    chi_1          Read at map_w0(:)+k
!> @param[in]    chi_2          Read at map_w0(:)+k
!> @param[in]    chi_3          Read at map_w0(:)+k
!> @param[in]    nqp_h          Number of horizontal quadrature points
!> @param[in]    nqp_v          Number of vertical quadrature points
!> @param[in]    wqp_h          Horizontal quadrature weights
!> @param[in]    wqp_v          Vertical quadrature weights
subroutine calc_exner_code(nlayers,ndf_w3,undf_w3, & ! integers
                           map_w3, w3_basis, &  ! arrays
                           exner, rho, & !data
                           ndf_w0, undf_w0, & !integers
                           map_w0,w0_basis, & ! arrays
                           theta, & ! data
                           w0_diff_basis, & ! arrays
                           chi_1, chi_2, chi_3,  & ! data
                           nqp_h, nqp_v, & !integers
                           wqp_h, wqp_v ) ! quadrature weights
  use coordinate_jacobian_mod, only: coordinate_jacobian
  use reference_profile_mod,   only: reference_profile
!$acc routine vector
  !Arguments
  integer, intent(in) :: nlayers, ndf_w0, ndf_w3, undf_w3, undf_w0, nqp_h, nqp_v
  integer, intent(in), dimension(ndf_w0) :: map_w0
  integer, intent(in), dimension(ndf_w3) :: map_w3
  real(kind=r_def), intent(in), dimension(1,ndf_w3,nqp_h,nqp_v) :: w3_basis
  real(kind=r_def), intent(in), dimension(1,ndf_w0,nqp_h,nqp_v) :: w0_basis
  real(kind=r_def), intent(in), dimension(3,ndf_w0,nqp_h,nqp_v) :: w0_diff_basis
  real(kind=r_def), dimension(undf_w3), intent(inout) :: exner, rho
  real(kind=r_def), dimension(undf_w0), intent(in)    :: theta, &
                                                         chi_1, chi_2, chi_3
  real(kind=r_def), dimension(nqp_h), intent(in)      ::  wqp_h
  real(kind=r_def), dimension(nqp_v), intent(in)      ::  wqp_v

  !Internal variables
  integer               :: df1, df2, k
  integer               :: qp1, qp2

  ! Element-local copies of the field values for the current layer
  real(kind=r_def), dimension(ndf_w3) :: exner_e, rho_e, rhs_e
  real(kind=r_def), dimension(ndf_w0) :: theta_e
  real(kind=r_def), dimension(ndf_w0) :: chi_1_e, chi_2_e, chi_3_e
  ! Outputs of coordinate_jacobian at every quadrature point
  real(kind=r_def), dimension(nqp_h,nqp_v)     :: dj
  real(kind=r_def), dimension(3,3,nqp_h,nqp_v) :: jac
  real(kind=r_def), dimension(ndf_w3,ndf_w3) :: mass_matrix_w3, inv_mass_matrix_w3
  real(kind=r_def) :: rho_at_quad, rho_s_at_quad,                                 &
                   theta_at_quad, theta_s_at_quad,                             &
                  exner_s_at_quad
  real(kind=r_def) :: rhs_eos, x_at_quad(3)

  do k = 0, nlayers-1
  ! Extract element arrays of rho & theta
    do df1 = 1, ndf_w3
      rho_e(df1) = rho( map_w3(df1) + k )
    end do
    do df1 = 1, ndf_w0
      theta_e(df1) = theta( map_w0(df1) + k )
      chi_1_e(df1) = chi_1( map_w0(df1) + k )
      chi_2_e(df1) = chi_2( map_w0(df1) + k )
      chi_3_e(df1) = chi_3( map_w0(df1) + k )
    end do
    call coordinate_jacobian(ndf_w0, nqp_h, nqp_v, chi_1_e, chi_2_e, chi_3_e,  &
                             w0_diff_basis, jac, dj)
  ! compute the RHS integrated over one cell
  ! The quadrature-point quantities (x_at_quad, the reference profile,
  ! rho_at_quad, theta_at_quad, rhs_eos) do not depend on the test function
  ! df1, so they are evaluated once per quadrature point and then accumulated
  ! into every rhs_e(df1).  Each rhs_e(df1) still receives its contributions in
  ! the same (qp2, qp1) order as before.
  ! NOTE(review): assumes reference_profile has no side effects -- confirm.
    do df1 = 1, ndf_w3
      rhs_e(df1) = 0.0_r_def
    end do
    do qp2 = 1, nqp_v
      do qp1 = 1, nqp_h
        x_at_quad(:) = 0.0_r_def
        do df2 = 1, ndf_w0
          x_at_quad(1) = x_at_quad(1) + chi_1_e(df2)*w0_basis(1,df2,qp1,qp2)
          x_at_quad(2) = x_at_quad(2) + chi_2_e(df2)*w0_basis(1,df2,qp1,qp2)
          x_at_quad(3) = x_at_quad(3) + chi_3_e(df2)*w0_basis(1,df2,qp1,qp2)
        end do
        call reference_profile(exner_s_at_quad, rho_s_at_quad, &
                               theta_s_at_quad, x_at_quad)
        rho_at_quad = 0.0_r_def
        do df2 = 1, ndf_w3
          rho_at_quad = rho_at_quad + rho_e(df2)*w3_basis(1,df2,qp1,qp2)
        end do
        theta_at_quad   = 0.0_r_def
        do df2 = 1, ndf_w0
          theta_at_quad  = theta_at_quad   + theta_e(df2) * w0_basis(1,df2,qp1,qp2)
        end do
        rhs_eos = kappa / (1.0_r_def - kappa) * exner_s_at_quad                 &
                *( rho_at_quad/rho_s_at_quad + theta_at_quad/theta_s_at_quad )
        do df1 = 1, ndf_w3
          rhs_e(df1) = rhs_e(df1) + wqp_h(qp1)*wqp_v(qp2)*w3_basis(1,df1,qp1,qp2) * rhs_eos * dj(qp1,qp2)
        end do
      end do
    end do
  ! compute the LHS integrated over one cell and solve
    do df1 = 1, ndf_w3
       do df2 = 1, ndf_w3
          mass_matrix_w3(df1,df2) = 0.0_r_def
          do qp2 = 1, nqp_v
             do qp1 = 1, nqp_h
                 mass_matrix_w3(df1,df2) = mass_matrix_w3(df1,df2) &
                                         + wqp_h(qp1)*wqp_v(qp2)* &
                                         w3_basis(1,df1,qp1,qp2) * &
                                         w3_basis(1,df2,qp1,qp2) * dj(qp1,qp2)
             end do
          end do
       end do
    end do
    call matrix_invert(mass_matrix_w3,inv_mass_matrix_w3,ndf_w3)
    ! Explicit loops stand in for
    !   exner_e(:) = matmul(inv_mass_matrix_w3(:,:),rhs_e(:))
    do df1 = 1, ndf_w3
       exner_e(df1) = 0.0_r_def
       do df2 = 1, ndf_w3
          exner_e(df1) = exner_e(df1) + inv_mass_matrix_w3(df1,df2) * rhs_e(df2)
       end do
    end do
    ! Scatter the element solution back into the global exner array
    do df1 = 1,ndf_w3
      exner(map_w3(df1)+k) = exner_e(df1)
    end do
  end do
end subroutine calc_exner_code