I was able to isolate the error to one function call inside a kernel. Manually inlining the subroutine lets the program run correctly. I’ve tried to construct a minimal example (with a device subroutine that gets 1D arrays passed in and uses some module array data as well), but this got inlined correctly and wouldn’t provoke the error. Since I’ve pretty much tested all the code features with that example (let me know if you still want it), my guess is that there is a bug with the inliner in relation to register spilling or something else that needs to be allocated in global memory, which wouldn’t show up in smaller examples. It seems like some component in the toolchain creates host code (cudaMalloc/cudaFree) in a place where it shouldn’t go. I cannot post the whole code here since it’s not OSS, but if you require something else please let me know.
My workaround will be to define the device subroutine code in a Macro (my Hybrid Fortran toolchain already supports multiline-macros, so that’s not a big problem).
The problematic kernel goes as follows (please see comments):
!> For every (i, j) column: assembles the tridiagonal coefficients l, d, u
!> (manually inlined body of make_matrix_tke), computes the vertical
!> momentum tendency tend_z and accumulates it into tend_mom_z_v_l.
!>
!> tend_mom_z_v_l : updated in place with the density- and rjd_ph-weighted
!>                  tend_z contribution.
!> tend_rqke_v    : only referenced by the currently commented-out
!>                  tridiagonal solver calls.
!>
!> rp, eddykm, l_mix and c_e_deardorff are not imported here; presumably
!> they are host-associated from the enclosing module - TODO confirm.
subroutine pbl_run_kernels_tend_mom( tend_mom_z_v_l, tend_rqke_v )
  use prm, only : nx, ny, nz, nz_w
  use dvar, only : vel_z, qa, dens_ptb
  use const, only : one_third
  use fvar, only : id_qke
  use ref, only : dens_ref_f
  use metrics, only : rjd_ph, dzidz_pf, dzidz_ph, dz_ph, dz_pf
  use timeset_vars, only : dt
  implicit none
  real(rp), intent(inout) :: tend_mom_z_v_l(nz_w, nx, ny)
  real(rp), intent(in) :: tend_rqke_v(nz, nx, ny)
  ! Work arrays are stored column-last: (i, j, k).
  real(rp) :: l(nx, ny, 2:nz)
  real(rp) :: d(nx, ny, 1:nz)
  real(rp) :: u(nx, ny, 1:nz-1)
  real(rp) :: tend_z(nx, ny, nz_w)
  real(rp) :: a1, a2
  real(rp) :: e1, e2
  real(rp) :: r, nu1, nu2
  integer(4) :: k
  integer(4) :: i, j
  ! NOTE(review): never assigned in the code shown here although the
  ! kernels directive reads it - confirm it is set in the generated code.
  integer(4) :: hf_symbols_are_device_present

  !$acc data present(tend_mom_z_v_l, tend_rqke_v, l, d, u, tend_z, vel_z, qa, &
  !$acc&             dens_ptb, dens_ref_f, rjd_ph, dzidz_pf, dzidz_ph, eddykm, &
  !$acc&             l_mix, dz_pf, dz_ph)
  !$acc kernels if(hf_symbols_are_device_present)
  !$acc loop independent vector(16)
  do j = 1, ny
    !$acc loop independent vector(16)
    do i = 1, nx
      ! scalar variables (rqke_v)
      !Michel Mueller: The following subroutine call will provoke the linker error.
      ! NOTE(review): the -Minfo output reports "Copy in and copy out of l/d/u
      ! in call to make_matrix_tke"; the strided sections l(i, j, :) passed to
      ! explicit-shape dummies need contiguous temporaries, which is a likely
      ! origin of the device-side cudaMalloc/cudaFree references - confirm.
      ! call make_matrix_tke( i, j, l(i, j, :), d(i, j, :), u(i, j, :) )
      !Michel Mueller: This is the manually inlined version of the subroutine.
      ! l, d, u are indexed (i, j, k) to match their declarations; indexing
      ! them as (k, i, j) runs out of bounds.
      r = dz_pf(1, i, j) / dz_ph(1, i, j)
      nu1 = dzidz_ph(1, i, j) * ( ( 2.0_rp - r ) * eddykm(1, i, j) + r * eddykm(2, i, j) )
      d(i, j, 1) = 1.0_rp + dt * ( nu1 * dzidz_pf(1, i, j) &
        & + c_e_deardorff * sqrt(0.5_rp * qa(1, i, j, id_qke)) / l_mix(1, i, j) )
      u(i, j, 1) = -dt * nu1 * dzidz_pf(2, i, j)
      ! Keeps nu2 defined for the top row even when the loop below is empty.
      nu2 = nu1
      ! nu1 is carried between iterations and nu2 is read after the loop,
      ! so the k loop must run sequentially within each (i, j) thread.
      !$acc loop seq
      do k = 2, nz-1
        ! NOTE(review): r uses index k-1 while nu2 uses dzidz_ph(k); the first
        ! row uses index 1 for both - confirm the intended index of r.
        r = dz_pf(k-1, i, j) / dz_ph(k-1, i, j)
        nu2 = dzidz_ph(k, i, j) * ( ( 2.0_rp - r ) * eddykm(k, i, j) + r * eddykm(k+1, i, j) )
        l(i, j, k) = -dt * nu1 * dzidz_pf(k-1, i, j)
        d(i, j, k) = 1.0_rp &
          & + dt * ( ( nu1 + nu2 ) * dzidz_pf(k, i, j) &
          & + c_e_deardorff * sqrt(0.5_rp * qa(k, i, j, id_qke)) / l_mix(k, i, j) )
        u(i, j, k) = -dt * nu2 * dzidz_pf(k+1, i, j)
        ! The upper coefficient of this level is the lower one of the next
        ! level (same rotation as a1 = a2 in the momentum loop below).
        nu1 = nu2
      end do
      l(i, j, nz) = -dt * nu2 * dzidz_pf(nz-1, i, j)
      d(i, j, nz) = 1.0_rp + dt * ( nu2 * dzidz_pf(nz, i, j) &
        & + c_e_deardorff * sqrt(0.5_rp * qa(nz, i, j, id_qke)) / l_mix(nz, i, j) )
      !Michel Mueller: This is the end of the manually inlined version.
      ! call tridiag_lu_decompose( l(i, j, :), d(i, j, :), u(i, j, :), &
      ! & tend_rqke_v(1, i, j), nz )
      ! call tridiag_solve( l(i, j, :), d(i, j, :), tend_rqke_v(1, i, j), nz )

      ! momentum z
      a1 = ( dens_ref_f(1, i, j) + dens_ptb(1, i, j) ) * eddykm(1, i, j) * &
        & dzidz_pf(1, i, j)
      a2 = ( dens_ref_f(2, i, j) + dens_ptb(2, i, j) ) * eddykm(2, i, j) * &
        & dzidz_pf(2, i, j)
      e1 = ( dens_ref_f(1, i, j) + dens_ptb(1, i, j) ) * qa(1, i, j, id_qke) * one_third
      e2 = ( dens_ref_f(2, i, j) + dens_ptb(2, i, j) ) * qa(2, i, j, id_qke) * one_third
      ! a1/a2 and e1/e2 hold the lower/upper values for level k and are
      ! rotated at the end of every iteration.
      !$acc loop seq
      do k = 1, nz_w-1
        tend_z(i, j, k) = ( a2 * ( vel_z(k+1, i, j) - vel_z(k, i, j) ) - e2 &
          & - a1 * ( vel_z(k, i, j) - vel_z(k-1, i, j) ) + e1 ) &
          & * dzidz_ph(k, i, j)
        a1 = a2
        a2 = ( dens_ref_f(k+1, i, j) + dens_ptb(k+1, i, j) ) * eddykm(k+1, i, j) * &
          & dzidz_pf(k+1, i, j)
        e1 = e2
        e2 = ( dens_ref_f(k+1, i, j) + dens_ptb(k+1, i, j) ) * qa(k+1, i, j, id_qke) * &
          & one_third
      end do
      tend_z(i, j, nz_w) = 0.0_rp
      ! call make_matrix_w( i, j, l(i, j, :), d(i, j, :), u(i, j, :) )
      ! call tridiag_lu_decompose( l(i, j, :), d(i, j, :), u(i, j, :), &
      ! & tend_z(i, j, :), nz_w )
      ! call tridiag_solve( l(i, j, :), d(i, j, :), tend_z(i, j, :), nz_w )

      ! Accumulate the tendency, weighted by the mean of the total density
      ! at levels k and k+1 and by rjd_ph.
      !$acc loop seq
      do k = 1, nz_w
        tend_mom_z_v_l(k, i, j) = tend_mom_z_v_l(k, i, j) + tend_z(i, j, k) &
          & * ( 0.5_rp * ( dens_ref_f(k, i, j) + dens_ref_f(k+1, i, j) ) &
          & + 0.5_rp * ( dens_ptb(k, i, j) + dens_ptb(k+1, i, j) ) ) &
          & * rjd_ph(k, i, j)
      end do
    end do
  end do
  !$acc end kernels
  !$acc end data
end subroutine pbl_run_kernels_tend_mom
The subroutine:
!> Fills the tridiagonal coefficients for the vertical column at (i, j).
!>
!> i, j : horizontal indices of the column.
!> l    : lower coefficients, rows 2..nz.
!> d    : diagonal coefficients, rows 1..nz.
!> u    : upper coefficients, rows 1..nz-1.
!>
!> rp, eddykm, l_mix and c_e_deardorff are not imported here; presumably
!> they are host-associated from the enclosing module - TODO confirm.
!>
!> NOTE(review): l, d, u are explicit-shape dummies, so a strided actual
!> argument such as l(i, j, :) is passed through a contiguous temporary
!> (the -Minfo output reports "Copy in and copy out of l in call to
!> make_matrix_tke"). Assumed-shape dummies would avoid that temporary if
!> every caller sees an explicit interface - confirm before changing.
subroutine make_matrix_tke( i, j, l, d, u )
  use prm, only : nz
  use fvar, only : id_qke
  use dvar, only : qa
  use metrics, only : dz_pf, dz_ph, dzidz_pf, dzidz_ph
  use timeset_vars, only : dt
  implicit none
  integer(4), intent(in) :: i
  integer(4), intent(in) :: j
  real(rp), intent(out) :: l(2:nz)
  real(rp), intent(out) :: d(1:nz)
  real(rp), intent(out) :: u(1:nz-1)
  real(rp) :: r, nu1, nu2
  integer(4) :: k

  ! Bottom row: only the upper coefficient nu1 contributes.
  r = dz_pf(1, i, j) / dz_ph(1, i, j)
  nu1 = dzidz_ph(1, i, j) * ( ( 2.0_rp - r ) * eddykm(1, i, j) + r * eddykm(2, i, j) )
  d(1) = 1.0_rp + dt * ( nu1 * dzidz_pf(1, i, j) &
    & + c_e_deardorff * sqrt(0.5_rp * qa(1, i, j, id_qke)) / l_mix(1, i, j) )
  u(1) = -dt * nu1 * dzidz_pf(2, i, j)

  ! Keeps nu2 defined for the top row even when the loop below is empty.
  nu2 = nu1
  do k = 2, nz-1
    ! NOTE(review): r uses index k-1 while nu2 uses dzidz_ph(k); the first
    ! row uses index 1 for both - confirm the intended index of r.
    r = dz_pf(k-1, i, j) / dz_ph(k-1, i, j)
    nu2 = dzidz_ph(k, i, j) * ( ( 2.0_rp - r ) * eddykm(k, i, j) + r * eddykm(k+1, i, j) )
    l(k) = -dt * nu1 * dzidz_pf(k-1, i, j)
    d(k) = 1.0_rp &
      & + dt * ( ( nu1 + nu2 ) * dzidz_pf(k, i, j) &
      & + c_e_deardorff * sqrt(0.5_rp * qa(k, i, j, id_qke)) / l_mix(k, i, j) )
    u(k) = -dt * nu2 * dzidz_pf(k+1, i, j)
    ! The upper coefficient of this level is the lower one of the next level;
    ! without this rotation every row keeps the bottom-level value of nu1.
    nu1 = nu2
  end do

  ! Top row: only the lower coefficient (the last nu2) contributes.
  l(nz) = -dt * nu2 * dzidz_pf(nz-1, i, j)
  d(nz) = 1.0_rp + dt * ( nu2 * dzidz_pf(nz, i, j) &
    & + c_e_deardorff * sqrt(0.5_rp * qa(nz, i, j, id_qke)) / l_mix(nz, i, j) )
end subroutine make_matrix_tke
Compiler call pbl_shared.f90:
pgf90 -DGPU -I /home/michel/asuca/hybrid/Nusdas13/src -I //home/michel/lib/netcdf3/include -Mcuda=cc3x,6.5 -ta=tesla:loadcache:L1,cc3x -Minline=levels:5,reshape -Mipa=inline,reshape -Minfo=accel,inline,ipa -Mneginfo -Minform=inform -byteswapio -Mmpi=mpich -DGPU -c pbl_shared.f90 -o pbl_shared.o
dxi2dx_scalar:
129, Generating update device(h_pbl(:,:))
130, Generating update device(prandtl_t(:,:,:))
131, Generating update device(l_mix(:,:,:))
132, Generating update device(eddykm(:,:,:))
133, Generating update device(eddykh(:,:,:))
134, Generating copyin(dx_pf(nx_mn:nx_mx-1,ny_mn+1:ny_mx-1),dx_uf(nx_mn:nx_mx-1,ny_mn+1:ny_mx-1),dsdyi(1:nz,nx_mn:nx_mx,ny_mn:ny_mx-1),dsdzi(0:nz,nx_mn:nx_mx,ny_mn+1:ny_mx-1),dzidx_uf(1:nz,nx_mn:nx_mx-1,ny_mn+1:ny_mx-1),dyidx_uf(1:nz,nx_mn:nx_mx-1,ny_mn+1:ny_mx-1),dsdxi(1:nz,nx_mn:nx_mx-1,ny_mn+1:ny_mx-1),dxidx_uf(1:nz,nx_mn:nx_mx-1,ny_mn+1:ny_mx-1))
Generating copyout(dsdx(1:nz,nx_mn:nx_mx-1,ny_mn+1:ny_mx-1))
136, Loop is parallelizable
138, Loop is parallelizable
140, Loop is parallelizable
Accelerator kernel generated
136, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
138, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
155, Generating update host(h_pbl(:,:))
156, Generating update host(prandtl_t(:,:,:))
157, Generating update host(l_mix(:,:,:))
158, Generating update host(eddykm(:,:,:))
159, Generating update host(eddykh(:,:,:))
162, Generating update device(h_pbl(:,:))
163, Generating update device(prandtl_t(:,:,:))
164, Generating update device(l_mix(:,:,:))
165, Generating update device(eddykm(:,:,:))
166, Generating update device(eddykh(:,:,:))
167, Generating copyin(dy_pf(nx_mn+1:nx_mx-1,ny_mn:ny_mx-1),dy_vf(nx_mn+1:nx_mx-1,ny_mn:ny_mx-1),dsdxi(1:nz,nx_mn:nx_mx-1,ny_mn:ny_mx),dsdzi(0:nz,nx_mn+1:nx_mx-1,ny_mn:ny_mx),dzidy_vf(1:nz,nx_mn+1:nx_mx-1,ny_mn:ny_mx-1),dsdyi(1:nz,nx_mn+1:nx_mx-1,ny_mn:ny_mx-1),dyidy_vf(1:nz,nx_mn+1:nx_mx-1,ny_mn:ny_mx-1),dxidy_vf(1:nz,nx_mn+1:nx_mx-1,ny_mn:ny_mx-1))
Generating copyout(dsdy(1:nz,nx_mn+1:nx_mx-1,ny_mn:ny_mx-1))
169, Loop is parallelizable
171, Loop is parallelizable
173, Loop is parallelizable
Accelerator kernel generated
169, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
171, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
188, Generating update host(h_pbl(:,:))
189, Generating update host(prandtl_t(:,:,:))
190, Generating update host(l_mix(:,:,:))
191, Generating update host(eddykm(:,:,:))
192, Generating update host(eddykh(:,:,:))
195, Generating update device(h_pbl(:,:))
196, Generating update device(prandtl_t(:,:,:))
197, Generating update device(l_mix(:,:,:))
198, Generating update device(eddykm(:,:,:))
199, Generating update device(eddykh(:,:,:))
200, Generating copyin(dz_pf(1:nz-1,nx_mn+1:nx_mx-1,ny_mn+1:ny_mx-1),dz_ph(1:nz-1,nx_mn+1:nx_mx-1,ny_mn+1:ny_mx-1),dsdxi(1:nz,nx_mn:nx_mx-1,ny_mn+1:ny_mx-1),dsdyi(1:nz,nx_mn+1:nx_mx-1,ny_mn:ny_mx-1),dsdzi(1:nz-1,nx_mn+1:nx_mx-1,ny_mn+1:ny_mx-1),dzidz_ph(1:nz-1,nx_mn+1:nx_mx-1,ny_mn+1:ny_mx-1),dyidz_ph(1:nz-1,nx_mn+1:nx_mx-1,ny_mn+1:ny_mx-1),dxidz_ph(1:nz-1,nx_mn+1:nx_mx-1,ny_mn+1:ny_mx-1))
Generating copyout(dsdz(0:nz,nx_mn+1:nx_mx-1,ny_mn+1:ny_mx-1))
202, Loop is parallelizable
204, Loop is parallelizable
Accelerator kernel generated
202, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
204, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
206, Loop is parallelizable
223, Generating update host(h_pbl(:,:))
224, Generating update host(prandtl_t(:,:,:))
225, Generating update host(l_mix(:,:,:))
226, Generating update host(eddykm(:,:,:))
227, Generating update host(eddykh(:,:,:))
dx2dxi_scalar:
313, Generating update device(h_pbl(:,:))
314, Generating update device(prandtl_t(:,:,:))
315, Generating update device(l_mix(:,:,:))
316, Generating update device(eddykm(:,:,:))
317, Generating update device(eddykh(:,:,:))
318, Generating copyin(dx_pf(0:nx,1:ny+1),dx_uf(0:nx,1:ny+1),factor(:nz,:nx+1,1:ny+1),dsdy(1:nz,0:nx+1,0:ny+1),dsdz(0:nz,0:nx+1,1:ny+1),dxidz_uf(1:nz,0:nx,1:ny+1),dxidy_uf(1:nz,0:nx,1:ny+1),dsdx(1:nz,0:nx,1:ny+1),dxidx_uf(1:nz,0:nx,1:ny+1),rjd_uf(1:nz,0:nx,1:ny+1))
Generating copyout(dsdxi(1:nz,0:nx,1:ny+1))
320, Loop is parallelizable
322, Loop is parallelizable
324, Loop is parallelizable
Accelerator kernel generated
320, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
322, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
341, Generating update host(h_pbl(:,:))
342, Generating update host(prandtl_t(:,:,:))
343, Generating update host(l_mix(:,:,:))
344, Generating update host(eddykm(:,:,:))
345, Generating update host(eddykh(:,:,:))
348, Generating update device(h_pbl(:,:))
349, Generating update device(prandtl_t(:,:,:))
350, Generating update device(l_mix(:,:,:))
351, Generating update device(eddykm(:,:,:))
352, Generating update device(eddykh(:,:,:))
353, Generating copyin(dy_pf(0:nx+1,1:ny),dy_vf(0:nx+1,1:ny),factor(:nz,:nx+1,1:ny+1),dsdx(1:nz,-1:nx+1,1:ny+1),dsdz(0:nz,0:nx+1,1:ny+1),dyidz_vf(1:nz,0:nx+1,1:ny),dsdy(1:nz,0:nx+1,1:ny),dyidy_vf(1:nz,0:nx+1,1:ny),dyidx_vf(1:nz,0:nx+1,1:ny),rjd_vf(1:nz,0:nx+1,1:ny))
Generating copyout(dsdyi(1:nz,0:nx+1,1:ny))
355, Loop is parallelizable
357, Loop is parallelizable
359, Loop is parallelizable
Accelerator kernel generated
355, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
357, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
376, Generating update host(h_pbl(:,:))
377, Generating update host(prandtl_t(:,:,:))
378, Generating update host(l_mix(:,:,:))
379, Generating update host(eddykm(:,:,:))
380, Generating update host(eddykh(:,:,:))
383, Generating update device(h_pbl(:,:))
384, Generating update device(prandtl_t(:,:,:))
385, Generating update device(l_mix(:,:,:))
386, Generating update device(eddykm(:,:,:))
387, Generating update device(eddykh(:,:,:))
388, Generating copyin(dz_pf(1:nz-1,1:nx+1,1:ny+1),dz_ph(1:nz-1,1:nx+1,1:ny+1),factor(:nz,1:nx+1,1:ny+1),dsdx(1:nz,0:nx+1,1:ny+1),dsdy(1:nz,1:nx+1,0:ny+1),dsdz(1:nz-1,1:nx+1,1:ny+1),dzidz_ph(1:nz-1,1:nx+1,1:ny+1),dzidy_ph(1:nz-1,1:nx+1,1:ny+1),dzidx_ph(1:nz-1,1:nx+1,1:ny+1),rjd_ph(1:nz-1,1:nx+1,1:ny+1))
Generating copyout(dsdzi(0:nz,1:nx+1,1:ny+1))
390, Loop is parallelizable
392, Loop is parallelizable
Accelerator kernel generated
390, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
392, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
394, Loop is parallelizable
413, Generating update host(h_pbl(:,:))
414, Generating update host(prandtl_t(:,:,:))
415, Generating update host(l_mix(:,:,:))
416, Generating update host(eddykm(:,:,:))
417, Generating update host(eddykh(:,:,:))
pbl_run_kernels_sgs:
1535, Generating update device(h_pbl(:,:))
1536, Generating update device(prandtl_t(:,:,:))
1537, Generating update device(l_mix(:,:,:))
1538, Generating update device(eddykm(:,:,:))
1539, Generating update device(eddykh(:,:,:))
1540, Generating copyin(qa(nz_mn:nz_mx,nx_mn:nx_mx,ny_mn:nx_mx,id_qke),dens_ptb(nz_mn:nz_mx,nx_mn:nx_mx,ny_mn:nx_mx),dens_ref_f(nz_mn:nz_mx,nx_mn:nx_mx,ny_mn:nx_mx))
Generating copyout(rqke(nz_mn:nz_mx,nx_mn:nx_mx,ny_mn:nx_mx))
1542, Loop is parallelizable
1544, Loop is parallelizable
1546, Loop is parallelizable
Accelerator kernel generated
1542, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
1544, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
1553, Generating update host(h_pbl(:,:))
1554, Generating update host(prandtl_t(:,:,:))
1555, Generating update host(l_mix(:,:,:))
1556, Generating update host(eddykm(:,:,:))
1557, Generating update host(eddykh(:,:,:))
1559, Generating update device(h_pbl(:,:))
1560, Generating update device(prandtl_t(:,:,:))
1561, Generating update device(l_mix(:,:,:))
1562, Generating update device(eddykm(:,:,:))
1563, Generating update device(eddykh(:,:,:))
1564, Generating copyout(tend_rqke_v(:nz,:nx,:ny))
1566, Loop is parallelizable
1568, Loop is parallelizable
1570, Loop is parallelizable
Accelerator kernel generated
1566, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
1568, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
1570, Memory zero idiom, loop replaced by call to __c_mzero8
1576, Generating update host(h_pbl(:,:))
1577, Generating update host(prandtl_t(:,:,:))
1578, Generating update host(l_mix(:,:,:))
1579, Generating update host(eddykm(:,:,:))
1580, Generating update host(eddykh(:,:,:))
pbl_run_kernels_rqke:
1598, Generating update device(h_pbl(:,:))
1599, Generating update device(prandtl_t(:,:,:))
1600, Generating update device(l_mix(:,:,:))
1601, Generating update device(eddykm(:,:,:))
1602, Generating update device(eddykh(:,:,:))
1603, Generating copyin(rjd_pf(1:nz,1:nx,1:ny))
Generating copy(tend_rqke_v(:nz,:nx,:ny))
1605, Loop is parallelizable
1607, Loop is parallelizable
1609, Loop is parallelizable
Accelerator kernel generated
1605, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
1607, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
1615, Generating update host(h_pbl(:,:))
1616, Generating update host(prandtl_t(:,:,:))
1617, Generating update host(l_mix(:,:,:))
1618, Generating update host(eddykm(:,:,:))
1619, Generating update host(eddykh(:,:,:))
pbl_run_kernels_tend_mom:
1646, Generating enter data create(u(:,:,:),l(:,:,:),d(:,:,:),tend_z(:,:,:))
1650, Generating update device(tend_z(:,:,:))
1651, Generating update device(h_pbl(:,:))
1652, Generating update device(prandtl_t(:,:,:))
1653, Generating update device(d(:,:,:))
1654, Generating update device(l_mix(:,:,:))
1655, Generating update device(l(:,:,:))
1656, Generating update device(eddykm(:,:,:))
1657, Generating update device(u(:,:,:))
1658, Generating update device(eddykh(:,:,:))
1659, Generating copy(tend_mom_z_v_l(:,:,:))
Generating copyin(rjd_ph(:,:,:))
Generating copy(tmp$r965)
Generating copyin(…inline(:,:,:))
Generating copy(tmp$r945,tmp$r942(:),d(:,:,:),tmp$r)
Generating copy(.g0193)
Generating copy(l(:,:,:z_e_305-1+1),tmp$r(:),tmp$r942,u(:,:,:),tmp$r945(:))
Generating copyin(…inline(:,:,:,:))
Generating copy(tend_rqke_v(:,:,:))
Generating copyin(dzidz_pf(:,:,:),eddykm(:,:,:),dens_ptb(:,:,:),dens_ref_f(:,:,:),qa(:,:,:,:))
Generating copy(tend_z(:,:,:))
Generating copyin(vel_z(:,:,:),dzidz_ph(:,:,:))
Generating copy(tmp$r965(:))
1663, Accelerator kernel generated
1665, !$acc loop vector(128) ! threadidx%x
1666, !$acc loop vector(128) ! threadidx%x
1668, !$acc loop vector(128) ! threadidx%x
1689, !$acc loop vector(128) ! threadidx%x
1690, !$acc loop vector(128) ! threadidx%x
1692, !$acc loop vector(128) ! threadidx%x
1665, Copy in and copy out of l in call to make_matrix_tke
Copy in and copy out of d in call to make_matrix_tke
Copy in and copy out of u in call to make_matrix_tke
make_matrix_tke inlined, size=44, file pbl_shared.f90 (1432)
1665, Scalar last value needed after loop for …inline at line 1665
1665, Loop is parallelizable
1666, Copy in of l in call to tridiag_lu_decompose
Copy in and copy out of d in call to tridiag_lu_decompose
Copy in of u in call to tridiag_lu_decompose
tridiag_lu_decompose inlined, size=12, file pbl_shared.f90 (1324)
1666, Complex loop carried dependence of …inline prevents parallelization
Loop carried dependence due to exposed use of …inline(:) prevents parallelization
1666, Loop is parallelizable
1668, Copy in of l in call to tridiag_solve
Copy in and copy out of d in call to tridiag_solve
tridiag_solve inlined, size=9, file pbl_shared.f90 (1345)
1668, Loop carried dependence of …inline prevents parallelization
Loop carried backward dependence of …inline prevents vectorization
1668, Loop is parallelizable
1677, Loop carried scalar dependence for a1,a2,a2,e1,e2,e2 at line 1678
1689, Copy in and copy out of l in call to make_matrix_w
Copy in and copy out of d in call to make_matrix_w
Copy in and copy out of u in call to make_matrix_w
make_matrix_w inlined, size=44, file pbl_shared.f90 (1471)
1689, Loop carried scalar dependence for …inline at line 1689
1689, Loop is parallelizable
1690, Copy in of l in call to tridiag_lu_decompose
Copy in and copy out of d in call to tridiag_lu_decompose
Copy in of u in call to tridiag_lu_decompose
Copy in and copy out of tend_z in call to tridiag_lu_decompose
tridiag_lu_decompose inlined, size=12, file pbl_shared.f90 (1324)
1690, Complex loop carried dependence of …inline prevents parallelization
Loop carried dependence due to exposed use of …inline(:) prevents parallelization
1690, Loop is parallelizable
1692, Copy in of l in call to tridiag_solve
Copy in and copy out of d in call to tridiag_solve
Copy in and copy out of tend_z in call to tridiag_solve
tridiag_solve inlined, size=9, file pbl_shared.f90 (1345)
1692, Loop carried dependence of …inline prevents parallelization
Loop carried backward dependence of …inline prevents vectorization
1692, Loop is parallelizable
1694, Loop is parallelizable
1703, Generating update host(tend_z(:,:,:))
1704, Generating update host(h_pbl(:,:))
1705, Generating update host(prandtl_t(:,:,:))
1706, Generating update host(d(:,:,:))
1707, Generating update host(l_mix(:,:,:))
1708, Generating update host(l(:,:,:))
1709, Generating update host(eddykm(:,:,:))
1710, Generating update host(u(:,:,:))
1711, Generating update host(eddykh(:,:,:))
1712, Generating exit data delete(u(:,:,:),l(:,:,:),d(:,:,:),tend_z(:,:,:))
The linker command and its error message follow. Please note that `nm pbl_shared.o`
doesn’t show any references to cudaMalloc/cudaFree:
pgf90 -Mcuda=cc3x,6.5 -ta=nvidia,cc3x,time -Mipa=inline,reshape -Minfo=accel,inline -Mneginfo -byteswapio -Mmpi=mpich -o ideal_make_grid -L/home/michel/lib/nusdas/lib -L/home/michel/lib/netcdf3/lib -L/home/michel/asuca/hybrid/asuca-kij/build/gpu/Framework/…/HybridSources ideal_make_grid.o prepsub.o parameter_control.o restart_vars.o vertical.o checktool_vars.o sf_parameters.o -lasuca -lnusdas -lnwp -lnetcdf
IPA inhibited: no main routine
nvlink error : Undefined reference to ‘cudaMalloc’ in ‘/home/michel/asuca/hybrid/asuca-kij/build/gpu/Framework/…/HybridSources/libasuca.a:pbl_shared.o’
nvlink error : Undefined reference to ‘cudaFree’ in ‘/home/michel/asuca/hybrid/asuca-kij/build/gpu/Framework/…/HybridSources/libasuca.a:pbl_shared.o’
pgacclnk: child process exit status 2: /opt/pgi/linux86-64/15.1/bin/pgnvd
child process exit status 2: /opt/pgi/linux86-64/15.1/bin/pgacclnk