Undefined reference to 'cudaMalloc' in 'lib/liblibsensei.a:euler_bc_ghost.F90.o'

I have a problem that has frustrated me for nearly a month, and I still cannot figure it out. A snippet of the code is attached here:

!============================= set_ghost_bc_kmax =============================80
!
! Calls the boundary routine set_bc_k once for every (i,j) pair in the index
! window given by bound, handing it k-sections of the solution and grid
! arrays that run from khigh down to klow (stride korder = -1).
!
! Arguments:
!   soln   (inout) solution block; sections of rho, vel, p and temp are passed
!                  to set_bc_k, and molecular_weight is summed into molweight
!   grid   (in)    grid block; supplies k_cells, kmax, grid_vars%volume and
!                  grid_vars%zeta_n
!   n_mmtm (in)    forwarded unchanged to set_bc_k
!   bound  (in)    supplies indx_min, indx_max and bc_label
!
! NOTE(review): n_ghost_cells is referenced below but is not imported by any
! use statement visible here, and no implicit none is visible -- presumably
! both come from an enclosing scope; confirm.
!
!=============================================================================80
subroutine set_ghost_bc_kmax( soln, grid, n_mmtm, bound )

use set_precision, only : dp
use grid_derived_type, only : grid_block, boundary_t
use soln_derived_type, only : soln_block
use set_inputs, only : bc_order

type(grid_block), intent(in) :: grid
type(soln_block), intent(inout) :: soln
integer, intent(in) :: n_mmtm
type(boundary_t), intent(in) :: bound

! length is only consumed by the commented-out copy-in/copy-out code below.
integer :: i, j, k, klow, khigh, bclow, bchigh, korder, length
!$acc declare create(khigh, klow, korder)

!real(dp), dimension(n_ghost_cells(3)-1 + max(2, bc_order)+1) &
! :: rho, p, temp
!real(dp), dimension(n_mmtm, n_ghost_cells(3)-1 + max(2, bc_order)+1) &
! :: vel
!real(dp), dimension(n_ghost_cells(1)-1 + max(2, bc_order)) &
! :: vol

! Sum of soln%molecular_weight, computed once before the loops.
real(dp) :: molweight

! NOTE(review): initialising at declaration gives sign_s the save attribute.
! It is never assigned in this routine, only passed to set_bc_k.
integer :: sign_s = 1

continue

! Offsets that set_bc_k receives as its bclow/bchigh arguments.
bclow = 1 - (n_ghost_cells(3)-1)
bchigh = max( 2, bc_order )

! One more than the element count of khigh:klow:korder (bchigh - bclow + 1).
length = bchigh - bclow + 2

! k-range of the sections passed to set_bc_k, traversed from khigh to klow.
klow = grid%k_cells-bchigh+1
khigh = grid%k_cells-bclow+1
korder = -1

! k index at which grid%grid_vars%zeta_n is sampled.
k = grid%kmax

molweight = sum(soln%molecular_weight)

! NOTE(review): two data regions are opened below (the one with clauses and
! the bare "!$acc data" after it), but only one "!$acc end data" is visible at
! the end of the routine -- check the pairing.
! NOTE(review): when OMP_LOOP is set, the OpenMP directive lands between
! "!$acc loop independent" and the do loop it applies to -- confirm intent.
!$acc data present(soln, soln%rho, soln%vel, soln%p, soln%temp, &
!$acc soln%molecular_weight, grid%grid_vars%volume, &
!$acc grid%grid_vars%zeta_n, grid, grid%grid_vars) &
!$acc copyin(bound, bclow, bchigh, n_mmtm)

!$acc data
!$acc kernels
!$acc loop independent
#if OMP_LOOP
!$OMP PARALLEL DO PRIVATE(i)
#endif
do j = bound%indx_min(2),bound%indx_max(2)
!!$acc loop independent vector private(rho, vel, p, temp, molweight, vol)
!$acc loop independent vector
do i = bound%indx_min(1),bound%indx_max(1)
!rho(1:length) = soln%rho(i,j,high+1:low:order)
!vel(1:n_mmtm,1:length) = soln%vel(1:n_mmtm,i,j,high+1:low:order)
!p(1:length) = soln%p(i,j,high+1:low:order)
!temp(1:length) = soln%temp(i,j,high+1:low:order)

!molweight = sum(soln%molecular_weight)
!vol = grid%grid_vars%volume(i,j,high:low:order)
! The actual arguments are array sections whose bounds and stride are
! run-time variables. Per the replies in this thread, such sections are
! presumably passed through compiler-generated temporaries, which is where
! the device-side cudaMalloc/cudaFree references would come from -- confirm.
! NOTE(review): vel is sliced with a literal 1:3 although n_mmtm is also
! passed; confirm the two always agree.
call set_bc_k(bound%bc_label, &
soln%rho(i,j,khigh:klow:korder), &
soln%vel(1:3,i,j,khigh:klow:korder), &
soln%p(i,j,khigh:klow:korder), &
soln%temp(i,j,khigh:klow:korder), &
molweight, &
grid%grid_vars%volume(i,j,khigh:klow:korder), &
grid%grid_vars%zeta_n(1:3,i,j,k), &
bclow, &
bchigh, &
n_mmtm,i,j,bound, sign_s)

!soln%rho(i,j,high+1:low:order) = rho(1:length)
!soln%vel(1:n_mmtm,i,j,high+1:low:order) = vel(1:n_mmtm,1:length)
!soln%p(i,j,high+1:low:order) = p(1:length)
!soln%temp(i,j,high+1:low:order) = temp(1:length)
!molweight = sum(soln%molecular_weight)

end do
end do
#if OMP_LOOP
!$OMP END PARALLEL DO
#endif
!$acc end kernels
!$acc end data

end subroutine set_ghost_bc_kmax

In this snippet, if I use variables for the bounds of the array sections passed to set_bc_k, e.g. “soln%rho(i,j,khigh:klow:korder)”, then the build fails at link time with these errors:
"nvlink error : Undefined reference to ‘cudaMalloc’ in ‘lib/liblibsensei.a:euler_bc_ghost.F90.o’
nvlink error : Undefined reference to ‘cudaFree’ in ‘lib/liblibsensei.a:euler_bc_ghost.F90.o’"

However, if I hard-code the index values, e.g. “soln%rho(i,j,13:9:-1)”, then it compiles and links.

But why? Thanks a lot!

I just put together a very simple test code, compiled it, and got the same errors. Thank you very much!

module test  ! minimal reproducer: holds the one routine called from the OpenACC kernels region
contains
subroutine test_para_inout(jmax,vel_i,vel_ir)
!$acc routine seq
implicit none
! Sets vel_ir(3) = vel_i(3) + 100; no other element is read or written.
integer :: jmax  ! never referenced in the body; declared without an intent
!real(8), dimension(-100:-98), intent(inout) :: vel_i
!real(8), dimension(-100:-98), intent(inout) :: vel_ir
real(8), dimension(2:3), intent(inout) :: vel_i  ! only read in this routine
real(8), dimension(2:3), intent(inout) :: vel_ir  ! only element 3 is assigned
! With bounds 2:3, index 3 is the second element of each two-element dummy.
vel_ir(3) = vel_i(3) + 100
! NOTE: keep this listing's line count unchanged; the compiler log below cites its line numbers.
end subroutine test_para_inout
end module test

program routine_para_inout  ! driver for the nvlink cudaMalloc/cudaFree reproducer
use test
implicit none
! NOTE: keep this listing's line count unchanged; the compiler log below cites its line numbers.
real(8), dimension(3,4) :: vel  ! input values, filled in the first kernels region
real(8), dimension(3,4) :: vel_r  ! results: zeroed first, then updated through test_para_inout
integer :: i, j
integer :: imax, jmax
! Loop extents; they match the declared 3x4 shape of vel and vel_r.
imax = 3
jmax = 4
! Both kernels regions below sit inside this single, clause-free data region.
!$acc data
! First region: vel(i,j) = j + i**2 and vel_r(i,j) = 0.
!$acc kernels
!$acc loop
do j = 1, jmax
!$acc loop
do i = 1, imax
vel(i,j) = j+i**2
vel_r(i,j) = 0.0D0
enddo
enddo
!$acc end kernels
! Second region: the section bounds jmax-2:jmax-1 are run-time expressions, unlike the literal 1:3 variant.
!$acc kernels
!$acc loop independent
do i = 1, imax
!$acc loop independent
do j = 1, jmax
!call test_para_inout(jmax,vel(i,1:3),vel_r(i,1:3))
call test_para_inout(jmax,vel(i,jmax-2:jmax-1),vel_r(i,jmax-2:jmax-1))
enddo
enddo
!$acc end kernels
! The call does not depend on j, so every j iteration repeats the same update for row i.
!$acc end data
! List-directed print; the routine above assigns column 3 of the section, so vel_r(3,2) should still be 0.
write(*,*) vel_r(3,2)

end program routine_para_inout


[weich97@thermisto Fortran_Test]$ pgfortran -acc -Minfo=accel routine_para_inout.f90 -o routine_para_inout
test_para_inout:
3, Generating acc routine seq
routine_para_inout:
32, Generating copyout(vel(:,:),vel_r(:,:))
34, Loop is parallelizable
36, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
34, !$acc loop gang, vector(4) ! blockidx%y threadidx%y
36, !$acc loop gang, vector(32) ! blockidx%x threadidx%x
43, Generating copy(jmax)
Generating copy(tmp$r,tmp$r(:),tmp$r4,tmp$r4(:),vel(:,:),vel_r(:,:))
45, Loop is parallelizable
47, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
45, !$acc loop gang, vector(32) ! blockidx%x threadidx%x
47, !$acc loop gang ! blockidx%y
49, !$acc loop seq
nvlink error : Undefined reference to ‘cudaMalloc’ in ‘/tmp/pgfortranWo9fqXik6n7W.o’
nvlink error : Undefined reference to ‘cudaFree’ in ‘/tmp/pgfortranWo9fqXik6n7W.o’
pgacclnk: child process exit status 2: /aoe/pgi165/linux86-64/16.5/bin/pgnvd

Hi Sunsincer97,

This was a known problem in PGI 16.5, where compiler-generated temp arrays in device code weren’t supported. This support was added in PGI 16.9.

Even though the code will now link, you may want to consider reworking your code so the temp array isn’t needed. Here the problem is that you’re passing a non-contiguous section of a 2-D array (a slice along its second dimension) to a routine that treats it as a contiguous 1-D array.

-Mat

Hi Mat,

Thank you very much for your reply! Now I can get the simple code working (the integer dummy argument of the routine test_para_inout should have had the intent(in) attribute, but I forgot it), but the CFD code still does not work. I guess there may be some other errors in that code; I’ll try to figure it out.

[code]module test
  !> Minimal OpenACC reproducer module: one sequential device routine that
  !> receives two-element array arguments whose bounds depend on a run-time
  !> integer.
  implicit none
  private
  public :: test_para_inout
contains
  !> Write vel_i(jmax1-2) + 100 into vel_ir(jmax1-2).
  !>
  !> Only the first element of each dummy (index jmax1-2) is touched; the
  !> second element of vel_ir keeps the value it had on entry.
  !>
  !> jmax1  : fixes the dummy bounds to jmax1-2:jmax1-1.
  !> vel_i  : source values; only read, hence intent(in).
  !> vel_ir : destination; element jmax1-2 is overwritten.
  subroutine test_para_inout(jmax1,vel_i,vel_ir)
    !$acc routine seq
    implicit none

    integer, intent(in) :: jmax1
    !real(8), dimension(-100:-98), intent(inout) :: vel_i
    !real(8), dimension(-100:-98), intent(inout) :: vel_ir
    real(8), dimension(jmax1-2:jmax1-1), intent(in) :: vel_i
    real(8), dimension(jmax1-2:jmax1-1), intent(inout) :: vel_ir

    !vel_ir(-99) = vel_i(-99) + 100
    vel_ir(jmax1-2) = vel_i(jmax1-2) + 100.0d0

  end subroutine test_para_inout
end module test

program routine_para_inout
  !> Driver for the OpenACC reproducer: fills a 3x4 array on the device, then
  !> calls the device routine test_para_inout on row sections whose bounds are
  !> run-time expressions, and prints one element of the result.
  use test, only: test_para_inout
  implicit none

  real(8), dimension(3,4) :: vel    ! input values, filled in the first kernels region
  real(8), dimension(3,4) :: vel_r  ! results: zeroed first, then updated by test_para_inout
  integer :: i, j
  integer :: imax, jmax
  integer :: jmax1

  imax = 3
  jmax = 4

  ! Separate copy of jmax that is handed to the routine as its bounds argument.
  jmax1 = jmax

  !$acc data

  ! First region: vel(i,j) = j + i**2 and vel_r(i,j) = 0.
  !$acc kernels
  !$acc loop
  do j = 1, jmax
    !$acc loop
    do i = 1, imax
      vel(i,j) = j+i**2
      vel_r(i,j) = 0.0D0
    enddo
  enddo
  !$acc end kernels

  ! Second region: each call passes the row section (i, jmax-2:jmax-1) of both
  ! arrays. The call does not depend on j, so every j iteration repeats the
  ! same update for row i.
  !$acc kernels
  !$acc loop independent
  do i = 1, imax
    !$acc loop independent
    do j = 1, jmax
      ! If the bounds are given as literals, as in the commented-out call
      ! below, the code compiles successfully and returns correct results.
      !call test_para_inout(jmax,vel(i,1:3),vel_r(i,1:3))
      call test_para_inout(jmax1,vel(i,jmax-2:jmax-1),vel_r(i,jmax-2:jmax-1))
    enddo
  enddo
  !$acc end kernels

  !$acc end data

  ! List-directed print. With the routine above this should be
  ! vel(3,2) + 100 = (2 + 3**2) + 100 = 111.
  write(*,*) vel_r(3,2)

end program routine_para_inout[/code]

Best Regards,

Weicheng Xue