! Minimal CUDA Fortran module: a kernel that writes a module variable carrying
! the device attribute, and a device routine that launches that kernel.
module device_test
  implicit none

  ! Module-scope integer declared with the device attribute.
  integer, device :: d_i

contains

  ! Kernel: copies the by-value argument i into the module variable d_i.
  attributes(global) subroutine cu_test(i)
    use cudafor
    implicit none
    integer, intent(in), value :: i

    d_i = i
  end subroutine cu_test

  ! Device routine: launches cu_test with a <<<1, 1>>> execution
  ! configuration, forwarding i unchanged.
  attributes(device) subroutine test(i)
    use cudafor
    implicit none
    integer, intent(in), value :: i

    call cu_test<<<1, 1>>>(i)
  end subroutine test

end module device_test
pgfortran -Minfo=all -g -Kieee -mp -Mcuda=9,cc35,rdc,debug,ptxinfo,keepptx -mcmodel=medium -Mcuda=maxregcount:64 -c test.F90
ptxas test.n001.ptx, line 257; error : Call has wrong number of parameters
ptxas test.n001.ptx, line 257; error : Call has wrong number of parameters
ptxas fatal : Ptx assembly aborted due to errors
PGF90-F-0155-Compiler failed to translate accelerator region (see -Minfo messages): Device compiler exited with error status code (test.F90: 1)
PGF90/x86-64 Linux 18.5-0: compilation aborted
This looks like an issue with using debug information together with dynamic parallelism when using CUDA 9.0 or higher. As a workaround, please remove “-g” and “-Mcuda=debug” from your compiler flags.
Though, do you intend to use dynamic parallelism (i.e., calling a global routine from another device routine)? I ask because it’s rare to find an application that benefits from dynamic parallelism, so I want to make sure this is intentional. Typically, global device routines are only called from host code.
-Mat
% pgfortran -Minfo=all -Kieee -Mcuda=cc70,cuda9.0,debug -Mcuda=maxregcount:64 test.cuf -c -V18.5
ptxas /tmp/pgacccJIzgWkisqaA.ptx, line 243; error : Call has wrong number of parameters
ptxas /tmp/pgacccJIzgWkisqaA.ptx, line 243; error : Call has wrong number of parameters
ptxas fatal : Ptx assembly aborted due to errors
PGF90-F-0155-Compiler failed to translate accelerator region (see -Minfo messages): Device compiler exited with error status code (test.cuf: 1)
PGF90/x86-64 Linux 18.5-0: compilation aborted