Compiling Python wrappers with F2PY and CUDA Fortran

Hey guys,

I’m having some trouble using f2py with the pgi fortran compilers.

cpu_wrapper.f90

! Host-side wrapper: copies Z into a device array, launches the do_stuff
! kernel on it, and copies the device array back into Z.
!
! Arguments:
!   Z  - host array (assumed shape); overwritten with the device result.
!   ni - number of elements allocated on the device and passed to the kernel.
!
! NOTE(review): Z is declared intent(out) but is read by `Z_d = Z` below. An
! intent(out) dummy is undefined on entry, so the kernel's input is not
! guaranteed to be the caller's data; intent(inout) looks like what was
! meant. Confirm before changing -- it alters the signature f2py generates.
! NOTE(review): do_stuff declares an assumed-shape dummy, which requires an
! explicit interface at the call site; none is visible in this snippet --
! confirm how the kernel is made known to this routine.
subroutine wrap_stuff(Z, ni)
  use cudafor

  implicit none

  real, dimension(:), intent(out) :: Z
  integer, intent(in) :: ni

  ! Launch configuration handed to the kernel call in the <<< >>> chevrons.
  type(dim3) :: grid, tBlock

  ! Device-resident copy of Z. There is no explicit deallocate in this routine.
  real, device, dimension(:), allocatable :: Z_d

  ! NOTE(review): Z_d gets ni elements while Z's extent comes from the caller;
  ! the two array assignments below presumably need size(Z) == ni -- confirm.
  allocate(Z_d(ni))

  ! One block of 32 threads.
  tBlock = dim3(32,1,1)
  grid = dim3(1,1,1)

  ! Host array assigned to device array.
  Z_d = Z

  call  do_stuff<<<grid, tBlock>>>(Z_d, ni)

  ! Device array assigned back to the host array.
  Z = Z_d

end subroutine

gpu_code.f90

! Kernel: squares the elements of Z in place.
!
! Arguments:
!   Z - device array (assumed shape); each visited element is replaced by its
!       square.
!   N - upper bound of the element loop.
!
! Each thread starts at its own threadIdx%x and advances by blockDim%x. The
! index does not involve blockIdx, so in a multi-block launch every block
! would visit the same elements.
!
! NOTE(review): N is a scalar dummy without the `value` attribute -- confirm
! against the CUDA Fortran guide that a host integer may be passed this way.
attributes(global) subroutine do_stuff(Z, N)

  use cudafor

  implicit none

  real, dimension(:), device :: Z
  integer, intent(in) :: N

  integer :: i

  do i = threadIdx%x, N, blockDim%x
    ! Integer exponent on purpose: the Fortran standard prohibits raising a
    ! negative real to a real power (2.0), whereas an integer power is defined
    ! for a base of either sign.
    Z(i) = Z(i) ** 2
  enddo

end subroutine do_stuff

command line: f2py --fcompiler=pg -m test -c cpu_wrapper.f90 gpu_code.f90 --f90flags="-Mcuda -fPIC"

everything compiles fine, however running ldd -r on test.cpython-37m-x86_64-linux-gnu.so yields the following:

ldd -r test.cpython-37m-x86_64-linux-gnu.so 
	linux-vdso.so.1 (0x00007ffd3058d000)
	libpgf90rtl.so => /opt/pgi/linux86-64/2019/lib/libpgf90rtl.so (0x00007f3c9e748000)
	libpgf90.so => /opt/pgi/linux86-64/2019/lib/libpgf90.so (0x00007f3c9e1b0000)
	libpgf90_rpm1.so => /opt/pgi/linux86-64/2019/lib/libpgf90_rpm1.so (0x00007f3c9dfae000)
	libpgf902.so => /opt/pgi/linux86-64/2019/lib/libpgf902.so (0x00007f3c9dd9b000)
	libpgftnrtl.so => /opt/pgi/linux86-64/2019/lib/libpgftnrtl.so (0x00007f3c9db5c000)
	libpgatm.so => /opt/pgi/linux86-64/2019/lib/libpgatm.so (0x00007f3c9d953000)
	libpgkomp.so => /opt/pgi/linux86-64/2019/lib/libpgkomp.so (0x00007f3c9d750000)
	libomp.so => /home/nick/anaconda3/lib/libomp.so (0x00007f3c9ecc2000)
	libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f3c9d531000)
	libpgmath.so => /opt/pgi/linux86-64/2019/lib/libpgmath.so (0x00007f3c9d11c000)
	libpgc.so => /opt/pgi/linux86-64/2019/lib/libpgc.so (0x00007f3c9cdc3000)
	librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007f3c9cbbb000)
	libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f3c9c81d000)
	libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f3c9c42c000)
	libgcc_s.so.1 => /home/nick/anaconda3/lib/libgcc_s.so.1 (0x00007f3c9ec83000)
	/lib64/ld-linux-x86-64.so.2 (0x00007f3c9eb72000)
	libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f3c9c228000)
undefined symbol: PyExc_ValueError	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyCapsule_Type	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: _Py_NoneStruct	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyExc_AttributeError	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyType_Type	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyExc_RuntimeError	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyExc_TypeError	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyExc_ImportError	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyComplex_Type	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyLong_AsLong	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyDict_GetItemString	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyObject_GetAttrString	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyMem_Free	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyType_IsSubtype	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyModule_GetDict	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyErr_NoMemory	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyDict_SetItemString	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyUnicode_FromFormat	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: __cudaRegisterFunction	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyArg_ParseTupleAndKeywords	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: _PyObject_New	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyNumber_Long	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyBytes_FromString	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyErr_Format	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyMem_Malloc	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: Py_BuildValue	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyImport_ImportModule	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: __pgiLaunchKernelFromStub	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyUnicode_FromString	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PySequence_Check	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: pgf90_dev_allocated_i8	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyErr_Clear	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: pgf90_dev_copyin	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: __pgiLaunchKernel	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyOS_snprintf	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyDict_New	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyErr_SetString	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyCapsule_New	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: pgf90_dev_dealloc03_i8	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: __pgi_cuda_register_fat_binaryA	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyObject_SetAttrString	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyCapsule_GetPointer	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: pgf90_dev_copyout	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyObject_Free	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PySequence_GetItem	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyErr_NewException	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyModule_Create2	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: pgf90_dev_alloc04_i8	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyErr_Occurred	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyObject_GenericGetAttr	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: pgf90_dev_dealloc_mbr03_i8	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyDict_DelItemString	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyErr_Print	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyUnicode_Concat	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: PyUnicode_FromStringAndSize	(./test.cpython-37m-x86_64-linux-gnu.so)

clearly, the python3 libraries are missing, so adding this with the following command line:

f2py --fcompiler=pg --f90exec=/opt/pgi/linux86-64/2019/bin/pgfortran -m test -c cpu_wrapper.f90 gpu_code.f90 --f90flags="-Mcuda -fPIC" /home/nick/anaconda3/lib/libpython3.so

yields:

ldd -r test.cpython-37m-x86_64-linux-gnu.so 
	linux-vdso.so.1 (0x00007fff510fb000)
	libpython3.so => /home/nick/anaconda3/lib/libpython3.so (0x00007f7c79f56000)
	libpgf90rtl.so => /opt/pgi/linux86-64/2019/lib/libpgf90rtl.so (0x00007f7c7990c000)
	libpgf90.so => /opt/pgi/linux86-64/2019/lib/libpgf90.so (0x00007f7c79374000)
	libpgf90_rpm1.so => /opt/pgi/linux86-64/2019/lib/libpgf90_rpm1.so (0x00007f7c79172000)
	libpgf902.so => /opt/pgi/linux86-64/2019/lib/libpgf902.so (0x00007f7c78f5f000)
	libpgftnrtl.so => /opt/pgi/linux86-64/2019/lib/libpgftnrtl.so (0x00007f7c78d20000)
	libpgatm.so => /opt/pgi/linux86-64/2019/lib/libpgatm.so (0x00007f7c78b17000)
	libpgkomp.so => /opt/pgi/linux86-64/2019/lib/libpgkomp.so (0x00007f7c78914000)
	libomp.so => /home/nick/anaconda3/lib/libomp.so (0x00007f7c79e81000)
	libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f7c786f5000)
	libpgmath.so => /opt/pgi/linux86-64/2019/lib/libpgmath.so (0x00007f7c782e0000)
	libpgc.so => /opt/pgi/linux86-64/2019/lib/libpgc.so (0x00007f7c77f87000)
	librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007f7c77d7f000)
	libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f7c779e1000)
	libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f7c775f0000)
	libgcc_s.so.1 => /home/nick/anaconda3/lib/libgcc_s.so.1 (0x00007f7c79e40000)
	libpython3.7m.so.1.0 => /home/nick/anaconda3/lib/./libpython3.7m.so.1.0 (0x00007f7c77287000)
	/lib64/ld-linux-x86-64.so.2 (0x00007f7c79d36000)
	libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f7c77083000)
	libutil.so.1 => /lib/x86_64-linux-gnu/libutil.so.1 (0x00007f7c76e80000)
undefined symbol: __cudaRegisterFunction	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: __pgiLaunchKernelFromStub	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: pgf90_dev_allocated_i8	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: pgf90_dev_copyin	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: __pgiLaunchKernel	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: pgf90_dev_dealloc03_i8	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: __pgi_cuda_register_fat_binaryA	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: pgf90_dev_copyout	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: pgf90_dev_alloc04_i8	(./test.cpython-37m-x86_64-linux-gnu.so)
undefined symbol: pgf90_dev_dealloc_mbr03_i8	(./test.cpython-37m-x86_64-linux-gnu.so)

I can’t keep adding individual libraries, so I was wondering if anyone has any ideas?

This code works fine when used from a program file and an executable is created the “usual” way.

I’ve not used f2py before so may be of limited help. But the second error appears to be that the CUDA Fortran runtime libraries aren’t getting added to the link. Presumably pgfortran is being used to link the code given the default runtime libraries are there, but I’m wondering if the --f90flags are used only for compilation and not added to the link? The CUDA Fortran runtime gets added only if “-Mcuda” is on the link line so if not, then this would explain the error.

Does f2py have a verbose mode where we can see what commands are being executed? From the “Using F2PY” page of the NumPy v1.23 manual, I see a “--debug-capi” flag, but I’m not sure if this gives the verbose output, in particular the link command being used.

I’m not seeing any link-specific flag (except “-l”), but since you don’t want to add any libraries, that won’t be helpful. Maybe the “--opt” option could be used, assuming it’s applied to both the compile and link?

Hey! Thanks for the reply.

Running f2py in verbose gives the following:

f2py --verbose --fcompiler=pg -m test -c cpu_wrapper.f90 gpu_code.f90 --f90flags="-Mcuda"

running build
running config_cc
unifing config_cc, config, build_clib, build_ext, build commands --compiler options
running config_fc
unifing config_fc, config, build_clib, build_ext, build commands --fcompiler options
running build_src
build_src
building extension "test" sources
f2py options: []
f2py:> /tmp/tmprcvpt6w2/src.linux-x86_64-3.7/testmodule.c
creating /tmp/tmprcvpt6w2/src.linux-x86_64-3.7
Reading fortran codes...
	Reading file 'cpu_wrapper.f90' (format:free)
Line #21 in cpu_wrapper.f90:"  call  do_stuff<<<grid, tBlock>>>(Z_d, ni) "
	analyzeline: No name/args pattern found for line.
	Reading file 'gpu_code.f90' (format:free)
crackline: groupcounter=1 groupname={0: '', 1: 'module', 2: 'interface', 3: 'subroutine'}
crackline: Mismatch of blocks encountered. Trying to fix it by assuming "end" statement.
Post-processing...
	Block: test
In: :test:unknown_interface
get_useparameters: no module cudafor info used by unknown_interface
			Block: wrap_stuff
In: :test:cpu_wrapper.f90:wrap_stuff
get_useparameters: no module cudafor info used by wrap_stuff
Post-processing (stage 2)...
Building modules...
	Building module "test"...
		Creating wrapper for Fortran subroutine "wrap_stuff"("wrap_stuff")...
		Constructing wrapper function "wrap_stuff"...
		  z = wrap_stuff(ni)
	Wrote C/API module "test" to file "/tmp/tmprcvpt6w2/src.linux-x86_64-3.7/testmodule.c"
	Fortran 77 wrappers are saved to "/tmp/tmprcvpt6w2/src.linux-x86_64-3.7/test-f2pywrappers.f"
  adding '/tmp/tmprcvpt6w2/src.linux-x86_64-3.7/fortranobject.c' to sources.
  adding '/tmp/tmprcvpt6w2/src.linux-x86_64-3.7' to include_dirs.
copying /home/nick/anaconda3/lib/python3.7/site-packages/numpy/f2py/src/fortranobject.c -> /tmp/tmprcvpt6w2/src.linux-x86_64-3.7
copying /home/nick/anaconda3/lib/python3.7/site-packages/numpy/f2py/src/fortranobject.h -> /tmp/tmprcvpt6w2/src.linux-x86_64-3.7
  adding '/tmp/tmprcvpt6w2/src.linux-x86_64-3.7/test-f2pywrappers.f' to sources.
build_src: building npy-pkg config files
running build_ext
new_compiler returns <class 'distutils.unixccompiler.UnixCCompiler'>
customize UnixCCompiler
customize UnixCCompiler using build_ext
********************************************************************************
<class 'distutils.unixccompiler.UnixCCompiler'>
preprocessor  = ['gcc', '-pthread', '-B', '/home/nick/anaconda3/compiler_compat', '-Wl,--sysroot=/', '-E']
compiler      = ['gcc', '-pthread', '-B', '/home/nick/anaconda3/compiler_compat', '-Wl,--sysroot=/', '-Wsign-compare', '-DNDEBUG', '-g', '-fwrapv', '-O3', '-Wall', '-Wstrict-prototypes']
compiler_so   = ['gcc', '-pthread', '-B', '/home/nick/anaconda3/compiler_compat', '-Wl,--sysroot=/', '-Wsign-compare', '-DNDEBUG', '-g', '-fwrapv', '-O3', '-Wall', '-Wstrict-prototypes', '-fPIC']
compiler_cxx  = ['g++', '-pthread', '-B', '/home/nick/anaconda3/compiler_compat', '-Wl,--sysroot=/']
linker_so     = ['gcc', '-pthread', '-shared', '-B', '/home/nick/anaconda3/compiler_compat', '-L/home/nick/anaconda3/lib', '-Wl,-rpath=/home/nick/anaconda3/lib', '-Wl,--no-as-needed', '-Wl,--sysroot=/']
linker_exe    = ['gcc', '-pthread', '-B', '/home/nick/anaconda3/compiler_compat', '-Wl,--sysroot=/']
archiver      = ['ar', 'rc']
ranlib        = None
libraries     = []
library_dirs  = []
include_dirs  = ['/home/nick/anaconda3/include/python3.7m']
********************************************************************************
customize PGroupFCompiler
find_executable('pgfortran')
Found executable /opt/pgi/linux86-64-llvm/19.10/bin/pgfortran
customize PGroupFCompiler using build_ext
********************************************************************************
<class 'numpy.distutils.fcompiler.pg.PGroupFCompiler'>
version_cmd     = ['/opt/pgi/linux86-64-llvm/19.10/bin/pgfortran', '-V']
compiler_f77    = ['/opt/pgi/linux86-64-llvm/19.10/bin/pgfortran', '-fpic', '-Minform=inform', '-Mnosecond_underscore', '-fast']
compiler_fix    = ['/opt/pgi/linux86-64-llvm/19.10/bin/pgfortran', '-Mfixed', '-Mcuda', '-fpic', '-Minform=inform', '-Mnosecond_underscore', '-fast']
compiler_f90    = ['/opt/pgi/linux86-64-llvm/19.10/bin/pgfortran', '-Mcuda', '-fpic', '-Minform=inform', '-Mnosecond_underscore', '-fast']
linker_so       = ['/opt/pgi/linux86-64-llvm/19.10/bin/pgfortran', '-shared', '-fpic']
archiver        = None
ranlib          = None
linker_exe      = None
version         = LooseVersion ('19.10-0')
libraries       = []
library_dirs    = []
object_switch   = '-o '
compile_switch  = '-c'
include_dirs    = ['/home/nick/anaconda3/include/python3.7m']
********************************************************************************
.....

Using --opt="-Mcuda" instead of --f90flags yields the .so, but still the same issues when running ldd -r test.so

I could try manually specifying all the required libraries that -Mcuda provides (specifying this from f2py does not work for the link stage)… Do you know which libraries -Mcuda proxies for?

Yes, this looks to be the issue in that “-Mcuda” isn’t being added to the link when creating the shared object. If you can figure out how to add “-Mcuda” to the link flags, that would be ideal.

I did find this post on StackOverflow which suggests that you can set the environment variable “LDFLAGS=-Mcuda” to set the f2py linker flags so you may want to try it. Setting NPY_DISTUTILS_APPEND_FLAGS=1 looks necessary as well so it doesn’t overwrite the other linker flags.

If NPY_DISTUTILS_APPEND_FLAGS isn’t functional in your version of f2py (it seems to be numpy specific), then you might need to set LDFLAGS to “-shared -fpic -Mcuda”.

I could try manually specifying all the required libraries that -Mcuda provides (specifying this from f2py does not work for the link stage)… Do you know which libraries -Mcuda proxies for?

You can, but it’s a little more complex than just adding the libraries. In addition, the “-Mcuda” flag tells the compiler to also run a device code link step when creating the shared object. If you add the libraries by hand, you’ll need to also compile the code with “-Mcuda=nordc” so the device link isn’t required. Though without RDC enabled, some CUDA Fortran features are disabled, such as the ability to call device routines not in the same module or to access device module variables outside of the module in which they’re defined.

Second, the “-Mcuda” flag can use different CUDA versions, selecting the one to use based on the NVIDIA driver version being used, whether “CUDA_HOME” is set, or whether the user has selected a particular CUDA version via “-Mcuda=cudaX.y”. The included libraries can be different depending on the CUDA version being used.

Finally, the libraries can change from release to release, so the exact libraries used are release dependent.

The best way to determine what flags to add, is to run the command: “pgfortran -dryrun -Mcuda=nordc -shared test.o -o libtest.so”. “-dryrun” will show you the commands the compiler driver would execute, but not actually do them. “-v” (verbose) also shows the driver commands, but does perform them.

Here’s the ld command with 19.10 using my local install:

/usr/bin/ld /usr/lib/x86_64-linux-gnu/crti.o /proj/pgi/linux86-64-llvm/19.10/lib/trace_init.o /home/sw/thirdparty/gcc/gcc-9.2.0/linux86-64/lib/gcc/x86_64-pc-linux-gnu/9.2.0/crtbeginS.o --eh-frame-hdr -m elf_x86_64 -dynamic-linker /lib64/ld-linux-x86-64.so.2 /proj/pgi/linux86-64-llvm/19.10/lib/pgi.ld -L/proj/pgi/linux86-64-llvm/19.10/lib -L/usr/lib64 -L/home/sw/thirdparty/gcc/gcc-9.2.0/linux86-64/lib/gcc/x86_64-pc-linux-gnu/9.2.0 test2.o -rpath /proj/pgi/linux86-64-llvm/19.10/lib -rpath /proj/pgi/linux86-64-llvm/2019/cuda/10.1/lib64 -rpath /home/sw/thirdparty/gcc/gcc-9.2.0/linux86-64/lib/gcc/x86_64-pc-linux-gnu/9.2.0/../../../../lib64 -o libtest.so -shared /proj/pgi/linux86-64-llvm/19.10/lib/pgiloc.ld -L/home/sw/thirdparty/gcc/gcc-9.2.0/linux86-64/lib/gcc/x86_64-pc-linux-gnu/9.2.0/../../../../lib64 -lcudafor101 -lcudafor -lcudaforblas101 /proj/pgi/linux86-64-llvm/19.10/lib/cuda_init_register_end.o -L/proj/pgi/linux86-64-llvm/2019/cuda/10.1/lib64 -lcudadevrt -lcudart -lcudafor2 -lpgf90rtl -lpgf90 -lpgf90_rpm1 -lpgf902 -lpgf90rtl -lpgftnrtl -lpgatm -lpgkomp -lomp -as-needed -lomptarget -no-as-needed -lpthread --start-group -lpgmath -lpgc --end-group -lrt -lpthread -lm -lgcc -lc -lgcc -lgcc_s /home/sw/thirdparty/gcc/gcc-9.2.0/linux86-64/lib/gcc/x86_64-pc-linux-gnu/9.2.0/crtendS.o /usr/lib/x86_64-linux-gnu/crtn.o

You can then compare this to another dryrun without -Mcuda=nordc to see the added library paths, libraries and objects.

So in my case where CUDA 10.1 is being used, I’d want to add “-L/proj/pgi/linux86-64-llvm/2019/cuda/10.1/lib64 -lcudafor101 -lcudafor -lcudaforblas101 /proj/pgi/linux86-64-llvm/19.10/lib/cuda_init_register_end.o -lcudadevrt -lcudart -lcudafor2”

Matt you fantastic beast!

writing the following:

LDFLAGS="-Mcuda" f2py --verbose --fcompiler=pg -m test -c cpu_wrapper.f90 gpu_code.f90 --f90flags="-Mcuda" /home/nick/anaconda3/lib/libpython3.so

worked like a charm and imports to python!

It’s a bit odd that I need to manually link to libpython3.so in order to get the Python methods recognised, would have thought f2py handles this. Setting NPY_DISTUTILS_APPEND_FLAGS=1 didn’t seem to help in this case, but I think I’ll post this issue over at the Numpy github repository to bring this to their attention.

many thanks again!