I am trying to run a Fortran code that uses both OpenMP (for CPU parallelism) and OpenACC (for GPU offloading). I use a 24-core Intel machine (48 logical CPUs with hyperthreading). I have set export OMP_NUM_THREADS=24,
and if I compile my program with the ifort compiler everything works fine (cores 0-23 are at 100% load).
However, if I compile with nvfortran I get 100% load only on CPUs 0 and 24 (CPU 24 is just core 0 again, seen through hyperthreading). I printed the thread ID and the CPU ID of each thread (the latter obtained through a C call to sched_getcpu()):
me: 0 my cpu: 0
me: 22 my cpu: 24
me: 23 my cpu: 24
me: 11 my cpu: 0
me: 2 my cpu: 0
me: 16 my cpu: 24
me: 3 my cpu: 0
me: 21 my cpu: 24
me: 12 my cpu: 24
me: 1 my cpu: 0
me: 17 my cpu: 24
me: 8 my cpu: 0
me: 20 my cpu: 24
me: 6 my cpu: 0
me: 15 my cpu: 24
me: 4 my cpu: 0
me: 7 my cpu: 0
me: 18 my cpu: 24
me: 10 my cpu: 0
me: 19 my cpu: 24
me: 14 my cpu: 24
me: 9 my cpu: 0
me: 5 my cpu: 0
me: 13 my cpu: 24
How can I convince nvfortran to use proper OpenMP parallelism? I explicitly made sure that the PGI OpenMP runtime (libnvomp.so) is the only OpenMP library linked:
ldd ~/fleur/build.openacc/fleur_MPI
linux-vdso.so.1 => (0x00007ffd243ac000)
libnvToolsExt.so.1 => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/cuda/11.0/lib64/libnvToolsExt.so.1 (0x00002b506d15d000)
libmkl_scalapack_lp64.so => /usr/local/intel/compilers_and_libraries_2020.0.166/linux/mkl/lib/intel64_lin/libmkl_scalapack_lp64.so (0x00002b506d366000)
libmkl_blacs_intelmpi_lp64.so => /usr/local/intel/compilers_and_libraries_2020.0.166/linux/mkl/lib/intel64_lin/libmkl_blacs_intelmpi_lp64.so (0x00002b506dc84000)
libxml2.so.2 => /lib64/libxml2.so.2 (0x00002b506dec6000)
libmkl_intel_lp64.so => /usr/local/intel/compilers_and_libraries_2020.0.166/linux/mkl/lib/intel64_lin/libmkl_intel_lp64.so (0x00002b506e230000)
libmkl_sequential.so => /usr/local/intel/compilers_and_libraries_2020.0.166/linux/mkl/lib/intel64_lin/libmkl_sequential.so (0x00002b506ed9c000)
libmkl_core.so => /usr/local/intel/compilers_and_libraries_2020.0.166/linux/mkl/lib/intel64_lin/libmkl_core.so (0x00002b50703b4000)
libm.so.6 => /lib64/libm.so.6 (0x00002b50746d4000)
libdl.so.2 => /lib64/libdl.so.2 (0x00002b50749d6000)
libmpi_usempif08.so.40 => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/comm_libs/openmpi/openmpi-3.1.5/lib/libmpi_usempif08.so.40 (0x00002b5074bda000)
libmpi_usempi_ignore_tkr.so.40 => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/comm_libs/openmpi/openmpi-3.1.5/lib/libmpi_usempi_ignore_tkr.so.40 (0x00002b5074e06000)
libmpi_mpifh.so.40 => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/comm_libs/openmpi/openmpi-3.1.5/lib/libmpi_mpifh.so.40 (0x00002b507500b000)
libmpi.so.40 => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/comm_libs/openmpi/openmpi-3.1.5/lib/libmpi.so.40 (0x00002b507526a000)
libcublas.so.11 => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/math_libs/11.0/lib64/libcublas.so.11 (0x00002b5075709000)
libcublasLt.so.11 => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/math_libs/11.0/lib64/libcublasLt.so.11 (0x00002b507b559000)
libcudart.so.11.0 => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/cuda/11.0/lib64/libcudart.so.11.0 (0x00002b50866e8000)
libcudafor101.so => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/compilers/lib/libcudafor101.so (0x00002b5086966000)
libcudafor.so => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/compilers/lib/libcudafor.so (0x00002b5086b91000)
libcurand.so.10 => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/math_libs/11.0/lib64/libcurand.so.10 (0x00002b508a2b1000)
libcudaforwrapblas.so => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/compilers/lib/libcudaforwrapblas.so (0x00002b508ee1d000)
libacchost.so => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/compilers/lib/libacchost.so (0x00002b508f059000)
libacccuda.so => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/compilers/lib/libacccuda.so (0x00002b508f2c7000)
libcudadevice.so => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/compilers/lib/libcudadevice.so (0x00002b508f668000)
libcudafor2.so => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/compilers/lib/libcudafor2.so (0x00002b508f87b000)
libnvf.so => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/compilers/lib/libnvf.so (0x00002b508fa7d000)
libnvomp.so => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/compilers/lib/libnvomp.so (0x00002b509002f000)
libnvhpcatm.so => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/compilers/lib/libnvhpcatm.so (0x00002b5090c91000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x00002b5090e9a000)
libnvcpumath.so => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/compilers/lib/libnvcpumath.so (0x00002b50910b6000)
libnvc.so => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/compilers/lib/libnvc.so (0x00002b50914dd000)
librt.so.1 => /lib64/librt.so.1 (0x00002b5091734000)
libc.so.6 => /lib64/libc.so.6 (0x00002b509193c000)
libgcc_s.so.1 => /lib64/libgcc_s.so.1 (0x00002b5091d09000)
libstdc++.so.6 => /lib64/libstdc++.so.6 (0x00002b5091f1f000)
libz.so.1 => /lib64/libz.so.1 (0x00002b5092226000)
liblzma.so.5 => /lib64/liblzma.so.5 (0x00002b509243c000)
/lib64/ld-linux-x86-64.so.2 (0x00002b506cf39000)
libopen-rte.so.40 => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/comm_libs/openmpi/openmpi-3.1.5/lib/libopen-rte.so.40 (0x00002b5092662000)
libopen-pal.so.40 => /tmp_mnt/el8/nvidia/hpc_sdk/Linux_x86_64/20.9/comm_libs/openmpi/openmpi-3.1.5/lib/libopen-pal.so.40 (0x00002b50929a7000)
librdmacm.so.1 => /usr/lib64/librdmacm.so.1 (0x00002b5092e5d000)
libibverbs.so.1 => /usr/lib64/libibverbs.so.1 (0x00002b5093074000)
libnuma.so.1 => /usr/lib64/libnuma.so.1 (0x00002b509328d000)
libutil.so.1 => /usr/lib64/libutil.so.1 (0x00002b5093499000)
libnl-route-3.so.200 => /usr/lib64/libnl-route-3.so.200 (0x00002b509369c000)
libnl-3.so.200 => /usr/lib64/libnl-3.so.200 (0x00002b5093909000)