I am trying to run the NVSHMEM perftest `shmem_put_bw`. However, it reports errors. My SLURM script is:
#!/bin/bash
#SBATCH -p g078t2
#SBATCH -N 1
#SBATCH --ntasks=2
#SBATCH --ntasks-per-node=2
#SBATCH --gres=gpu:2
#SBATCH --time=120:00
#SBATCH --comment=idmg_bupt
#SBATCH --output=perftest.out
# SLURM batch job that runs the NVSHMEM shmem_put_bw perftest with two
# processes on a single node with two GPUs requested (see the #SBATCH
# directives above); the job's output is written to perftest.out.
# Start from a clean module environment, then load the GCC 9.3.0 module.
module purge
module load gcc/9.3.0
# Record the allocated node list and the GPU state in the job output.
echo "Running on nodes: $SLURM_JOB_NODELIST"
nvidia-smi
# Turn on NVSHMEM TRACE-level debugging for all subsystems and send the
# debug log to the file "nvdebug" instead of the job output.
export NVSHMEM_DEBUG=TRACE
export NVSHMEM_DEBUG_SUBSYS=ALL
export NVSHMEM_DEBUG_FILE=nvdebug
# Launch the point-to-point put-bandwidth perftest with 2 processes.
# NOTE(review): the build configuration printed by this run shows
# NVSHMEM_DEFAULT_PMI2=ON and NVSHMEM_MPI_SUPPORT=ON, both processes report
# "mype: 0", and initialization aborts at WRAP_PMI_Barrier. That pattern
# suggests the processes bootstrap through PMI-2, which this mpirun
# presumably does not provide, so each one comes up as a lone PE 0.
# Worth trying: `export NVSHMEM_BOOTSTRAP=MPI` before this line, or
# launching with `srun --mpi=pmi2` instead of mpirun - TODO confirm against
# the NVSHMEM bootstrap documentation.
# NOTE(review): -np 2 duplicates --ntasks=2 above; keep the two in sync.
mpirun -np 2 ~/nvshmem_src/build/perftest/device/pt-to-pt/shmem_put_bw
The output is:
Running on nodes: gpu8
Running nvshmem perftest on 2 GPUs
Sun Dec 22 17:36:06 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05 Driver Version: 535.154.05 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA A800-SXM4-80GB On | 00000000:B1:00.0 Off | 0 |
| N/A 39C P0 64W / 500W | 0MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 1 NVIDIA A800-SXM4-80GB On | 00000000:D0:00.0 Off | 0 |
| N/A 37C P0 62W / 500W | 0MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| No running processes found |
+---------------------------------------------------------------------------------------+
gpu8:34575:34575 [0] NVSHMEM INFO DEBUG file is 'nvdebug'
NVSHMEM configuration:
CUDA API 11080
CUDA Runtime 11080
CUDA Driver 12020
Build Timestamp Dec 22 2024 05:14:09
Build Variables
NVSHMEM_DEBUG=OFF NVSHMEM_DEVEL=OFF NVSHMEM_DEFAULT_PMI2=ON
NVSHMEM_DEFAULT_PMIX=OFF NVSHMEM_DEFAULT_UCX=OFF NVSHMEM_DISABLE_COLL_POLL=ON
NVSHMEM_ENABLE_ALL_DEVICE_INLINING=OFF NVSHMEM_GPU_COLL_USE_LDST=OFF
NVSHMEM_IBGDA_SUPPORT=OFF NVSHMEM_IBGDA_SUPPORT_GPUMEM_ONLY=OFF
NVSHMEM_IBDEVX_SUPPORT=OFF NVSHMEM_IBRC_SUPPORT=ON
NVSHMEM_MPI_SUPPORT=ON NVSHMEM_NVTX=ON NVSHMEM_PMIX_SUPPORT=OFF
NVSHMEM_SHMEM_SUPPORT=OFF NVSHMEM_TEST_STATIC_LIB=OFF
NVSHMEM_TIMEOUT_DEVICE_POLLING=OFF NVSHMEM_TRACE=OFF NVSHMEM_UCX_SUPPORT=OFF
NVSHMEM_USE_DLMALLOC=OFF NVSHMEM_USE_NCCL=OFF NVSHMEM_USE_GDRCOPY=OFF
NVSHMEM_VERBOSE=OFF CUDA_HOME=/usr/local/cuda GDRCOPY_HOME=/usr/local/gdrdrv
LIBFABRIC_HOME=/usr/local/libfabric
MPI_HOME=/home/u2022110987/mylib/openmpi-4.1.1 NCCL_HOME=/usr/local/nccl
NVSHMEM_PREFIX=/home/u2022110987/mylib/nvshmem-3.1.7 PMIX_HOME=/usr
SHMEM_HOME=/home/u2022110987/mylib/openmpi-4.1.1 UCX_HOME=/usr/local/ucx
/home/u2022110987/lib_src/nvshmem_src/src/host/init/init.cu:943: non-zero status: 5 nvshmem get cucontext failed
gpu8:34574:34574 [0] NVSHMEM INFO DEBUG file is 'nvdebug'
mype: 0 mype_node: 0 device name: NVIDIA A800-SXM4-80GB bus id: 177
NVSHMEM configuration:
CUDA API 11080
CUDA Runtime 11080
CUDA Driver 12020
Build Timestamp Dec 22 2024 05:14:09
Build Variables
NVSHMEM_DEBUG=OFF NVSHMEM_DEVEL=OFF NVSHMEM_DEFAULT_PMI2=ON
NVSHMEM_DEFAULT_PMIX=OFF NVSHMEM_DEFAULT_UCX=OFF NVSHMEM_DISABLE_COLL_POLL=ON
NVSHMEM_ENABLE_ALL_DEVICE_INLINING=OFF NVSHMEM_GPU_COLL_USE_LDST=OFF
NVSHMEM_IBGDA_SUPPORT=OFF NVSHMEM_IBGDA_SUPPORT_GPUMEM_ONLY=OFF
NVSHMEM_IBDEVX_SUPPORT=OFF NVSHMEM_IBRC_SUPPORT=ON
NVSHMEM_MPI_SUPPORT=ON NVSHMEM_NVTX=ON NVSHMEM_PMIX_SUPPORT=OFF
NVSHMEM_SHMEM_SUPPORT=OFF NVSHMEM_TEST_STATIC_LIB=OFF
NVSHMEM_TIMEOUT_DEVICE_POLLING=OFF NVSHMEM_TRACE=OFF NVSHMEM_UCX_SUPPORT=OFF
NVSHMEM_USE_DLMALLOC=OFF NVSHMEM_USE_NCCL=OFF NVSHMEM_USE_GDRCOPY=OFF
NVSHMEM_VERBOSE=OFF CUDA_HOME=/usr/local/cuda GDRCOPY_HOME=/usr/local/gdrdrv
LIBFABRIC_HOME=/usr/local/libfabric
MPI_HOME=/home/u2022110987/mylib/openmpi-4.1.1 NCCL_HOME=/usr/local/nccl
NVSHMEM_PREFIX=/home/u2022110987/mylib/nvshmem-3.1.7 PMIX_HOME=/usr
SHMEM_HOME=/home/u2022110987/mylib/openmpi-4.1.1 UCX_HOME=/usr/local/ucx
/home/u2022110987/lib_src/nvshmem_src/src/host/init/init.cu:943: non-zero status: 5 nvshmem get cucontext failed
mype: 0 mype_node: 0 device name: NVIDIA A800-SXM4-80GB bus id: 177
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1631 Begin - Enumerating IB devices in the system ([<dev_id, device_name, num_ports>]) -
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1631 Begin - Enumerating IB devices in the system ([<dev_id, device_name, num_ports>]) -
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1651 Enumerated IB devices in the system - device id=0 (of 6), name=mlx5_0, num_ports=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1651 Enumerated IB devices in the system - device id=0 (of 6), name=mlx5_0, num_ports=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 69 NVSHMEM_IB_ADDR_FAMILY set by environment to AF_INET
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 92 NVSHMEM_IB_ADDR_RANGE set by environment to ::/0
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 104 NET/IB: Ip address '::' is invalid for family AF_INET, ignoring address
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 69 NVSHMEM_IB_ADDR_FAMILY set by environment to AF_INET
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 92 NVSHMEM_IB_ADDR_RANGE set by environment to ::/0
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 104 NET/IB: Ip address '::' is invalid for family AF_INET, ignoring address
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1651 Enumerated IB devices in the system - device id=1 (of 6), name=mlx5_1, num_ports=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 69 NVSHMEM_IB_ADDR_FAMILY set by environment to AF_INET
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 92 NVSHMEM_IB_ADDR_RANGE set by environment to ::/0
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 104 NET/IB: Ip address '::' is invalid for family AF_INET, ignoring address
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1651 Enumerated IB devices in the system - device id=1 (of 6), name=mlx5_1, num_ports=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 69 NVSHMEM_IB_ADDR_FAMILY set by environment to AF_INET
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 92 NVSHMEM_IB_ADDR_RANGE set by environment to ::/0
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 104 NET/IB: Ip address '::' is invalid for family AF_INET, ignoring address
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1651 Enumerated IB devices in the system - device id=2 (of 6), name=mlx5_2, num_ports=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 69 NVSHMEM_IB_ADDR_FAMILY set by environment to AF_INET
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 92 NVSHMEM_IB_ADDR_RANGE set by environment to ::/0
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 104 NET/IB: Ip address '::' is invalid for family AF_INET, ignoring address
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1651 Enumerated IB devices in the system - device id=2 (of 6), name=mlx5_2, num_ports=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 69 NVSHMEM_IB_ADDR_FAMILY set by environment to AF_INET
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 92 NVSHMEM_IB_ADDR_RANGE set by environment to ::/0
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 104 NET/IB: Ip address '::' is invalid for family AF_INET, ignoring address
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1651 Enumerated IB devices in the system - device id=3 (of 6), name=mlx5_3, num_ports=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 69 NVSHMEM_IB_ADDR_FAMILY set by environment to AF_INET
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 92 NVSHMEM_IB_ADDR_RANGE set by environment to ::/0
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 104 NET/IB: Ip address '::' is invalid for family AF_INET, ignoring address
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1651 Enumerated IB devices in the system - device id=3 (of 6), name=mlx5_3, num_ports=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 69 NVSHMEM_IB_ADDR_FAMILY set by environment to AF_INET
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 92 NVSHMEM_IB_ADDR_RANGE set by environment to ::/0
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 104 NET/IB: Ip address '::' is invalid for family AF_INET, ignoring address
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1651 Enumerated IB devices in the system - device id=4 (of 6), name=mlx5_4, num_ports=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 69 NVSHMEM_IB_ADDR_FAMILY set by environment to AF_INET
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 92 NVSHMEM_IB_ADDR_RANGE set by environment to ::/0
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 104 NET/IB: Ip address '::' is invalid for family AF_INET, ignoring address
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1651 Enumerated IB devices in the system - device id=4 (of 6), name=mlx5_4, num_ports=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 69 NVSHMEM_IB_ADDR_FAMILY set by environment to AF_INET
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 92 NVSHMEM_IB_ADDR_RANGE set by environment to ::/0
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 104 NET/IB: Ip address '::' is invalid for family AF_INET, ignoring address
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1651 Enumerated IB devices in the system - device id=5 (of 6), name=mlx5_5, num_ports=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 69 NVSHMEM_IB_ADDR_FAMILY set by environment to AF_INET
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 92 NVSHMEM_IB_ADDR_RANGE set by environment to ::/0
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 104 NET/IB: Ip address '::' is invalid for family AF_INET, ignoring address
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1738 End - Enumerating IB devices in the system
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1741 Begin - Ordered list of devices for assignment (after processing user provdied env vars (if any)) -
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1745 Ordered list of devices for assignment - idx=0 (of 6), device id=0, port_num=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1745 Ordered list of devices for assignment - idx=1 (of 6), device id=1, port_num=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1745 Ordered list of devices for assignment - idx=2 (of 6), device id=2, port_num=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1745 Ordered list of devices for assignment - idx=3 (of 6), device id=3, port_num=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1745 Ordered list of devices for assignment - idx=4 (of 6), device id=4, port_num=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1745 Ordered list of devices for assignment - idx=5 (of 6), device id=5, port_num=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1749 End - Ordered list of devices for assignment (after processing user provdied env vars (if any))
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1651 Enumerated IB devices in the system - device id=5 (of 6), name=mlx5_5, num_ports=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 69 NVSHMEM_IB_ADDR_FAMILY set by environment to AF_INET
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 92 NVSHMEM_IB_ADDR_RANGE set by environment to ::/0
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/common/transport_ib_common.h 104 NET/IB: Ip address '::' is invalid for family AF_INET, ignoring address
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1738 End - Enumerating IB devices in the system
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1741 Begin - Ordered list of devices for assignment (after processing user provdied env vars (if any)) -
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1745 Ordered list of devices for assignment - idx=0 (of 6), device id=0, port_num=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1745 Ordered list of devices for assignment - idx=1 (of 6), device id=1, port_num=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1745 Ordered list of devices for assignment - idx=2 (of 6), device id=2, port_num=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1745 Ordered list of devices for assignment - idx=3 (of 6), device id=3, port_num=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1745 Ordered list of devices for assignment - idx=4 (of 6), device id=4, port_num=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1745 Ordered list of devices for assignment - idx=5 (of 6), device id=5, port_num=1
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 1749 End - Ordered list of devices for assignment (after processing user provdied env vars (if any))
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 212 /home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp:1791 Ib Alloc Size 2097152 pointer 0x1512608c7000
/home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp 212 /home/u2022110987/lib_src/nvshmem_src/src/modules/transport/ibrc/ibrc.cpp:1791 Ib Alloc Size 2097152 pointer 0x1514e66b9000
/home/u2022110987/lib_src/nvshmem_src/src/modules/bootstrap/pmi/bootstrap_pmi.cpp:235: non-zero status: 14 WRAP_PMI_Barrier failed
/home/u2022110987/lib_src/nvshmem_src/src/host/transport/transport.cpp:402: non-zero status: 7 barrier failed
/home/u2022110987/lib_src/nvshmem_src/src/host/init/init.cu:1001: non-zero status: 7 nvshmem setup connections failed
/home/u2022110987/lib_src/nvshmem_src/src/host/init/init.cu:nvshmemi_check_state_and_init:1074: nvshmem initialization failed, exiting
/home/u2022110987/lib_src/nvshmem_src/src/modules/bootstrap/pmi/bootstrap_pmi.cpp:235: non-zero status: 14 WRAP_PMI_Barrier failed
/home/u2022110987/lib_src/nvshmem_src/src/host/transport/transport.cpp:402: non-zero status: 7 barrier failed
/home/u2022110987/lib_src/nvshmem_src/src/host/init/init.cu:1001: non-zero status: 7 nvshmem setup connections failed
/home/u2022110987/lib_src/nvshmem_src/src/host/init/init.cu:nvshmemi_check_state_and_init:1074: nvshmem initialization failed, exiting
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:
Process name: [[22430,1],0]
Exit code: 255
--------------------------------------------------------------------------
It seems that both processes run NVSHMEM on the same GPU, since they report the same mype and mype_node:
mype: 0 mype_node: 0 device name: NVIDIA A800-SXM4-80GB bus id: 177
mype: 0 mype_node: 0 device name: NVIDIA A800-SXM4-80GB bus id: 177
The `nvdebug` file is:
gpu8:52176:52176 [0] NVSHMEM INFO host name: gpu8 hash 6385268297
gpu8:52176:52176 [0] NVSHMEM INFO PE distribution has been identified as NVSHMEMI_PE_DIST_BLOCK
gpu8:52176:52176 [0] NVSHMEM INFO PE 0 (process) affinity to 1 CPUs:
9
gpu8:52176:52176 [0] NVSHMEM INFO cudaDriverVersion 12020
gpu8:52176:52176 [0] NVSHMEM INFO NVSHMEM symmetric heap kind = DEVICE selected
gpu8:52176:52176 [0] NVSHMEM INFO nvshmemi_common_init failed, continuing
gpu8:52176:52176 [0] NVSHMEM INFO NVSHMEM symmetric heap kind = DEVICE selected
gpu8:52176:52176 [0] NVSHMEM INFO [0] nvshmemi_get_cucontext->cuCtxSynchronize->CUDA_SUCCESS) my_stream (nil)
gpu8:52176:52176 [0] NVSHMEM INFO in get_cucontext, queried and saved context for device: 0 context: 0x5ff430
gpu8:52176:52176 [0] NVSHMEM INFO [0] Created stream 0xeb8e60 for device 0
gpu8:52176:52176 [0] NVSHMEM INFO CUDA 64-bit stream memops support is not available
gpu8:52176:52176 [0] NVSHMEM INFO NVML library found. libnvidia-ml.so.1
gpu8:52176:52176 [0] NVSHMEM INFO Symmetric Memory Heap Handle Type: POSIX File Descriptor
gpu8:52176:52176 [0] NVSHMEM INFO cuMulticast is not supported on CUDA or disabled by user
gpu8:52176:52176 [0] NVSHMEM INFO host name: gpu8 hash 6385268297
gpu8:52176:52176 [0] NVSHMEM INFO team psync mem req 508928 bytes, team mem total req 130286120 bytes, max teams 32
gpu8:52176:52176 [0] NVSHMEM INFO [0] heap type: P42nvshmemi_symmetric_heap_vidmem_dynamic_vmm allocate_local_heap, heapextra = 415511080
gpu8:52176:52176 [0] NVSHMEM INFO [0] heap type: P42nvshmemi_symmetric_heap_vidmem_dynamic_vmm heap base: 0x10020000000 NVSHMEM_SYMMETRIC_SIZE 1073741824 total 137438953472 heapextra 415511080
gpu8:52176:52176 [0] NVSHMEM INFO host name: gpu8 hash 6385268297
gpu8:52176:52176 [0] NVSHMEM INFO [0x1a24990] ndev 2 pcie_devid 0 cudevice 0 peer host hash 17c977649 p2p host hash 17c977649
gpu8:52176:52176 [0] NVSHMEM INFO [0] reach 15 to peer 0 over transport 0
gpu8:52176:52176 [0] NVSHMEM INFO [0] reach 112 to peer 0 over transport 1
gpu8:52176:52176 [0] NVSHMEM INFO [0] transport bitmap: 3
gpu8:52176:52176 [0] NVSHMEM INFO PE 0: /sys/devices/pci0000:a6/0000:a6:02.0/0000:a7:00.0/0000:a8:00.0/0000:a9:00.0/0000:aa:10.0/0000:af:00.0/0000:b0:00.0/0000:b1:00.0 dev 1: /sys/devices/pci0000:5a/0000:5a:02.0/0000:5b:00.0/0000:5c:04.0/0000:63:00.0/0000:64:10.0/0000:68:00.0 distance: 4
gpu8:52176:52176 [0] NVSHMEM INFO PE 0: /sys/devices/pci0000:a6/0000:a6:02.0/0000:a7:00.0/0000:a8:00.0/0000:a9:00.0/0000:aa:10.0/0000:af:00.0/0000:b0:00.0/0000:b1:00.0 dev 2: /sys/devices/pci0000:97/0000:97:02.0/0000:98:00.0 distance: 3
gpu8:52176:52176 [0] NVSHMEM INFO PE 0: /sys/devices/pci0000:a6/0000:a6:02.0/0000:a7:00.0/0000:a8:00.0/0000:a9:00.0/0000:aa:10.0/0000:af:00.0/0000:b0:00.0/0000:b1:00.0 dev 3: /sys/devices/pci0000:97/0000:97:02.0/0000:98:00.1 distance: 3
gpu8:52176:52176 [0] NVSHMEM INFO PE 0: /sys/devices/pci0000:a6/0000:a6:02.0/0000:a7:00.0/0000:a8:00.0/0000:a9:00.0/0000:aa:10.0/0000:af:00.0/0000:b0:00.0/0000:b1:00.0 dev 4: /sys/devices/pci0000:a6/0000:a6:02.0/0000:a7:00.0/0000:a8:08.0/0000:c0:00.0/0000:c1:00.0/0000:c2:00.0 distance: 1
gpu8:52176:52176 [0] NVSHMEM INFO PE 0: /sys/devices/pci0000:a6/0000:a6:02.0/0000:a7:00.0/0000:a8:00.0/0000:a9:00.0/0000:aa:10.0/0000:af:00.0/0000:b0:00.0/0000:b1:00.0 dev 5: /sys/devices/pci0000:c9/0000:c9:02.0/0000:ca:00.0/0000:cb:08.0/0000:d9:00.0/0000:da:00.0/0000:db:00.0 distance: 3
gpu8:52176:52176 [0] NVSHMEM INFO Pairing PE 0 with device 4 at distance 1
gpu8:52176:52176 [0] NVSHMEM INFO Our PE is sharing its NIC at index 1 with 1 other PEs.
gpu8:52176:52176 [0] NVSHMEM INFO NVSHMEM_ENABLE_NIC_PE_MAPPING = 0, device 0 setting dev_id = 4
4
Is it because my SLURM script is wrong?