I’m trying to run the example program from the NVSHMEM documentation:
#include <stdio.h>
#include <cuda.h>
#include <nvshmem.h>
#include <nvshmemx.h>
#include <mpi/mpi.h>

// Each PE writes its own ID into the symmetric buffer of the next PE (ring shift).
__global__ void simple_shift(int *destination) {
    int mype = nvshmem_my_pe();
    int npes = nvshmem_n_pes();
    int peer = (mype + 1) % npes;

    nvshmem_int_p(destination, mype, peer);
}

int main(int argc, char *argv[]) {
    int msg, rank, ndevices;
    cudaStream_t stream;

    nvshmemx_init_attr_t attr;
    MPI_Comm comm = MPI_COMM_WORLD;
    attr.mpi_comm = &comm;

    // Bootstrap NVSHMEM on top of MPI: one PE per rank, one GPU per PE.
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    cudaGetDeviceCount(&ndevices);
    cudaSetDevice(rank % ndevices);
    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
    cudaStreamCreate(&stream);

    // Symmetric allocation: every PE gets a matching int on its device.
    int *destination = (int *) nvshmem_malloc(sizeof(int));

    simple_shift<<<1, 1, 0, stream>>>(destination);
    nvshmemx_barrier_all_on_stream(stream);
    cudaMemcpyAsync(&msg, destination, sizeof(int), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);

    printf("%d: received message %d\n", nvshmem_my_pe(), msg);

    nvshmem_free(destination);
    nvshmem_finalize();
    MPI_Finalize();
    return 0;
}
It works fine on 2 GPUs, but as soon as I run it on more than 2 GPUs I get a bunch of errors:
> nvcc -rdc=true -ccbin g++ -arch=sm_90 -I /usr/include/nvshmem_12 -I /usr/include/x86_64-linux-gnu/openmpi example.cu -o example.out -L /usr/lib/x86_64-linux-gnu/nvshmem/12 -lmpi -lnvshmem -lnvidia-ml -lcuda -lcudart
> mpirun --allow-run-as-root -n 2 example.out
0: received message 1
1: received message 0
> mpirun --allow-run-as-root -n 3 example.out
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/ibrc/ibrc.cpp:420: non-zero status: 22 ibv_modify_qp failed
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/ibrc/ibrc.cpp:1511: non-zero status: 7 ep_connect failed
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/ibrc/ibrc.cpp:1580: non-zero status: 7 transport create connect failed
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/host/transport/transport.cpp:420: non-zero status: 7 connect EPS failed
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/host/init/init.cu:1045: non-zero status: 7 nvshmem setup connections failed
What could be causing this?
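In case it helps narrow things down, I can rerun with NVSHMEM's verbose logging enabled and post the output (assuming NVSHMEM_DEBUG=INFO is the right knob for this build), e.g.:
> mpirun --allow-run-as-root -x NVSHMEM_DEBUG=INFO -n 3 example.out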
Here’s my topology (nvidia-smi topo -m):
GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 NIC0 NIC1 NIC2 NIC3 NIC4 NIC5 CPU Affinity NUMA Affinity GPU NUMA ID
GPU0 X NV18 NV18 NV18 NV18 NV18 NV18 NV18 PIX NODE NODE NODE SYS SYS 0-31,64-95 0 N/A
GPU1 NV18 X NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE NODE NODE SYS SYS 0-31,64-95 0 N/A
GPU2 NV18 NV18 X NV18 NV18 NV18 NV18 NV18 NODE PIX NODE NODE SYS SYS 0-31,64-95 0 N/A
GPU3 NV18 NV18 NV18 X NV18 NV18 NV18 NV18 NODE NODE NODE NODE SYS SYS 0-31,64-95 0 N/A
GPU4 NV18 NV18 NV18 NV18 X NV18 NV18 NV18 SYS SYS SYS SYS PIX NODE 32-63,96-127 1 N/A
GPU5 NV18 NV18 NV18 NV18 NV18 X NV18 NV18 SYS SYS SYS SYS NODE NODE 32-63,96-127 1 N/A
GPU6 NV18 NV18 NV18 NV18 NV18 NV18 X NV18 SYS SYS SYS SYS NODE PIX 32-63,96-127 1 N/A
GPU7 NV18 NV18 NV18 NV18 NV18 NV18 NV18 X SYS SYS SYS SYS NODE NODE 32-63,96-127 1 N/A
NIC0 PIX NODE NODE NODE SYS SYS SYS SYS X NODE NODE NODE SYS SYS
NIC1 NODE NODE PIX NODE SYS SYS SYS SYS NODE X NODE NODE SYS SYS
NIC2 NODE NODE NODE NODE SYS SYS SYS SYS NODE NODE X PIX SYS SYS
NIC3 NODE NODE NODE NODE SYS SYS SYS SYS NODE NODE PIX X SYS SYS
NIC4 SYS SYS SYS SYS PIX NODE NODE NODE SYS SYS SYS SYS X NODE
NIC5 SYS SYS SYS SYS NODE NODE PIX NODE SYS SYS SYS SYS NODE X
Legend:
X = Self
SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
PIX = Connection traversing at most a single PCIe bridge
NV# = Connection traversing a bonded set of # NVLinks
NIC Legend:
NIC0: mlx5_0
NIC1: mlx5_1
NIC2: mlx5_2
NIC3: mlx5_3
NIC4: mlx5_4
NIC5: mlx5_5
And here’s my ibv_devinfo output:
hca_id: mlx5_0
    transport:        InfiniBand (0)
    fw_ver:           28.41.1000
    node_guid:        9c63:c003:0055:48fe
    sys_image_guid:   9c63:c003:0055:48fe
    vendor_id:        0x02c9
    vendor_part_id:   4129
    hw_ver:           0x0
    board_id:         MT_0000000838
    phys_port_cnt:    1
        port:   1
            state:          PORT_ACTIVE (4)
            max_mtu:        4096 (5)
            active_mtu:     4096 (5)
            sm_lid:         1167
            port_lid:       975
            port_lmc:       0x00
            link_layer:     InfiniBand

hca_id: mlx5_1
    transport:        InfiniBand (0)
    fw_ver:           28.41.1000
    node_guid:        9c63:c003:0055:48b6
    sys_image_guid:   9c63:c003:0055:48b6
    vendor_id:        0x02c9
    vendor_part_id:   4129
    hw_ver:           0x0
    board_id:         MT_0000000838
    phys_port_cnt:    1
        port:   1
            state:          PORT_ACTIVE (4)
            max_mtu:        4096 (5)
            active_mtu:     4096 (5)
            sm_lid:         1167
            port_lid:       976
            port_lmc:       0x00
            link_layer:     InfiniBand

hca_id: mlx5_2
    transport:        InfiniBand (0)
    fw_ver:           20.38.1900
    node_guid:        b83f:d203:0091:eb2a
    sys_image_guid:   b83f:d203:0091:eb2a
    vendor_id:        0x02c9
    vendor_part_id:   4123
    hw_ver:           0x0
    board_id:         MT_0000000224
    phys_port_cnt:    1
        port:   1
            state:          PORT_ACTIVE (4)
            max_mtu:        4096 (5)
            active_mtu:     1024 (3)
            sm_lid:         0
            port_lid:       0
            port_lmc:       0x00
            link_layer:     Ethernet

hca_id: mlx5_3
    transport:        InfiniBand (0)
    fw_ver:           20.38.1900
    node_guid:        b83f:d203:0091:eb2b
    sys_image_guid:   b83f:d203:0091:eb2a
    vendor_id:        0x02c9
    vendor_part_id:   4123
    hw_ver:           0x0
    board_id:         MT_0000000224
    phys_port_cnt:    1
        port:   1
            state:          PORT_DOWN (1)
            max_mtu:        4096 (5)
            active_mtu:     1024 (3)
            sm_lid:         0
            port_lid:       0
            port_lmc:       0x00
            link_layer:     Ethernet

hca_id: mlx5_4
    transport:        InfiniBand (0)
    fw_ver:           28.41.1000
    node_guid:        9c63:c003:0055:46fe
    sys_image_guid:   9c63:c003:0055:46fe
    vendor_id:        0x02c9
    vendor_part_id:   4129
    hw_ver:           0x0
    board_id:         MT_0000000838
    phys_port_cnt:    1
        port:   1
            state:          PORT_ACTIVE (4)
            max_mtu:        4096 (5)
            active_mtu:     4096 (5)
            sm_lid:         1167
            port_lid:       981
            port_lmc:       0x00
            link_layer:     InfiniBand

hca_id: mlx5_5
    transport:        InfiniBand (0)
    fw_ver:           28.41.1000
    node_guid:        9c63:c003:005b:1e14
    sys_image_guid:   9c63:c003:005b:1e14
    vendor_id:        0x02c9
    vendor_part_id:   4129
    hw_ver:           0x0
    board_id:         MT_0000000838
    phys_port_cnt:    1
        port:   1
            state:          PORT_ACTIVE (4)
            max_mtu:        4096 (5)
            active_mtu:     4096 (5)
            sm_lid:         1167
            port_lid:       984
            port_lmc:       0x00
            link_layer:     InfiniBand