log:
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:559: rank 9 nranks 16 tag 0 - ENTER
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:575: rank 9 nranks 16 tag 4 - DONE
ca07837c9b46:49312:50209 [1] NCCL INFO comm 0xea31fe0 rank 9 nranks 16 cudaDev 1 busId 7e000 - Destroy COMPLETE
ca07837c9b46:49312:50217 [1] NCCL INFO comm 0xf104a50 rank 1 nranks 8 cudaDev 1 busId 7e000 - Destroy COMPLETE
ca07837c9b46:49312:50225 [1] NCCL INFO comm 0xf691b80 rank 1 nranks 8 cudaDev 1 busId 7e000 - Destroy COMPLETE
ca07837c9b46:49312:50231 [1] NCCL INFO comm 0xfc3ccd0 rank 1 nranks 2 cudaDev 1 busId 7e000 - Destroy COMPLETE
ca07837c9b46:49312:50235 [1] NCCL INFO comm 0x11074530 rank 0 nranks 1 cudaDev 1 busId 7e000 - Destroy COMPLETE
ca07837c9b46:49312:50245 [1] NCCL INFO comm 0x120ebdf0 rank 9 nranks 16 cudaDev 1 busId 7e000 - Destroy COMPLETE
ca07837c9b46:49312:49312 [1] NVSHMEM INFO In nvshmemi_proxy_finalize
ca07837c9b46:49312:49312 [1] NVSHMEM INFO In nvshmemi_teardown_handles
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xe9e3810 handle->mr 0xe9dc490
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:559: rank 11 nranks 16 tag 0 - ENTER
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:575: rank 11 nranks 16 tag 4 - DONE
ca07837c9b46:49314:50207 [3] NCCL INFO comm 0xe42f680 rank 11 nranks 16 cudaDev 3 busId c6000 - Destroy COMPLETE
ca07837c9b46:49314:50215 [3] NCCL INFO comm 0xeb17800 rank 3 nranks 8 cudaDev 3 busId c6000 - Destroy COMPLETE
ca07837c9b46:49314:50223 [3] NCCL INFO comm 0xf0a4100 rank 3 nranks 8 cudaDev 3 busId c6000 - Destroy COMPLETE
ca07837c9b46:49314:50234 [3] NCCL INFO comm 0xf64e990 rank 1 nranks 2 cudaDev 3 busId c6000 - Destroy COMPLETE
ca07837c9b46:49314:50236 [3] NCCL INFO comm 0x10a72e20 rank 0 nranks 1 cudaDev 3 busId c6000 - Destroy COMPLETE
ca07837c9b46:49314:50246 [3] NCCL INFO comm 0x11ae9a80 rank 11 nranks 16 cudaDev 3 busId c6000 - Destroy COMPLETE
ca07837c9b46:49314:49314 [3] NVSHMEM INFO In nvshmemi_proxy_finalize
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xe9e3a10 handle->mr 0xe9de540
ca07837c9b46:49314:49314 [3] NVSHMEM INFO In nvshmemi_teardown_handles
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xe3e1410 handle->mr 0xe3d9b30
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x15110470 handle->mr 0x1510cb30
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xe3e1610 handle->mr 0xe3dbbe0
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x14b37d60 handle->mr 0x14b33820
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x15110670 handle->mr 0x1510cbc0
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x14b37f60 handle->mr 0x14b338b0
ca07837c9b46:49314:49314 [3] NVSHMEM INFO In nvshmemi_transport_finalize
ca07837c9b46:49312:49312 [1] NVSHMEM INFO In nvshmemi_transport_finalize
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:559: rank 14 nranks 16 tag 0 - ENTER
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:575: rank 14 nranks 16 tag 4 - DONE
ca07837c9b46:49317:50205 [6] NCCL INFO comm 0xd9290b0 rank 14 nranks 16 cudaDev 6 busId 1a3000 - Destroy COMPLETE
ca07837c9b46:49317:50214 [6] NCCL INFO comm 0xe010e40 rank 6 nranks 8 cudaDev 6 busId 1a3000 - Destroy COMPLETE
ca07837c9b46:49317:50222 [6] NCCL INFO comm 0xe59d2f0 rank 6 nranks 8 cudaDev 6 busId 1a3000 - Destroy COMPLETE
ca07837c9b46:49317:50229 [6] NCCL INFO comm 0xeb47b20 rank 1 nranks 2 cudaDev 6 busId 1a3000 - Destroy COMPLETE
ca07837c9b46:49317:50240 [6] NCCL INFO comm 0xff6c890 rank 0 nranks 1 cudaDev 6 busId 1a3000 - Destroy COMPLETE
ca07837c9b46:49317:50250 [6] NCCL INFO comm 0x10fe34e0 rank 14 nranks 16 cudaDev 6 busId 1a3000 - Destroy COMPLETE
ca07837c9b46:49317:49317 [6] NVSHMEM INFO In nvshmemi_proxy_finalize
ca07837c9b46:49317:49317 [6] NVSHMEM INFO In nvshmemi_teardown_handles
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xd8dc610 handle->mr 0xd8d3550
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xd8dc810 handle->mr 0xd8d5600
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x14031bc0 handle->mr 0x1402c480
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:559: rank 12 nranks 16 tag 0 - ENTER
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:575: rank 12 nranks 16 tag 4 - DONE
ca07837c9b46:49315:50206 [4] NCCL INFO comm 0xdbff360 rank 12 nranks 16 cudaDev 4 busId 109000 - Destroy COMPLETE
ca07837c9b46:49315:50219 [4] NCCL INFO comm 0xe2d0f60 rank 4 nranks 8 cudaDev 4 busId 109000 - Destroy COMPLETE
ca07837c9b46:49315:50226 [4] NCCL INFO comm 0xe85f570 rank 4 nranks 8 cudaDev 4 busId 109000 - Destroy COMPLETE
ca07837c9b46:49315:50232 [4] NCCL INFO comm 0xee092a0 rank 1 nranks 2 cudaDev 4 busId 109000 - Destroy COMPLETE
ca07837c9b46:49315:50237 [4] NCCL INFO comm 0x10242020 rank 0 nranks 1 cudaDev 4 busId 109000 - Destroy COMPLETE
ca07837c9b46:49315:50243 [4] NCCL INFO comm 0x112b5820 rank 12 nranks 16 cudaDev 4 busId 109000 - Destroy COMPLETE
ca07837c9b46:49315:49315 [4] NVSHMEM INFO In nvshmemi_proxy_finalize
ca07837c9b46:49315:49315 [4] NVSHMEM INFO In nvshmemi_teardown_handles
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xdbb1a10 handle->mr 0xdba9810
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x14031dc0 handle->mr 0x1402c510
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xdbb1c10 handle->mr 0xdbab8c0
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x142d9970 handle->mr 0x142d4e30
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x142d9b70 handle->mr 0x142d4ec0
ca07837c9b46:49317:49317 [6] NVSHMEM INFO In nvshmemi_transport_finalize
ca07837c9b46:49315:49315 [4] NVSHMEM INFO In nvshmemi_transport_finalize
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:559: rank 8 nranks 16 tag 0 - ENTER
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:575: rank 8 nranks 16 tag 4 - DONE
ca07837c9b46:49311:50204 [0] NCCL INFO comm 0xf2704e0 rank 8 nranks 16 cudaDev 0 busId 8000 - Destroy COMPLETE
ca07837c9b46:49311:50218 [0] NCCL INFO comm 0xf92cd30 rank 0 nranks 8 cudaDev 0 busId 8000 - Destroy COMPLETE
ca07837c9b46:49311:50221 [0] NCCL INFO comm 0xfea2850 rank 0 nranks 8 cudaDev 0 busId 8000 - Destroy COMPLETE
ca07837c9b46:49311:50233 [0] NCCL INFO comm 0x10443760 rank 1 nranks 2 cudaDev 0 busId 8000 - Destroy COMPLETE
ca07837c9b46:49311:50241 [0] NCCL INFO comm 0x118b57f0 rank 0 nranks 1 cudaDev 0 busId 8000 - Destroy COMPLETE
ca07837c9b46:49311:50249 [0] NCCL INFO comm 0x12935450 rank 8 nranks 16 cudaDev 0 busId 8000 - Destroy COMPLETE
ca07837c9b46:49311:49311 [0] NVSHMEM INFO In nvshmemi_proxy_finalize
ca07837c9b46:49311:49311 [0] NVSHMEM INFO In nvshmemi_teardown_handles
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xf221210 handle->mr 0xf21a980
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xf221410 handle->mr 0xf21ca30
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x15939a80 handle->mr 0x15936740
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x15939c80 handle->mr 0x159367d0
ca07837c9b46:49311:49311 [0] NVSHMEM INFO In nvshmemi_transport_finalize
W0721 12:26:24.308000 49245 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 49311 via signal SIGTERM
W0721 12:26:24.308000 49245 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 49312 via signal SIGTERM
W0721 12:26:24.309000 49245 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 49313 via signal SIGTERM
W0721 12:26:24.309000 49245 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 49314 via signal SIGTERM
W0721 12:26:24.309000 49245 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 49315 via signal SIGTERM
W0721 12:26:24.309000 49245 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 49317 via signal SIGTERM
W0721 12:26:24.309000 49245 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 49318 via signal SIGTERM
topo:
        GPU0   GPU1   GPU2   GPU3   GPU4   GPU5   GPU6   GPU7   NIC0   NIC1   NIC2   NIC3   CPU Affinity    NUMA Affinity   GPU NUMA ID
GPU0    X      NV18   NV18   NV18   NV18   NV18   NV18   NV18   NODE   NODE   SYS    SYS    0-47,96-143     0               N/A
GPU1    NV18   X      NV18   NV18   NV18   NV18   NV18   NV18   PIX    NODE   SYS    SYS    0-47,96-143     0               N/A
GPU2    NV18   NV18   X      NV18   NV18   NV18   NV18   NV18   NODE   NODE   SYS    SYS    0-47,96-143     0               N/A
GPU3    NV18   NV18   NV18   X      NV18   NV18   NV18   NV18   NODE   PIX    SYS    SYS    0-47,96-143     0               N/A
GPU4    NV18   NV18   NV18   NV18   X      NV18   NV18   NV18   SYS    SYS    PIX    NODE   48-95,144-191   1               N/A
GPU5    NV18   NV18   NV18   NV18   NV18   X      NV18   NV18   SYS    SYS    NODE   NODE   48-95,144-191   1               N/A
GPU6    NV18   NV18   NV18   NV18   NV18   NV18   X      NV18   SYS    SYS    NODE   PIX    48-95,144-191   1               N/A
GPU7    NV18   NV18   NV18   NV18   NV18   NV18   NV18   X      SYS    SYS    NODE   NODE   48-95,144-191   1               N/A
NIC0    NODE   PIX    NODE   NODE   SYS    SYS    SYS    SYS    X      NODE   SYS    SYS
NIC1    NODE   NODE   NODE   PIX    SYS    SYS    SYS    SYS    NODE   X      SYS    SYS
NIC2    SYS    SYS    SYS    SYS    PIX    NODE   NODE   NODE   SYS    SYS    X      NODE
NIC3    SYS    SYS    SYS    SYS    NODE   NODE   PIX    NODE   SYS    SYS    NODE   X
Legend:
X = Self
SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
PIX = Connection traversing at most a single PCIe bridge
NV# = Connection traversing a bonded set of # NVLinks
NIC Legend:
NIC0: mlx5_bond_0
NIC1: mlx5_bond_1
NIC2: mlx5_bond_2
NIC3: mlx5_bond_3
env:
export NVSHMEM_DEBUG=INFO
export NVSHMEM_ENABLE_NIC_PE_MAPPING=1
export NVSHMEM_HCA_PE_MAPPING="mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
My patch below fixes this bug: after each IB resource is released in the transport finalize paths, the corresponding pointer is set to NULL, so the guarded cleanup code cannot release the same handle twice. I hope this patch can be applied in the next version of NVSHMEM.
diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
index d037ccc..24307b2 100644
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
@@ -3228,12 +3228,14 @@ int nvshmemt_ibgda_finalize(nvshmem_transport_t transport) {
if (device->pd) {
status = ftable.dealloc_pd(device->pd);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "ibv_dealloc_pd failed \n");
+ device->pd = NULL;
}
if (device->context) {
status = ftable.close_device(device->context);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
"ibv_close_device failed \n");
+ device->context = NULL;
}
}
diff --git a/src/modules/transport/ibrc/ibrc.cpp b/src/modules/transport/ibrc/ibrc.cpp
index 9eed97c..796071a 100644
--- a/src/modules/transport/ibrc/ibrc.cpp
+++ b/src/modules/transport/ibrc/ibrc.cpp
@@ -761,32 +761,38 @@ int nvshmemt_ibrc_finalize(nvshmem_transport_t transport) {
status = ftable.dereg_mr(((struct ibrc_device *)state->devices)[dev_id].bpool_mr);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
"ibv_dereg_mr failed \n");
+ ((struct ibrc_device *)state->devices)[dev_id].bpool_mr = NULL;
}
if (((struct ibrc_device *)state->devices)[dev_id].send_cq) {
status = ftable.destroy_cq(((struct ibrc_device *)state->devices)[dev_id].send_cq);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
"ibv_destroy_cq failed \n");
+ ((struct ibrc_device *)state->devices)[dev_id].send_cq = NULL;
}
if (((struct ibrc_device *)state->devices)[dev_id].recv_cq) {
status = ftable.destroy_cq(((struct ibrc_device *)state->devices)[dev_id].recv_cq);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
"ibv_destroy_cq failed \n");
+ ((struct ibrc_device *)state->devices)[dev_id].recv_cq = NULL;
}
if (((struct ibrc_device *)state->devices)[dev_id].srq) {
status = ftable.destroy_srq(((struct ibrc_device *)state->devices)[dev_id].srq);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
"ibv_destroy_srq failed \n");
+ ((struct ibrc_device *)state->devices)[dev_id].srq = NULL;
}
if (((struct ibrc_device *)state->devices)[dev_id].pd) {
status = ftable.dealloc_pd(((struct ibrc_device *)state->devices)[dev_id].pd);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
"ibv_dealloc_pd failed \n");
+ ((struct ibrc_device *)state->devices)[dev_id].pd = NULL;
}
if (((struct ibrc_device *)state->devices)[dev_id].context) {
status =
ftable.close_device(((struct ibrc_device *)state->devices)[dev_id].context);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
"ibv_close_device failed \n");
+ ((struct ibrc_device *)state->devices)[dev_id].context = NULL;
}
}
free(state->devices);
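For reference, here is a minimal standalone sketch of the pattern the patch applies (this is not NVSHMEM code; the fake_* names are invented stand-ins for the ibv_* handles and the ftable release calls): each pointer is cleared right after its resource is released, so if the finalize path visits the same device entry again, the if guards skip the already-freed handles instead of releasing them a second time.

#include <cstdio>
#include <cstdlib>

struct fake_pd { int unused; };
struct fake_ctx { int unused; };

struct fake_device {
    fake_pd *pd;
    fake_ctx *context;
};

/* Stand-ins for ftable.dealloc_pd / ftable.close_device; return 0 on success. */
static int fake_dealloc_pd(fake_pd *pd) {
    std::free(pd);
    return 0;
}

static int fake_close_device(fake_ctx *ctx) {
    std::free(ctx);
    return 0;
}

/* Finalize that stays safe if it runs over the same device entry twice. */
static int device_finalize(fake_device *dev) {
    int status;

    if (dev->pd) {
        status = fake_dealloc_pd(dev->pd);
        if (status) return status;
        dev->pd = NULL; /* guard: a later pass skips this handle */
    }
    if (dev->context) {
        status = fake_close_device(dev->context);
        if (status) return status;
        dev->context = NULL; /* same guard for the device context */
    }
    return 0;
}

int main(void) {
    fake_device dev;
    dev.pd = static_cast<fake_pd *>(std::calloc(1, sizeof(fake_pd)));
    dev.context = static_cast<fake_ctx *>(std::calloc(1, sizeof(fake_ctx)));

    /* The first call releases both handles; the second call is a no-op
       because the pointers were cleared, instead of a double release. */
    std::printf("first finalize:  %d\n", device_finalize(&dev));
    std::printf("second finalize: %d\n", device_finalize(&dev));
    return 0;
}

Built with a plain g++ invocation, the second device_finalize call returns 0 without touching the already-freed resources, which is the behavior the NULL assignments in the patch are meant to guarantee.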