NVSHMEM 3.3.9 segfaults during finalize
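
For context, a minimal sketch of the kind of program that hits the crash. This is an assumption, not the exact workload from the log below (the real run is a PyTorch multiprocessing job that uses NCCL alongside NVSHMEM); it just initializes NVSHMEM, touches the symmetric heap, and finalizes, which is where the segfault shows up.

/* hypothetical minimal reproducer, for illustration only */
#include <stdio.h>
#include <cuda_runtime.h>
#include <nvshmem.h>
#include <nvshmemx.h>

int main(void) {
    nvshmem_init();                                 /* default bootstrap */
    int mype = nvshmem_my_pe();
    int mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE);
    cudaSetDevice(mype_node);                       /* one GPU per PE on the node */

    int *buf = (int *)nvshmem_malloc(sizeof(int));  /* touch the symmetric heap */
    nvshmem_barrier_all();
    nvshmem_free(buf);
    nvshmem_barrier_all();

    printf("PE %d entering nvshmem_finalize\n", mype);
    nvshmem_finalize();                             /* segfault reported during transport teardown */
    return 0;
}

Launched across 16 PEs on the topology shown under "topo:" with the environment shown under "env:" below.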

log:

/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:559: rank 9 nranks 16 tag 0 - ENTER
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:575: rank 9 nranks 16 tag 4 - DONE
ca07837c9b46:49312:50209 [1] NCCL INFO comm 0xea31fe0 rank 9 nranks 16 cudaDev 1 busId 7e000 - Destroy COMPLETE
ca07837c9b46:49312:50217 [1] NCCL INFO comm 0xf104a50 rank 1 nranks 8 cudaDev 1 busId 7e000 - Destroy COMPLETE
ca07837c9b46:49312:50225 [1] NCCL INFO comm 0xf691b80 rank 1 nranks 8 cudaDev 1 busId 7e000 - Destroy COMPLETE
ca07837c9b46:49312:50231 [1] NCCL INFO comm 0xfc3ccd0 rank 1 nranks 2 cudaDev 1 busId 7e000 - Destroy COMPLETE
ca07837c9b46:49312:50235 [1] NCCL INFO comm 0x11074530 rank 0 nranks 1 cudaDev 1 busId 7e000 - Destroy COMPLETE
ca07837c9b46:49312:50245 [1] NCCL INFO comm 0x120ebdf0 rank 9 nranks 16 cudaDev 1 busId 7e000 - Destroy COMPLETE
ca07837c9b46:49312:49312 [1] NVSHMEM INFO In nvshmemi_proxy_finalize
ca07837c9b46:49312:49312 [1] NVSHMEM INFO In nvshmemi_teardown_handles
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xe9e3810 handle->mr 0xe9dc490
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:559: rank 11 nranks 16 tag 0 - ENTER
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:575: rank 11 nranks 16 tag 4 - DONE
ca07837c9b46:49314:50207 [3] NCCL INFO comm 0xe42f680 rank 11 nranks 16 cudaDev 3 busId c6000 - Destroy COMPLETE
ca07837c9b46:49314:50215 [3] NCCL INFO comm 0xeb17800 rank 3 nranks 8 cudaDev 3 busId c6000 - Destroy COMPLETE
ca07837c9b46:49314:50223 [3] NCCL INFO comm 0xf0a4100 rank 3 nranks 8 cudaDev 3 busId c6000 - Destroy COMPLETE
ca07837c9b46:49314:50234 [3] NCCL INFO comm 0xf64e990 rank 1 nranks 2 cudaDev 3 busId c6000 - Destroy COMPLETE
ca07837c9b46:49314:50236 [3] NCCL INFO comm 0x10a72e20 rank 0 nranks 1 cudaDev 3 busId c6000 - Destroy COMPLETE
ca07837c9b46:49314:50246 [3] NCCL INFO comm 0x11ae9a80 rank 11 nranks 16 cudaDev 3 busId c6000 - Destroy COMPLETE
ca07837c9b46:49314:49314 [3] NVSHMEM INFO In nvshmemi_proxy_finalize
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xe9e3a10 handle->mr 0xe9de540
ca07837c9b46:49314:49314 [3] NVSHMEM INFO In nvshmemi_teardown_handles
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xe3e1410 handle->mr 0xe3d9b30
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x15110470 handle->mr 0x1510cb30
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xe3e1610 handle->mr 0xe3dbbe0
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x14b37d60 handle->mr 0x14b33820
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x15110670 handle->mr 0x1510cbc0
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x14b37f60 handle->mr 0x14b338b0
ca07837c9b46:49314:49314 [3] NVSHMEM INFO In nvshmemi_transport_finalize
ca07837c9b46:49312:49312 [1] NVSHMEM INFO In nvshmemi_transport_finalize
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:559: rank 14 nranks 16 tag 0 - ENTER
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:575: rank 14 nranks 16 tag 4 - DONE
ca07837c9b46:49317:50205 [6] NCCL INFO comm 0xd9290b0 rank 14 nranks 16 cudaDev 6 busId 1a3000 - Destroy COMPLETE
ca07837c9b46:49317:50214 [6] NCCL INFO comm 0xe010e40 rank 6 nranks 8 cudaDev 6 busId 1a3000 - Destroy COMPLETE
ca07837c9b46:49317:50222 [6] NCCL INFO comm 0xe59d2f0 rank 6 nranks 8 cudaDev 6 busId 1a3000 - Destroy COMPLETE
ca07837c9b46:49317:50229 [6] NCCL INFO comm 0xeb47b20 rank 1 nranks 2 cudaDev 6 busId 1a3000 - Destroy COMPLETE
ca07837c9b46:49317:50240 [6] NCCL INFO comm 0xff6c890 rank 0 nranks 1 cudaDev 6 busId 1a3000 - Destroy COMPLETE
ca07837c9b46:49317:50250 [6] NCCL INFO comm 0x10fe34e0 rank 14 nranks 16 cudaDev 6 busId 1a3000 - Destroy COMPLETE
ca07837c9b46:49317:49317 [6] NVSHMEM INFO In nvshmemi_proxy_finalize
ca07837c9b46:49317:49317 [6] NVSHMEM INFO In nvshmemi_teardown_handles
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xd8dc610 handle->mr 0xd8d3550
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xd8dc810 handle->mr 0xd8d5600
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x14031bc0 handle->mr 0x1402c480
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:559: rank 12 nranks 16 tag 0 - ENTER
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:575: rank 12 nranks 16 tag 4 - DONE
ca07837c9b46:49315:50206 [4] NCCL INFO comm 0xdbff360 rank 12 nranks 16 cudaDev 4 busId 109000 - Destroy COMPLETE
ca07837c9b46:49315:50219 [4] NCCL INFO comm 0xe2d0f60 rank 4 nranks 8 cudaDev 4 busId 109000 - Destroy COMPLETE
ca07837c9b46:49315:50226 [4] NCCL INFO comm 0xe85f570 rank 4 nranks 8 cudaDev 4 busId 109000 - Destroy COMPLETE
ca07837c9b46:49315:50232 [4] NCCL INFO comm 0xee092a0 rank 1 nranks 2 cudaDev 4 busId 109000 - Destroy COMPLETE
ca07837c9b46:49315:50237 [4] NCCL INFO comm 0x10242020 rank 0 nranks 1 cudaDev 4 busId 109000 - Destroy COMPLETE
ca07837c9b46:49315:50243 [4] NCCL INFO comm 0x112b5820 rank 12 nranks 16 cudaDev 4 busId 109000 - Destroy COMPLETE
ca07837c9b46:49315:49315 [4] NVSHMEM INFO In nvshmemi_proxy_finalize
ca07837c9b46:49315:49315 [4] NVSHMEM INFO In nvshmemi_teardown_handles
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xdbb1a10 handle->mr 0xdba9810
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x14031dc0 handle->mr 0x1402c510
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xdbb1c10 handle->mr 0xdbab8c0
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x142d9970 handle->mr 0x142d4e30
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x142d9b70 handle->mr 0x142d4ec0
ca07837c9b46:49317:49317 [6] NVSHMEM INFO In nvshmemi_transport_finalize
ca07837c9b46:49315:49315 [4] NVSHMEM INFO In nvshmemi_transport_finalize
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:559: rank 8 nranks 16 tag 0 - ENTER
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/bootstrap/uid/bootstrap_uid.cpp:bootstrap_uid_barrier:575: rank 8 nranks 16 tag 4 - DONE
ca07837c9b46:49311:50204 [0] NCCL INFO comm 0xf2704e0 rank 8 nranks 16 cudaDev 0 busId 8000 - Destroy COMPLETE
ca07837c9b46:49311:50218 [0] NCCL INFO comm 0xf92cd30 rank 0 nranks 8 cudaDev 0 busId 8000 - Destroy COMPLETE
ca07837c9b46:49311:50221 [0] NCCL INFO comm 0xfea2850 rank 0 nranks 8 cudaDev 0 busId 8000 - Destroy COMPLETE
ca07837c9b46:49311:50233 [0] NCCL INFO comm 0x10443760 rank 1 nranks 2 cudaDev 0 busId 8000 - Destroy COMPLETE
ca07837c9b46:49311:50241 [0] NCCL INFO comm 0x118b57f0 rank 0 nranks 1 cudaDev 0 busId 8000 - Destroy COMPLETE
ca07837c9b46:49311:50249 [0] NCCL INFO comm 0x12935450 rank 8 nranks 16 cudaDev 0 busId 8000 - Destroy COMPLETE
ca07837c9b46:49311:49311 [0] NVSHMEM INFO In nvshmemi_proxy_finalize
ca07837c9b46:49311:49311 [0] NVSHMEM INFO In nvshmemi_teardown_handles
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xf221210 handle->mr 0xf21a980
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0xf221410 handle->mr 0xf21ca30
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x15939a80 handle->mr 0x15936740
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/common/transport_ib_common.cpp 129 ibv_dereg_mr handle 0x15939c80 handle->mr 0x159367d0
ca07837c9b46:49311:49311 [0] NVSHMEM INFO In nvshmemi_transport_finalize
W0721 12:26:24.308000 49245 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 49311 via signal SIGTERM
W0721 12:26:24.308000 49245 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 49312 via signal SIGTERM
W0721 12:26:24.309000 49245 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 49313 via signal SIGTERM
W0721 12:26:24.309000 49245 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 49314 via signal SIGTERM
W0721 12:26:24.309000 49245 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 49315 via signal SIGTERM
W0721 12:26:24.309000 49245 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 49317 via signal SIGTERM
W0721 12:26:24.309000 49245 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 49318 via signal SIGTERM

topo:

	GPU0	GPU1	GPU2	GPU3	GPU4	GPU5	GPU6	GPU7	NIC0	NIC1	NIC2	NIC3	CPU Affinity	NUMA Affinity	GPU NUMA ID
GPU0	 X 	NV18	NV18	NV18	NV18	NV18	NV18	NV18	NODE	NODE	SYS	SYS	0-47,96-143	0		N/A
GPU1	NV18	 X 	NV18	NV18	NV18	NV18	NV18	NV18	PIX	NODE	SYS	SYS	0-47,96-143	0		N/A
GPU2	NV18	NV18	 X 	NV18	NV18	NV18	NV18	NV18	NODE	NODE	SYS	SYS	0-47,96-143	0		N/A
GPU3	NV18	NV18	NV18	 X 	NV18	NV18	NV18	NV18	NODE	PIX	SYS	SYS	0-47,96-143	0		N/A
GPU4	NV18	NV18	NV18	NV18	 X 	NV18	NV18	NV18	SYS	SYS	PIX	NODE	48-95,144-191	1		N/A
GPU5	NV18	NV18	NV18	NV18	NV18	 X 	NV18	NV18	SYS	SYS	NODE	NODE	48-95,144-191	1		N/A
GPU6	NV18	NV18	NV18	NV18	NV18	NV18	 X 	NV18	SYS	SYS	NODE	PIX	48-95,144-191	1		N/A
GPU7	NV18	NV18	NV18	NV18	NV18	NV18	NV18	 X 	SYS	SYS	NODE	NODE	48-95,144-191	1		N/A
NIC0	NODE	PIX	NODE	NODE	SYS	SYS	SYS	SYS	 X 	NODE	SYS	SYS
NIC1	NODE	NODE	NODE	PIX	SYS	SYS	SYS	SYS	NODE	 X 	SYS	SYS
NIC2	SYS	SYS	SYS	SYS	PIX	NODE	NODE	NODE	SYS	SYS	 X 	NODE
NIC3	SYS	SYS	SYS	SYS	NODE	NODE	PIX	NODE	SYS	SYS	NODE	 X

Legend:

  X    = Self
  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
  PIX  = Connection traversing at most a single PCIe bridge
  NV#  = Connection traversing a bonded set of # NVLinks

NIC Legend:

  NIC0: mlx5_bond_0
  NIC1: mlx5_bond_1
  NIC2: mlx5_bond_2
  NIC3: mlx5_bond_3

env:

export NVSHMEM_DEBUG=INFO
export NVSHMEM_ENABLE_NIC_PE_MAPPING=1
export NVSHMEM_HCA_PE_MAPPING="mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"

My patch below fixes this crash. The ibgda and ibrc finalize paths free IB resources (MRs, CQs, SRQ, PD, device context) without resetting the pointers afterwards, so if the teardown walks the same device entries again it hands already-freed handles back to the verbs destroy calls and segfaults. Setting each pointer to NULL after its destroy call makes the teardown safe to re-enter. I hope this patch can be applied in the next version of NVSHMEM.
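
To illustrate the failure mode the patch guards against, a schematic sketch in plain C (hypothetical names, not NVSHMEM code): freeing a handle without clearing the pointer leaves a dangling value that a second teardown pass hands straight back to the destroy routine.

#include <stdlib.h>

struct ib_dev { void *pd; };    /* stands in for the per-device transport state */

/* Current behavior: the pointer is left dangling after the destroy call,
 * so a second pass over the same device frees it again -> segfault. */
void finalize_once(struct ib_dev *dev) {
    if (dev->pd) {
        free(dev->pd);          /* stands in for ftable.dealloc_pd(dev->pd) */
    }
}

/* Patched behavior: clearing the pointer makes teardown idempotent --
 * a second pass sees NULL and skips the destroy call. */
void finalize_idempotent(struct ib_dev *dev) {
    if (dev->pd) {
        free(dev->pd);
        dev->pd = NULL;
    }
}

The patch itself: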

diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
index d037ccc..24307b2 100644
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
@@ -3228,12 +3228,14 @@ int nvshmemt_ibgda_finalize(nvshmem_transport_t transport) {
         if (device->pd) {
             status = ftable.dealloc_pd(device->pd);
             NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "ibv_dealloc_pd failed \n");
+            device->pd = NULL;
         }

         if (device->context) {
             status = ftable.close_device(device->context);
             NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
                                   "ibv_close_device failed \n");
+            device->context = NULL;
         }
     }

diff --git a/src/modules/transport/ibrc/ibrc.cpp b/src/modules/transport/ibrc/ibrc.cpp
index 9eed97c..796071a 100644
--- a/src/modules/transport/ibrc/ibrc.cpp
+++ b/src/modules/transport/ibrc/ibrc.cpp
@@ -761,32 +761,38 @@ int nvshmemt_ibrc_finalize(nvshmem_transport_t transport) {
                 status = ftable.dereg_mr(((struct ibrc_device *)state->devices)[dev_id].bpool_mr);
                 NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
                                       "ibv_dereg_mr failed \n");
+                ((struct ibrc_device *)state->devices)[dev_id].bpool_mr = NULL;
             }
             if (((struct ibrc_device *)state->devices)[dev_id].send_cq) {
                 status = ftable.destroy_cq(((struct ibrc_device *)state->devices)[dev_id].send_cq);
                 NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
                                       "ibv_destroy_cq failed \n");
+                ((struct ibrc_device *)state->devices)[dev_id].send_cq = NULL;
             }
             if (((struct ibrc_device *)state->devices)[dev_id].recv_cq) {
                 status = ftable.destroy_cq(((struct ibrc_device *)state->devices)[dev_id].recv_cq);
                 NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
                                       "ibv_destroy_cq failed \n");
+                ((struct ibrc_device *)state->devices)[dev_id].recv_cq = NULL;
             }
             if (((struct ibrc_device *)state->devices)[dev_id].srq) {
                 status = ftable.destroy_srq(((struct ibrc_device *)state->devices)[dev_id].srq);
                 NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
                                       "ibv_destroy_srq failed \n");
+                ((struct ibrc_device *)state->devices)[dev_id].srq = NULL;
             }
             if (((struct ibrc_device *)state->devices)[dev_id].pd) {
                 status = ftable.dealloc_pd(((struct ibrc_device *)state->devices)[dev_id].pd);
                 NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
                                       "ibv_dealloc_pd failed \n");
+                ((struct ibrc_device *)state->devices)[dev_id].pd = NULL;
             }
             if (((struct ibrc_device *)state->devices)[dev_id].context) {
                 status =
                     ftable.close_device(((struct ibrc_device *)state->devices)[dev_id].context);
                 NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
                                       "ibv_close_device failed \n");
+                ((struct ibrc_device *)state->devices)[dev_id].context = NULL;
             }
         }
         free(state->devices);