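Please see the log below. The nccl-tests build completes, but `all_reduce_perf` then crashes with "Bus error (core dumped)" (signal 7, "nonexistent physical address") on both the 4-GPU and the 3-GPU run: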
root@5d5684b73250:/workspace# cd nccl-tests/
root@5d5684b73250:/workspace/nccl-tests# make
make -C src build BUILDDIR=/workspace/nccl-tests/build
make[1]: Entering directory '/workspace/nccl-tests/src'
Compiling timer.cc > /workspace/nccl-tests/build/timer.o
Compiling /workspace/nccl-tests/build/verifiable/verifiable.o
Compiling all_reduce.cu > /workspace/nccl-tests/build/all_reduce.o
Compiling common.cu > /workspace/nccl-tests/build/common.o
Linking /workspace/nccl-tests/build/all_reduce.o > /workspace/nccl-tests/build/all_reduce_perf
Compiling all_gather.cu > /workspace/nccl-tests/build/all_gather.o
Linking /workspace/nccl-tests/build/all_gather.o > /workspace/nccl-tests/build/all_gather_perf
Compiling broadcast.cu > /workspace/nccl-tests/build/broadcast.o
Linking /workspace/nccl-tests/build/broadcast.o > /workspace/nccl-tests/build/broadcast_perf
Compiling reduce_scatter.cu > /workspace/nccl-tests/build/reduce_scatter.o
Linking /workspace/nccl-tests/build/reduce_scatter.o > /workspace/nccl-tests/build/reduce_scatter_perf
Compiling reduce.cu > /workspace/nccl-tests/build/reduce.o
Linking /workspace/nccl-tests/build/reduce.o > /workspace/nccl-tests/build/reduce_perf
Compiling alltoall.cu > /workspace/nccl-tests/build/alltoall.o
Linking /workspace/nccl-tests/build/alltoall.o > /workspace/nccl-tests/build/alltoall_perf
Compiling scatter.cu > /workspace/nccl-tests/build/scatter.o
Linking /workspace/nccl-tests/build/scatter.o > /workspace/nccl-tests/build/scatter_perf
Compiling gather.cu > /workspace/nccl-tests/build/gather.o
Linking /workspace/nccl-tests/build/gather.o > /workspace/nccl-tests/build/gather_perf
Compiling sendrecv.cu > /workspace/nccl-tests/build/sendrecv.o
Linking /workspace/nccl-tests/build/sendrecv.o > /workspace/nccl-tests/build/sendrecv_perf
Compiling hypercube.cu > /workspace/nccl-tests/build/hypercube.o
Linking /workspace/nccl-tests/build/hypercube.o > /workspace/nccl-tests/build/hypercube_perf
make[1]: Leaving directory '/workspace/nccl-tests/src'
root@5d5684b73250:/workspace/nccl-tests# ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
# nThread 1 nGpus 4 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
# Rank 0 Group 0 Pid 15332 on 5d5684b73250 device 0 [0x3b] NVIDIA RTX A6000
# Rank 1 Group 0 Pid 15332 on 5d5684b73250 device 1 [0x5e] NVIDIA RTX A6000
# Rank 2 Group 0 Pid 15332 on 5d5684b73250 device 2 [0x86] NVIDIA RTX A6000
# Rank 3 Group 0 Pid 15332 on 5d5684b73250 device 3 [0xaf] NVIDIA RTX A6000
5d5684b73250:15332:15332 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
5d5684b73250:15332:15332 [0] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.
5d5684b73250:15332:15332 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin (v5)
5d5684b73250:15332:15332 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.
5d5684b73250:15332:15332 [0] NCCL INFO NET/Plugin: Loaded coll plugin SHARP (v5)
5d5684b73250:15332:15332 [3] NCCL INFO cudaDriverVersion 12000
NCCL version 2.15.5+cuda11.8
5d5684b73250:15332:15346 [1] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
5d5684b73250:15332:15346 [1] NCCL INFO P2P plugin IBext
5d5684b73250:15332:15346 [1] NCCL INFO NET/IB : No device found.
5d5684b73250:15332:15346 [1] NCCL INFO NET/IB : No device found.
5d5684b73250:15332:15346 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
5d5684b73250:15332:15346 [1] NCCL INFO Using network Socket
5d5684b73250:15332:15348 [3] NCCL INFO Using network Socket
5d5684b73250:15332:15347 [2] NCCL INFO Using network Socket
5d5684b73250:15332:15345 [0] NCCL INFO Using network Socket
5d5684b73250:15332:15346 [1] NCCL INFO Setting affinity for GPU 1 to 0fffff00,000fffff
5d5684b73250:15332:15347 [2] NCCL INFO Setting affinity for GPU 2 to ffff,f00000ff,fff00000
5d5684b73250:15332:15348 [3] NCCL INFO Setting affinity for GPU 3 to ffff,f00000ff,fff00000
5d5684b73250:15332:15345 [0] NCCL INFO Setting affinity for GPU 0 to 0fffff00,000fffff
5d5684b73250:15332:15348 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] 2/-1/-1->3->0 [2] -1/-1/-1->3->2 [3] 2/-1/-1->3->0
5d5684b73250:15332:15347 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 1/-1/-1->2->3 [2] 3/-1/-1->2->1 [3] 1/-1/-1->2->3
5d5684b73250:15332:15345 [0] NCCL INFO Channel 00/04 : 0 1 2 3
5d5684b73250:15332:15345 [0] NCCL INFO Channel 01/04 : 0 3 2 1
5d5684b73250:15332:15346 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] -1/-1/-1->1->2 [2] 2/-1/-1->1->0 [3] -1/-1/-1->1->2
5d5684b73250:15332:15345 [0] NCCL INFO Channel 02/04 : 0 1 2 3
5d5684b73250:15332:15345 [0] NCCL INFO Channel 03/04 : 0 3 2 1
5d5684b73250:15332:15345 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 3/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 3/-1/-1->0->-1
5d5684b73250:15332:15347 [2] NCCL INFO Channel 00 : 2[86000] -> 3[af000] via SHM/direct/direct
5d5684b73250:15332:15345 [0] NCCL INFO Channel 00 : 0[3b000] -> 1[5e000] via SHM/direct/direct
5d5684b73250:15332:15347 [2] NCCL INFO Channel 02 : 2[86000] -> 3[af000] via SHM/direct/direct
5d5684b73250:15332:15345 [0] NCCL INFO Channel 02 : 0[3b000] -> 1[5e000] via SHM/direct/direct
5d5684b73250:15332:15348 [3] NCCL INFO Channel 00/0 : 3[af000] -> 0[3b000] via P2P/direct pointer
5d5684b73250:15332:15348 [3] NCCL INFO Channel 02/0 : 3[af000] -> 0[3b000] via P2P/direct pointer
5d5684b73250:15332:15346 [1] NCCL INFO Channel 00/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
5d5684b73250:15332:15348 [3] NCCL INFO Channel 01 : 3[af000] -> 2[86000] via SHM/direct/direct
5d5684b73250:15332:15348 [3] NCCL INFO Channel 03 : 3[af000] -> 2[86000] via SHM/direct/direct
5d5684b73250:15332:15346 [1] NCCL INFO Channel 02/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
5d5684b73250:15332:15346 [1] NCCL INFO Channel 01 : 1[5e000] -> 0[3b000] via SHM/direct/direct
5d5684b73250:15332:15346 [1] NCCL INFO Channel 03 : 1[5e000] -> 0[3b000] via SHM/direct/direct
[1679561506.255767] [5d5684b73250:15332:0] spinlock.c:29 UCX WARN ucs_recursive_spinlock_destroy() failed: busy
[1679561506.255769] [5d5684b73250:15332:1] debug.c:1289 UCX WARN ucs_debug_disable_signal: signal 8 was not set in ucs
[5d5684b73250:15332:0:15347] Caught signal 7 (Bus error: nonexistent physical address)
[5d5684b73250:15332:1:15345] Caught signal 7 (Bus error: nonexistent physical address)
==== backtrace (tid: 15345) ====
0 0x0000000000014420 __funlockfile() ???:0
1 0x000000000018bb41 __nss_database_lookup() ???:0
2 0x000000000007587d ncclGroupEnd() ???:0
3 0x000000000007b0ef ncclGroupEnd() ???:0
4 0x0000000000059e97 ncclGetUniqueId() ???:0
5 0x00000000000489b1 ???() /usr/lib/x86_64-linux-gnu/libnccl.so.2:0
6 0x000000000004a655 ???() /usr/lib/x86_64-linux-gnu/libnccl.so.2:0
7 0x0000000000063dcc ncclRedOpDestroy() ???:0
8 0x0000000000008609 start_thread() ???:0
9 0x000000000011f133 clone() ???:0
=================================
Bus error (core dumped)
root@5d5684b73250:/workspace/nccl-tests# ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 3
# nThread 1 nGpus 3 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
# Rank 0 Group 0 Pid 15353 on 5d5684b73250 device 0 [0x3b] NVIDIA RTX A6000
# Rank 1 Group 0 Pid 15353 on 5d5684b73250 device 1 [0x5e] NVIDIA RTX A6000
# Rank 2 Group 0 Pid 15353 on 5d5684b73250 device 2 [0x86] NVIDIA RTX A6000
5d5684b73250:15353:15353 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
5d5684b73250:15353:15353 [0] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.
5d5684b73250:15353:15353 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin (v5)
5d5684b73250:15353:15353 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.
5d5684b73250:15353:15353 [0] NCCL INFO NET/Plugin: Loaded coll plugin SHARP (v5)
5d5684b73250:15353:15353 [2] NCCL INFO cudaDriverVersion 12000
NCCL version 2.15.5+cuda11.8
5d5684b73250:15353:15364 [0] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
5d5684b73250:15353:15364 [0] NCCL INFO P2P plugin IBext
5d5684b73250:15353:15364 [0] NCCL INFO NET/IB : No device found.
5d5684b73250:15353:15364 [0] NCCL INFO NET/IB : No device found.
5d5684b73250:15353:15364 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
5d5684b73250:15353:15364 [0] NCCL INFO Using network Socket
5d5684b73250:15353:15365 [1] NCCL INFO Using network Socket
5d5684b73250:15353:15366 [2] NCCL INFO Using network Socket
5d5684b73250:15353:15365 [1] NCCL INFO Setting affinity for GPU 1 to 0fffff00,000fffff
5d5684b73250:15353:15364 [0] NCCL INFO Setting affinity for GPU 0 to 0fffff00,000fffff
5d5684b73250:15353:15366 [2] NCCL INFO Setting affinity for GPU 2 to ffff,f00000ff,fff00000
5d5684b73250:15353:15365 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0
5d5684b73250:15353:15364 [0] NCCL INFO Channel 00/02 : 0 1 2
5d5684b73250:15353:15364 [0] NCCL INFO Channel 01/02 : 0 1 2
5d5684b73250:15353:15366 [2] NCCL INFO Trees [0] -1/-1/-1->2->1 [1] -1/-1/-1->2->1
5d5684b73250:15353:15364 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
5d5684b73250:15353:15366 [2] NCCL INFO Channel 00 : 2[86000] -> 0[3b000] via SHM/direct/direct
5d5684b73250:15353:15366 [2] NCCL INFO Channel 01 : 2[86000] -> 0[3b000] via SHM/direct/direct
5d5684b73250:15353:15365 [1] NCCL INFO Channel 00/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
5d5684b73250:15353:15364 [0] NCCL INFO Channel 00 : 0[3b000] -> 1[5e000] via SHM/direct/direct
5d5684b73250:15353:15364 [0] NCCL INFO Channel 01 : 0[3b000] -> 1[5e000] via SHM/direct/direct
5d5684b73250:15353:15365 [1] NCCL INFO Channel 01/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
5d5684b73250:15353:15365 [1] NCCL INFO Connected all rings
5d5684b73250:15353:15364 [0] NCCL INFO Connected all rings
5d5684b73250:15353:15366 [2] NCCL INFO Connected all rings
5d5684b73250:15353:15366 [2] NCCL INFO Channel 00/0 : 2[86000] -> 1[5e000] via P2P/direct pointer
[5d5684b73250:15353:0:15364] Caught signal 7 (Bus error: nonexistent physical address)
Bus error (core dumped)