Training with more than one GPU does not work when using TAO train

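Please see the log below. It shows nccl-tests being built and `all_reduce_perf` being run with 4 GPUs and then with 3 GPUs; both runs end in a bus error: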

root@5d5684b73250:/workspace# cd nccl-tests/
root@5d5684b73250:/workspace/nccl-tests# make
make -C src build BUILDDIR=/workspace/nccl-tests/build
make[1]: Entering directory '/workspace/nccl-tests/src'
Compiling  timer.cc                            > /workspace/nccl-tests/build/timer.o
Compiling /workspace/nccl-tests/build/verifiable/verifiable.o
Compiling  all_reduce.cu                       > /workspace/nccl-tests/build/all_reduce.o
Compiling  common.cu                           > /workspace/nccl-tests/build/common.o
Linking  /workspace/nccl-tests/build/all_reduce.o > /workspace/nccl-tests/build/all_reduce_perf
Compiling  all_gather.cu                       > /workspace/nccl-tests/build/all_gather.o
Linking  /workspace/nccl-tests/build/all_gather.o > /workspace/nccl-tests/build/all_gather_perf
Compiling  broadcast.cu                        > /workspace/nccl-tests/build/broadcast.o
Linking  /workspace/nccl-tests/build/broadcast.o > /workspace/nccl-tests/build/broadcast_perf
Compiling  reduce_scatter.cu                   > /workspace/nccl-tests/build/reduce_scatter.o
Linking  /workspace/nccl-tests/build/reduce_scatter.o > /workspace/nccl-tests/build/reduce_scatter_perf
Compiling  reduce.cu                           > /workspace/nccl-tests/build/reduce.o
Linking  /workspace/nccl-tests/build/reduce.o > /workspace/nccl-tests/build/reduce_perf
Compiling  alltoall.cu                         > /workspace/nccl-tests/build/alltoall.o
Linking  /workspace/nccl-tests/build/alltoall.o > /workspace/nccl-tests/build/alltoall_perf
Compiling  scatter.cu                          > /workspace/nccl-tests/build/scatter.o
Linking  /workspace/nccl-tests/build/scatter.o > /workspace/nccl-tests/build/scatter_perf
Compiling  gather.cu                           > /workspace/nccl-tests/build/gather.o
Linking  /workspace/nccl-tests/build/gather.o > /workspace/nccl-tests/build/gather_perf
Compiling  sendrecv.cu                         > /workspace/nccl-tests/build/sendrecv.o
Linking  /workspace/nccl-tests/build/sendrecv.o > /workspace/nccl-tests/build/sendrecv_perf
Compiling  hypercube.cu                        > /workspace/nccl-tests/build/hypercube.o
Linking  /workspace/nccl-tests/build/hypercube.o > /workspace/nccl-tests/build/hypercube_perf
make[1]: Leaving directory '/workspace/nccl-tests/src'
root@5d5684b73250:/workspace/nccl-tests# ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
# nThread 1 nGpus 4 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
#  Rank  0 Group  0 Pid  15332 on 5d5684b73250 device  0 [0x3b] NVIDIA RTX A6000
#  Rank  1 Group  0 Pid  15332 on 5d5684b73250 device  1 [0x5e] NVIDIA RTX A6000
#  Rank  2 Group  0 Pid  15332 on 5d5684b73250 device  2 [0x86] NVIDIA RTX A6000
#  Rank  3 Group  0 Pid  15332 on 5d5684b73250 device  3 [0xaf] NVIDIA RTX A6000
5d5684b73250:15332:15332 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
5d5684b73250:15332:15332 [0] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.
5d5684b73250:15332:15332 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin (v5)
5d5684b73250:15332:15332 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.
5d5684b73250:15332:15332 [0] NCCL INFO NET/Plugin: Loaded coll plugin SHARP (v5)
5d5684b73250:15332:15332 [3] NCCL INFO cudaDriverVersion 12000
NCCL version 2.15.5+cuda11.8
5d5684b73250:15332:15346 [1] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
5d5684b73250:15332:15346 [1] NCCL INFO P2P plugin IBext
5d5684b73250:15332:15346 [1] NCCL INFO NET/IB : No device found.
5d5684b73250:15332:15346 [1] NCCL INFO NET/IB : No device found.
5d5684b73250:15332:15346 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
5d5684b73250:15332:15346 [1] NCCL INFO Using network Socket
5d5684b73250:15332:15348 [3] NCCL INFO Using network Socket
5d5684b73250:15332:15347 [2] NCCL INFO Using network Socket
5d5684b73250:15332:15345 [0] NCCL INFO Using network Socket
5d5684b73250:15332:15346 [1] NCCL INFO Setting affinity for GPU 1 to 0fffff00,000fffff
5d5684b73250:15332:15347 [2] NCCL INFO Setting affinity for GPU 2 to ffff,f00000ff,fff00000
5d5684b73250:15332:15348 [3] NCCL INFO Setting affinity for GPU 3 to ffff,f00000ff,fff00000
5d5684b73250:15332:15345 [0] NCCL INFO Setting affinity for GPU 0 to 0fffff00,000fffff
5d5684b73250:15332:15348 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] 2/-1/-1->3->0 [2] -1/-1/-1->3->2 [3] 2/-1/-1->3->0
5d5684b73250:15332:15347 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 1/-1/-1->2->3 [2] 3/-1/-1->2->1 [3] 1/-1/-1->2->3
5d5684b73250:15332:15345 [0] NCCL INFO Channel 00/04 :    0   1   2   3
5d5684b73250:15332:15345 [0] NCCL INFO Channel 01/04 :    0   3   2   1
5d5684b73250:15332:15346 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] -1/-1/-1->1->2 [2] 2/-1/-1->1->0 [3] -1/-1/-1->1->2
5d5684b73250:15332:15345 [0] NCCL INFO Channel 02/04 :    0   1   2   3
5d5684b73250:15332:15345 [0] NCCL INFO Channel 03/04 :    0   3   2   1
5d5684b73250:15332:15345 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 3/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 3/-1/-1->0->-1
5d5684b73250:15332:15347 [2] NCCL INFO Channel 00 : 2[86000] -> 3[af000] via SHM/direct/direct
5d5684b73250:15332:15345 [0] NCCL INFO Channel 00 : 0[3b000] -> 1[5e000] via SHM/direct/direct
5d5684b73250:15332:15347 [2] NCCL INFO Channel 02 : 2[86000] -> 3[af000] via SHM/direct/direct
5d5684b73250:15332:15345 [0] NCCL INFO Channel 02 : 0[3b000] -> 1[5e000] via SHM/direct/direct
5d5684b73250:15332:15348 [3] NCCL INFO Channel 00/0 : 3[af000] -> 0[3b000] via P2P/direct pointer
5d5684b73250:15332:15348 [3] NCCL INFO Channel 02/0 : 3[af000] -> 0[3b000] via P2P/direct pointer
5d5684b73250:15332:15346 [1] NCCL INFO Channel 00/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
5d5684b73250:15332:15348 [3] NCCL INFO Channel 01 : 3[af000] -> 2[86000] via SHM/direct/direct
5d5684b73250:15332:15348 [3] NCCL INFO Channel 03 : 3[af000] -> 2[86000] via SHM/direct/direct
5d5684b73250:15332:15346 [1] NCCL INFO Channel 02/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
5d5684b73250:15332:15346 [1] NCCL INFO Channel 01 : 1[5e000] -> 0[3b000] via SHM/direct/direct
5d5684b73250:15332:15346 [1] NCCL INFO Channel 03 : 1[5e000] -> 0[3b000] via SHM/direct/direct
[1679561506.255767] [5d5684b73250:15332:0]        spinlock.c:29   UCX  WARN  ucs_recursive_spinlock_destroy() failed: busy
[1679561506.255769] [5d5684b73250:15332:1]           debug.c:1289 UCX  WARN  ucs_debug_disable_signal: signal 8 was not set in ucs
[5d5684b73250:15332:0:15347] Caught signal 7 (Bus error: nonexistent physical address)
[5d5684b73250:15332:1:15345] Caught signal 7 (Bus error: nonexistent physical address)
==== backtrace (tid:  15345) ====
 0 0x0000000000014420 __funlockfile()  ???:0
 1 0x000000000018bb41 __nss_database_lookup()  ???:0
 2 0x000000000007587d ncclGroupEnd()  ???:0
 3 0x000000000007b0ef ncclGroupEnd()  ???:0
 4 0x0000000000059e97 ncclGetUniqueId()  ???:0
 5 0x00000000000489b1 ???()  /usr/lib/x86_64-linux-gnu/libnccl.so.2:0
 6 0x000000000004a655 ???()  /usr/lib/x86_64-linux-gnu/libnccl.so.2:0
 7 0x0000000000063dcc ncclRedOpDestroy()  ???:0
 8 0x0000000000008609 start_thread()  ???:0
 9 0x000000000011f133 clone()  ???:0
=================================
Bus error (core dumped)
root@5d5684b73250:/workspace/nccl-tests# ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 3
# nThread 1 nGpus 3 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
#  Rank  0 Group  0 Pid  15353 on 5d5684b73250 device  0 [0x3b] NVIDIA RTX A6000
#  Rank  1 Group  0 Pid  15353 on 5d5684b73250 device  1 [0x5e] NVIDIA RTX A6000
#  Rank  2 Group  0 Pid  15353 on 5d5684b73250 device  2 [0x86] NVIDIA RTX A6000
5d5684b73250:15353:15353 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
5d5684b73250:15353:15353 [0] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.
5d5684b73250:15353:15353 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin (v5)
5d5684b73250:15353:15353 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.
5d5684b73250:15353:15353 [0] NCCL INFO NET/Plugin: Loaded coll plugin SHARP (v5)
5d5684b73250:15353:15353 [2] NCCL INFO cudaDriverVersion 12000
NCCL version 2.15.5+cuda11.8
5d5684b73250:15353:15364 [0] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
5d5684b73250:15353:15364 [0] NCCL INFO P2P plugin IBext
5d5684b73250:15353:15364 [0] NCCL INFO NET/IB : No device found.
5d5684b73250:15353:15364 [0] NCCL INFO NET/IB : No device found.
5d5684b73250:15353:15364 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
5d5684b73250:15353:15364 [0] NCCL INFO Using network Socket
5d5684b73250:15353:15365 [1] NCCL INFO Using network Socket
5d5684b73250:15353:15366 [2] NCCL INFO Using network Socket
5d5684b73250:15353:15365 [1] NCCL INFO Setting affinity for GPU 1 to 0fffff00,000fffff
5d5684b73250:15353:15364 [0] NCCL INFO Setting affinity for GPU 0 to 0fffff00,000fffff
5d5684b73250:15353:15366 [2] NCCL INFO Setting affinity for GPU 2 to ffff,f00000ff,fff00000
5d5684b73250:15353:15365 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0
5d5684b73250:15353:15364 [0] NCCL INFO Channel 00/02 :    0   1   2
5d5684b73250:15353:15364 [0] NCCL INFO Channel 01/02 :    0   1   2
5d5684b73250:15353:15366 [2] NCCL INFO Trees [0] -1/-1/-1->2->1 [1] -1/-1/-1->2->1
5d5684b73250:15353:15364 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
5d5684b73250:15353:15366 [2] NCCL INFO Channel 00 : 2[86000] -> 0[3b000] via SHM/direct/direct
5d5684b73250:15353:15366 [2] NCCL INFO Channel 01 : 2[86000] -> 0[3b000] via SHM/direct/direct
5d5684b73250:15353:15365 [1] NCCL INFO Channel 00/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
5d5684b73250:15353:15364 [0] NCCL INFO Channel 00 : 0[3b000] -> 1[5e000] via SHM/direct/direct
5d5684b73250:15353:15364 [0] NCCL INFO Channel 01 : 0[3b000] -> 1[5e000] via SHM/direct/direct
5d5684b73250:15353:15365 [1] NCCL INFO Channel 01/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
5d5684b73250:15353:15365 [1] NCCL INFO Connected all rings
5d5684b73250:15353:15364 [0] NCCL INFO Connected all rings
5d5684b73250:15353:15366 [2] NCCL INFO Connected all rings
5d5684b73250:15353:15366 [2] NCCL INFO Channel 00/0 : 2[86000] -> 1[5e000] via P2P/direct pointer
[5d5684b73250:15353:0:15364] Caught signal 7 (Bus error: nonexistent physical address)
Bus error (core dumped)