More than 1 GPU not working using Tao Train

Please see the log below:

root@5d5684b73250:/workspace# cd nccl-tests/
root@5d5684b73250:/workspace/nccl-tests# make
make -C src build BUILDDIR=/workspace/nccl-tests/build
make[1]: Entering directory '/workspace/nccl-tests/src'
Compiling  timer.cc                            > /workspace/nccl-tests/build/timer.o
Compiling /workspace/nccl-tests/build/verifiable/verifiable.o
Compiling  all_reduce.cu                       > /workspace/nccl-tests/build/all_reduce.o
Compiling  common.cu                           > /workspace/nccl-tests/build/common.o
Linking  /workspace/nccl-tests/build/all_reduce.o > /workspace/nccl-tests/build/all_reduce_perf
Compiling  all_gather.cu                       > /workspace/nccl-tests/build/all_gather.o
Linking  /workspace/nccl-tests/build/all_gather.o > /workspace/nccl-tests/build/all_gather_perf
Compiling  broadcast.cu                        > /workspace/nccl-tests/build/broadcast.o
Linking  /workspace/nccl-tests/build/broadcast.o > /workspace/nccl-tests/build/broadcast_perf
Compiling  reduce_scatter.cu                   > /workspace/nccl-tests/build/reduce_scatter.o
Linking  /workspace/nccl-tests/build/reduce_scatter.o > /workspace/nccl-tests/build/reduce_scatter_perf
Compiling  reduce.cu                           > /workspace/nccl-tests/build/reduce.o
Linking  /workspace/nccl-tests/build/reduce.o > /workspace/nccl-tests/build/reduce_perf
Compiling  alltoall.cu                         > /workspace/nccl-tests/build/alltoall.o
Linking  /workspace/nccl-tests/build/alltoall.o > /workspace/nccl-tests/build/alltoall_perf
Compiling  scatter.cu                          > /workspace/nccl-tests/build/scatter.o
Linking  /workspace/nccl-tests/build/scatter.o > /workspace/nccl-tests/build/scatter_perf
Compiling  gather.cu                           > /workspace/nccl-tests/build/gather.o
Linking  /workspace/nccl-tests/build/gather.o > /workspace/nccl-tests/build/gather_perf
Compiling  sendrecv.cu                         > /workspace/nccl-tests/build/sendrecv.o
Linking  /workspace/nccl-tests/build/sendrecv.o > /workspace/nccl-tests/build/sendrecv_perf
Compiling  hypercube.cu                        > /workspace/nccl-tests/build/hypercube.o
Linking  /workspace/nccl-tests/build/hypercube.o > /workspace/nccl-tests/build/hypercube_perf
make[1]: Leaving directory '/workspace/nccl-tests/src'
root@5d5684b73250:/workspace/nccl-tests# ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
# nThread 1 nGpus 4 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
#  Rank  0 Group  0 Pid  15332 on 5d5684b73250 device  0 [0x3b] NVIDIA RTX A6000
#  Rank  1 Group  0 Pid  15332 on 5d5684b73250 device  1 [0x5e] NVIDIA RTX A6000
#  Rank  2 Group  0 Pid  15332 on 5d5684b73250 device  2 [0x86] NVIDIA RTX A6000
#  Rank  3 Group  0 Pid  15332 on 5d5684b73250 device  3 [0xaf] NVIDIA RTX A6000
5d5684b73250:15332:15332 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
5d5684b73250:15332:15332 [0] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.
5d5684b73250:15332:15332 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin (v5)
5d5684b73250:15332:15332 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.
5d5684b73250:15332:15332 [0] NCCL INFO NET/Plugin: Loaded coll plugin SHARP (v5)
5d5684b73250:15332:15332 [3] NCCL INFO cudaDriverVersion 12000
NCCL version 2.15.5+cuda11.8
5d5684b73250:15332:15346 [1] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
5d5684b73250:15332:15346 [1] NCCL INFO P2P plugin IBext
5d5684b73250:15332:15346 [1] NCCL INFO NET/IB : No device found.
5d5684b73250:15332:15346 [1] NCCL INFO NET/IB : No device found.
5d5684b73250:15332:15346 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
5d5684b73250:15332:15346 [1] NCCL INFO Using network Socket
5d5684b73250:15332:15348 [3] NCCL INFO Using network Socket
5d5684b73250:15332:15347 [2] NCCL INFO Using network Socket
5d5684b73250:15332:15345 [0] NCCL INFO Using network Socket
5d5684b73250:15332:15346 [1] NCCL INFO Setting affinity for GPU 1 to 0fffff00,000fffff
5d5684b73250:15332:15347 [2] NCCL INFO Setting affinity for GPU 2 to ffff,f00000ff,fff00000
5d5684b73250:15332:15348 [3] NCCL INFO Setting affinity for GPU 3 to ffff,f00000ff,fff00000
5d5684b73250:15332:15345 [0] NCCL INFO Setting affinity for GPU 0 to 0fffff00,000fffff
5d5684b73250:15332:15348 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] 2/-1/-1->3->0 [2] -1/-1/-1->3->2 [3] 2/-1/-1->3->0
5d5684b73250:15332:15347 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 1/-1/-1->2->3 [2] 3/-1/-1->2->1 [3] 1/-1/-1->2->3
5d5684b73250:15332:15345 [0] NCCL INFO Channel 00/04 :    0   1   2   3
5d5684b73250:15332:15345 [0] NCCL INFO Channel 01/04 :    0   3   2   1
5d5684b73250:15332:15346 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] -1/-1/-1->1->2 [2] 2/-1/-1->1->0 [3] -1/-1/-1->1->2
5d5684b73250:15332:15345 [0] NCCL INFO Channel 02/04 :    0   1   2   3
5d5684b73250:15332:15345 [0] NCCL INFO Channel 03/04 :    0   3   2   1
5d5684b73250:15332:15345 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 3/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 3/-1/-1->0->-1
5d5684b73250:15332:15347 [2] NCCL INFO Channel 00 : 2[86000] -> 3[af000] via SHM/direct/direct
5d5684b73250:15332:15345 [0] NCCL INFO Channel 00 : 0[3b000] -> 1[5e000] via SHM/direct/direct
5d5684b73250:15332:15347 [2] NCCL INFO Channel 02 : 2[86000] -> 3[af000] via SHM/direct/direct
5d5684b73250:15332:15345 [0] NCCL INFO Channel 02 : 0[3b000] -> 1[5e000] via SHM/direct/direct
5d5684b73250:15332:15348 [3] NCCL INFO Channel 00/0 : 3[af000] -> 0[3b000] via P2P/direct pointer
5d5684b73250:15332:15348 [3] NCCL INFO Channel 02/0 : 3[af000] -> 0[3b000] via P2P/direct pointer
5d5684b73250:15332:15346 [1] NCCL INFO Channel 00/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
5d5684b73250:15332:15348 [3] NCCL INFO Channel 01 : 3[af000] -> 2[86000] via SHM/direct/direct
5d5684b73250:15332:15348 [3] NCCL INFO Channel 03 : 3[af000] -> 2[86000] via SHM/direct/direct
5d5684b73250:15332:15346 [1] NCCL INFO Channel 02/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
5d5684b73250:15332:15346 [1] NCCL INFO Channel 01 : 1[5e000] -> 0[3b000] via SHM/direct/direct
5d5684b73250:15332:15346 [1] NCCL INFO Channel 03 : 1[5e000] -> 0[3b000] via SHM/direct/direct
[1679561506.255767] [5d5684b73250:15332:0]        spinlock.c:29   UCX  WARN  ucs_recursive_spinlock_destroy() failed: busy
[1679561506.255769] [5d5684b73250:15332:1]           debug.c:1289 UCX  WARN  ucs_debug_disable_signal: signal 8 was not set in ucs
[5d5684b73250:15332:0:15347] Caught signal 7 (Bus error: nonexistent physical address)
[5d5684b73250:15332:1:15345] Caught signal 7 (Bus error: nonexistent physical address)
==== backtrace (tid:  15345) ====
 0 0x0000000000014420 __funlockfile()  ???:0
 1 0x000000000018bb41 __nss_database_lookup()  ???:0
 2 0x000000000007587d ncclGroupEnd()  ???:0
 3 0x000000000007b0ef ncclGroupEnd()  ???:0
 4 0x0000000000059e97 ncclGetUniqueId()  ???:0
 5 0x00000000000489b1 ???()  /usr/lib/x86_64-linux-gnu/libnccl.so.2:0
 6 0x000000000004a655 ???()  /usr/lib/x86_64-linux-gnu/libnccl.so.2:0
 7 0x0000000000063dcc ncclRedOpDestroy()  ???:0
 8 0x0000000000008609 start_thread()  ???:0
 9 0x000000000011f133 clone()  ???:0
=================================
Bus error (core dumped)
root@5d5684b73250:/workspace/nccl-tests# ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 3
# nThread 1 nGpus 3 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
#  Rank  0 Group  0 Pid  15353 on 5d5684b73250 device  0 [0x3b] NVIDIA RTX A6000
#  Rank  1 Group  0 Pid  15353 on 5d5684b73250 device  1 [0x5e] NVIDIA RTX A6000
#  Rank  2 Group  0 Pid  15353 on 5d5684b73250 device  2 [0x86] NVIDIA RTX A6000
5d5684b73250:15353:15353 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
5d5684b73250:15353:15353 [0] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.
5d5684b73250:15353:15353 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin (v5)
5d5684b73250:15353:15353 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.
5d5684b73250:15353:15353 [0] NCCL INFO NET/Plugin: Loaded coll plugin SHARP (v5)
5d5684b73250:15353:15353 [2] NCCL INFO cudaDriverVersion 12000
NCCL version 2.15.5+cuda11.8
5d5684b73250:15353:15364 [0] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
5d5684b73250:15353:15364 [0] NCCL INFO P2P plugin IBext
5d5684b73250:15353:15364 [0] NCCL INFO NET/IB : No device found.
5d5684b73250:15353:15364 [0] NCCL INFO NET/IB : No device found.
5d5684b73250:15353:15364 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
5d5684b73250:15353:15364 [0] NCCL INFO Using network Socket
5d5684b73250:15353:15365 [1] NCCL INFO Using network Socket
5d5684b73250:15353:15366 [2] NCCL INFO Using network Socket
5d5684b73250:15353:15365 [1] NCCL INFO Setting affinity for GPU 1 to 0fffff00,000fffff
5d5684b73250:15353:15364 [0] NCCL INFO Setting affinity for GPU 0 to 0fffff00,000fffff
5d5684b73250:15353:15366 [2] NCCL INFO Setting affinity for GPU 2 to ffff,f00000ff,fff00000
5d5684b73250:15353:15365 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0
5d5684b73250:15353:15364 [0] NCCL INFO Channel 00/02 :    0   1   2
5d5684b73250:15353:15364 [0] NCCL INFO Channel 01/02 :    0   1   2
5d5684b73250:15353:15366 [2] NCCL INFO Trees [0] -1/-1/-1->2->1 [1] -1/-1/-1->2->1
5d5684b73250:15353:15364 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
5d5684b73250:15353:15366 [2] NCCL INFO Channel 00 : 2[86000] -> 0[3b000] via SHM/direct/direct
5d5684b73250:15353:15366 [2] NCCL INFO Channel 01 : 2[86000] -> 0[3b000] via SHM/direct/direct
5d5684b73250:15353:15365 [1] NCCL INFO Channel 00/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
5d5684b73250:15353:15364 [0] NCCL INFO Channel 00 : 0[3b000] -> 1[5e000] via SHM/direct/direct
5d5684b73250:15353:15364 [0] NCCL INFO Channel 01 : 0[3b000] -> 1[5e000] via SHM/direct/direct
5d5684b73250:15353:15365 [1] NCCL INFO Channel 01/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
5d5684b73250:15353:15365 [1] NCCL INFO Connected all rings
5d5684b73250:15353:15364 [0] NCCL INFO Connected all rings
5d5684b73250:15353:15366 [2] NCCL INFO Connected all rings
5d5684b73250:15353:15366 [2] NCCL INFO Channel 00/0 : 2[86000] -> 1[5e000] via P2P/direct pointer
[5d5684b73250:15353:0:15364] Caught signal 7 (Bus error: nonexistent physical address)
Bus error (core dumped)

So, there is an issue even when running nccl-tests.
Could you update NCCL inside the TAO container?

$ wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/cuda-keyring_1.0-1_all.deb
$ sudo dpkg -i cuda-keyring_1.0-1_all.deb
$ sudo apt-get update
$ sudo apt install libnccl2=2.17.1-1+cuda12.0 libnccl-dev=2.17.1-1+cuda12.0

Then, could you run the above-mentioned nccl-tests again? Thanks.
$ cd nccl-tests/
$ make
$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 3

I ran the commands inside the same Docker container.

I still get the error:

root@5d5684b73250:/workspace/nccl-tests# ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
# nThread 1 nGpus 4 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
#  Rank  0 Group  0 Pid  17487 on 5d5684b73250 device  0 [0x3b] NVIDIA RTX A6000
#  Rank  1 Group  0 Pid  17487 on 5d5684b73250 device  1 [0x5e] NVIDIA RTX A6000
#  Rank  2 Group  0 Pid  17487 on 5d5684b73250 device  2 [0x86] NVIDIA RTX A6000
#  Rank  3 Group  0 Pid  17487 on 5d5684b73250 device  3 [0xaf] NVIDIA RTX A6000
5d5684b73250:17487:17487 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker
5d5684b73250:17487:17487 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
5d5684b73250:17487:17487 [0] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.
5d5684b73250:17487:17487 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin (v5)
5d5684b73250:17487:17487 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.
5d5684b73250:17487:17487 [0] NCCL INFO NET/Plugin: Loaded coll plugin SHARP (v5)
5d5684b73250:17487:17487 [3] NCCL INFO cudaDriverVersion 12000
NCCL version 2.17.1+cuda12.0
5d5684b73250:17487:17503 [3] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
5d5684b73250:17487:17503 [3] NCCL INFO P2P plugin IBext
5d5684b73250:17487:17503 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker
5d5684b73250:17487:17503 [3] NCCL INFO NET/IB : No device found.
5d5684b73250:17487:17503 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker
5d5684b73250:17487:17503 [3] NCCL INFO NET/IB : No device found.
5d5684b73250:17487:17503 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker
5d5684b73250:17487:17503 [3] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
5d5684b73250:17487:17503 [3] NCCL INFO Using network Socket
5d5684b73250:17487:17500 [0] NCCL INFO Using network Socket
5d5684b73250:17487:17502 [2] NCCL INFO Using network Socket
5d5684b73250:17487:17501 [1] NCCL INFO Using network Socket
5d5684b73250:17487:17501 [1] NCCL INFO Setting affinity for GPU 1 to 0fffff00,000fffff
5d5684b73250:17487:17503 [3] NCCL INFO Setting affinity for GPU 3 to ffff,f00000ff,fff00000
5d5684b73250:17487:17502 [2] NCCL INFO Setting affinity for GPU 2 to ffff,f00000ff,fff00000
5d5684b73250:17487:17500 [0] NCCL INFO Setting affinity for GPU 0 to 0fffff00,000fffff
5d5684b73250:17487:17503 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] 2/-1/-1->3->0 [2] -1/-1/-1->3->2 [3] 2/-1/-1->3->0
5d5684b73250:17487:17503 [3] NCCL INFO P2P Chunksize set to 524288
5d5684b73250:17487:17502 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 1/-1/-1->2->3 [2] 3/-1/-1->2->1 [3] 1/-1/-1->2->3
5d5684b73250:17487:17501 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] -1/-1/-1->1->2 [2] 2/-1/-1->1->0 [3] -1/-1/-1->1->2
5d5684b73250:17487:17501 [1] NCCL INFO P2P Chunksize set to 524288
5d5684b73250:17487:17502 [2] NCCL INFO P2P Chunksize set to 524288
5d5684b73250:17487:17500 [0] NCCL INFO Channel 00/04 :    0   1   2   3
5d5684b73250:17487:17500 [0] NCCL INFO Channel 01/04 :    0   3   2   1
5d5684b73250:17487:17500 [0] NCCL INFO Channel 02/04 :    0   1   2   3
5d5684b73250:17487:17500 [0] NCCL INFO Channel 03/04 :    0   3   2   1
5d5684b73250:17487:17500 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 3/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 3/-1/-1->0->-1
5d5684b73250:17487:17500 [0] NCCL INFO P2P Chunksize set to 524288
5d5684b73250:17487:17503 [3] NCCL INFO Channel 00/0 : 3[af000] -> 0[3b000] via P2P/direct pointer
5d5684b73250:17487:17501 [1] NCCL INFO Channel 00/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
5d5684b73250:17487:17503 [3] NCCL INFO Channel 02/0 : 3[af000] -> 0[3b000] via P2P/direct pointer
5d5684b73250:17487:17501 [1] NCCL INFO Channel 02/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
[5d5684b73250:17487:0:17502] Caught signal 7 (Bus error: nonexistent physical address)
Bus error (core dumped)
root@5d5684b73250:/workspace/nccl-tests# /build/all_reduce_perf -b 8 -e 128M -f 2 -g 3
bash: /build/all_reduce_perf: No such file or directory
root@5d5684b73250:/workspace/nccl-tests# ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 3
# nThread 1 nGpus 3 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
#  Rank  0 Group  0 Pid  17509 on 5d5684b73250 device  0 [0x3b] NVIDIA RTX A6000
#  Rank  1 Group  0 Pid  17509 on 5d5684b73250 device  1 [0x5e] NVIDIA RTX A6000
#  Rank  2 Group  0 Pid  17509 on 5d5684b73250 device  2 [0x86] NVIDIA RTX A6000
5d5684b73250:17509:17509 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker
5d5684b73250:17509:17509 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
5d5684b73250:17509:17509 [0] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.
5d5684b73250:17509:17509 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin (v5)
5d5684b73250:17509:17509 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.
5d5684b73250:17509:17509 [0] NCCL INFO NET/Plugin: Loaded coll plugin SHARP (v5)
5d5684b73250:17509:17509 [2] NCCL INFO cudaDriverVersion 12000
NCCL version 2.17.1+cuda12.0
5d5684b73250:17509:17522 [2] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
5d5684b73250:17509:17522 [2] NCCL INFO P2P plugin IBext
5d5684b73250:17509:17522 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker
5d5684b73250:17509:17522 [2] NCCL INFO NET/IB : No device found.
5d5684b73250:17509:17522 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker
5d5684b73250:17509:17522 [2] NCCL INFO NET/IB : No device found.
5d5684b73250:17509:17522 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker
5d5684b73250:17509:17522 [2] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
5d5684b73250:17509:17522 [2] NCCL INFO Using network Socket
5d5684b73250:17509:17520 [0] NCCL INFO Using network Socket
5d5684b73250:17509:17521 [1] NCCL INFO Using network Socket
5d5684b73250:17509:17520 [0] NCCL INFO Setting affinity for GPU 0 to 0fffff00,000fffff
5d5684b73250:17509:17521 [1] NCCL INFO Setting affinity for GPU 1 to 0fffff00,000fffff
5d5684b73250:17509:17522 [2] NCCL INFO Setting affinity for GPU 2 to ffff,f00000ff,fff00000
5d5684b73250:17509:17521 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0
5d5684b73250:17509:17521 [1] NCCL INFO P2P Chunksize set to 524288
5d5684b73250:17509:17522 [2] NCCL INFO Trees [0] -1/-1/-1->2->1 [1] -1/-1/-1->2->1
5d5684b73250:17509:17522 [2] NCCL INFO P2P Chunksize set to 524288
5d5684b73250:17509:17520 [0] NCCL INFO Channel 00/02 :    0   1   2
5d5684b73250:17509:17520 [0] NCCL INFO Channel 01/02 :    0   1   2
5d5684b73250:17509:17520 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
5d5684b73250:17509:17520 [0] NCCL INFO P2P Chunksize set to 524288
[5d5684b73250:17509:0:17521] Caught signal 7 (Bus error: nonexistent physical address)
Bus error (core dumped)
'''

Could you share the result of
$ nvidia-smi topo -m
$ ifconfig -s

For $ nvidia-smi topo -m:

	GPU0	GPU1	GPU2	GPU3	CPU Affinity	NUMA Affinity
GPU0	 X 	NODE	SYS	NV4	0-19,40-59	0
GPU1	NODE	 X 	NV4	SYS	0-19,40-59	0
GPU2	SYS	NV4	 X 	NODE	20-39,60-79	1
GPU3	NV4	SYS	NODE	 X 	20-39,60-79	1

Legend:

  X    = Self
  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
  PIX  = Connection traversing at most a single PCIe bridge
  NV#  = Connection traversing a bonded set of # NVLinks

For $ ifconfig -s:

Iface     MTU    RX-OK RX-ERR RX-DRP RX-OVR    TX-OK TX-ERR TX-DRP TX-OVR Flg
docker0   1500    78038      0      0 0        154832      0      0      0 BMRU
enp4s0    1500        0      0      0 0             0      0      0      0 BMU
enp5s0    1500        0      0      0 0             0      0      0      0 BMU
lo       65536  3225177      0      0 0       3225177      0      0      0 LRU
vethaf5b  1500    78038      0      0 0        154981      0      0      0 BMRU
wlx7cc2c  1500  7161924      0      0 0       2473068      0      0      0 BMRU

For 2-GPU training, can you run the 6 experiments below by adding each of the following options to the command line? Please share the results and let us know whether all of them work.

--gpu_index 0 1
--gpu_index 0 2
--gpu_index 0 3
--gpu_index 1 2
--gpu_index 1 3
--gpu_index 2 3

Also, to narrow this down, could you try using the 515 driver instead?

sudo apt purge nvidia-driver-525
sudo apt autoremove
sudo apt autoclean

sudo apt install nvidia-driver-515

I used this command with the detectnet train, and I get the same error for all the combinations:

Traceback (most recent call last):
  File "/usr/local/bin/detectnet_v2", line 8, in <module>
    sys.exit(main())
  File "<frozen iva.detectnet_v2.entrypoint.detectnet_v2>", line 12, in main
  File "<frozen iva.common.entrypoint.entrypoint>", line 268, in launch_job
  File "<frozen iva.common.entrypoint.entrypoint>", line 46, in get_modules
  File "/usr/lib/python3.6/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 994, in _gcd_import
  File "<frozen importlib._bootstrap>", line 971, in _find_and_load
  File "<frozen importlib._bootstrap>", line 955, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 665, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 678, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "</usr/local/lib/python3.6/dist-packages/iva/detectnet_v2/scripts/train.py>", line 3, in <module>
  File "<frozen iva.detectnet_v2.scripts.train>", line 35, in <module>
  File "</usr/local/lib/python3.6/dist-packages/iva/common/mlops/clearml.py>", line 1, in <module>
  File "<frozen iva.common.mlops.clearml>", line 9, in <module>
  File "/usr/local/lib/python3.6/dist-packages/clearml/__init__.py", line 5, in <module>
    from .task import Task
  File "/usr/local/lib/python3.6/dist-packages/clearml/task.py", line 45, in <module>
    from .backend_interface.metrics import Metrics
  File "/usr/local/lib/python3.6/dist-packages/clearml/backend_interface/__init__.py", line 2, in <module>
    from .task import Task
  File "/usr/local/lib/python3.6/dist-packages/clearml/backend_interface/task/__init__.py", line 1, in <module>
    from .task import Task
  File "/usr/local/lib/python3.6/dist-packages/clearml/backend_interface/task/task.py", line 32, in <module>
    from ...binding.artifacts import Artifacts
  File "/usr/local/lib/python3.6/dist-packages/clearml/binding/artifacts.py", line 23, in <module>
    from ..backend_interface.metrics.events import UploadEvent
  File "/usr/local/lib/python3.6/dist-packages/clearml/backend_interface/metrics/__init__.py", line 2, in <module>
    from .interface import Metrics
  File "/usr/local/lib/python3.6/dist-packages/clearml/backend_interface/metrics/interface.py", line 14, in <module>
    from ...storage.helper import StorageHelper
  File "/usr/local/lib/python3.6/dist-packages/clearml/storage/__init__.py", line 2, in <module>
    from .manager import StorageManager
  File "/usr/local/lib/python3.6/dist-packages/clearml/storage/manager.py", line 15, in <module>
    from .cache import CacheManager
  File "/usr/local/lib/python3.6/dist-packages/clearml/storage/cache.py", line 12, in <module>
    from .helper import StorageHelper
  File "/usr/local/lib/python3.6/dist-packages/clearml/storage/helper.py", line 44, in <module>
    from ..utilities.process.mp import ForkSafeRLock, SafeEvent
  File "/usr/local/lib/python3.6/dist-packages/clearml/utilities/process/mp.py", line 36, in <module>
    class _ForkSafeThreadSyncObject(object):
  File "/usr/local/lib/python3.6/dist-packages/clearml/utilities/process/mp.py", line 37, in _ForkSafeThreadSyncObject
    __process_lock = get_context("fork" if sys.platform == "linux" else "spawn").Lock()
  File "/usr/lib/python3.6/multiprocessing/context.py", line 67, in Lock
    return Lock(ctx=self.get_context())
  File "/usr/lib/python3.6/multiprocessing/synchronize.py", line 162, in __init__
    SemLock.__init__(self, SEMAPHORE, 1, 1, ctx=ctx)
  File "/usr/lib/python3.6/multiprocessing/synchronize.py", line 59, in __init__
    unlink_now)
OSError: [Errno 28] No space left on device

Can you free some disk space? The error log is new and points to “no space left on device”.

Surely I have enough?

Filesystem      Size  Used Avail Use% Mounted on
udev            378G     0  378G   0% /dev
tmpfs            76G  3.9M   76G   1% /run
/dev/nvme0n1p3  1.8T  248G  1.5T  15% /
tmpfs           378G  1.3M  378G   1% /dev/shm
tmpfs           5.0M  4.0K  5.0M   1% /run/lock
tmpfs           378G     0  378G   0% /sys/fs/cgroup
/dev/loop0      128K  128K     0 100% /snap/bare/5
/dev/loop2       50M   50M     0 100% /snap/snapd/18596
/dev/loop1       64M   64M     0 100% /snap/core20/1828
/dev/loop3       64M   64M     0 100% /snap/core20/1852
/dev/loop4      249M  249M     0 100% /snap/gnome-3-38-2004/99
/dev/loop5       66M   66M     0 100% /snap/gtk-common-themes/1519
/dev/loop6       50M   50M     0 100% /snap/snapd/18357
/dev/loop7       46M   46M     0 100% /snap/snap-store/638
/dev/loop9      347M  347M     0 100% /snap/gnome-3-38-2004/119
/dev/loop10      92M   92M     0 100% /snap/gtk-common-themes/1535
/dev/loop8       46M   46M     0 100% /snap/snap-store/599
/dev/nvme0n1p1  523M  5.3M  518M   1% /boot/efi
tmpfs            76G   20K   76G   1% /run/user/125
tmpfs            76G   84K   76G   1% /run/user/1000

Please change to the 515 driver as mentioned above, then check whether nccl-tests runs well with 2 GPUs, 3 GPUs and 4 GPUs.

Okay, I did exactly that with the NVIDIA 515 driver.

Using nccl-tests, please see the results:

4 GPUS:

# nThread 1 nGpus 4 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
#  Rank  0 Group  0 Pid    124 on a2910a560c15 device  0 [0x3b] NVIDIA RTX A6000
#  Rank  1 Group  0 Pid    124 on a2910a560c15 device  1 [0x5e] NVIDIA RTX A6000
#  Rank  2 Group  0 Pid    124 on a2910a560c15 device  2 [0x86] NVIDIA RTX A6000
#  Rank  3 Group  0 Pid    124 on a2910a560c15 device  3 [0xaf] NVIDIA RTX A6000
a2910a560c15:124:124 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
a2910a560c15:124:124 [0] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.
a2910a560c15:124:124 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin (v5)
a2910a560c15:124:124 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.
a2910a560c15:124:124 [0] NCCL INFO NET/Plugin: Loaded coll plugin SHARP (v5)
a2910a560c15:124:124 [3] NCCL INFO cudaDriverVersion 11080
NCCL version 2.15.5+cuda11.8
a2910a560c15:124:137 [0] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
a2910a560c15:124:137 [0] NCCL INFO P2P plugin IBext
a2910a560c15:124:137 [0] NCCL INFO NET/IB : No device found.
a2910a560c15:124:137 [0] NCCL INFO NET/IB : No device found.
a2910a560c15:124:137 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
a2910a560c15:124:137 [0] NCCL INFO Using network Socket
a2910a560c15:124:140 [3] NCCL INFO Using network Socket
a2910a560c15:124:138 [1] NCCL INFO Using network Socket
a2910a560c15:124:139 [2] NCCL INFO Using network Socket
a2910a560c15:124:140 [3] NCCL INFO Setting affinity for GPU 3 to ffff,f00000ff,fff00000
a2910a560c15:124:139 [2] NCCL INFO Setting affinity for GPU 2 to ffff,f00000ff,fff00000
a2910a560c15:124:138 [1] NCCL INFO Setting affinity for GPU 1 to 0fffff00,000fffff
a2910a560c15:124:137 [0] NCCL INFO Setting affinity for GPU 0 to 0fffff00,000fffff
a2910a560c15:124:140 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] 2/-1/-1->3->0 [2] -1/-1/-1->3->2 [3] 2/-1/-1->3->0
a2910a560c15:124:139 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 1/-1/-1->2->3 [2] 3/-1/-1->2->1 [3] 1/-1/-1->2->3
a2910a560c15:124:137 [0] NCCL INFO Channel 00/04 :    0   1   2   3
a2910a560c15:124:137 [0] NCCL INFO Channel 01/04 :    0   3   2   1
a2910a560c15:124:137 [0] NCCL INFO Channel 02/04 :    0   1   2   3
a2910a560c15:124:138 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] -1/-1/-1->1->2 [2] 2/-1/-1->1->0 [3] -1/-1/-1->1->2
a2910a560c15:124:137 [0] NCCL INFO Channel 03/04 :    0   3   2   1
a2910a560c15:124:137 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 3/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 3/-1/-1->0->-1
a2910a560c15:124:139 [2] NCCL INFO Channel 00 : 2[86000] -> 3[af000] via SHM/direct/direct
a2910a560c15:124:139 [2] NCCL INFO Channel 02 : 2[86000] -> 3[af000] via SHM/direct/direct
a2910a560c15:124:137 [0] NCCL INFO Channel 00 : 0[3b000] -> 1[5e000] via SHM/direct/direct
a2910a560c15:124:137 [0] NCCL INFO Channel 02 : 0[3b000] -> 1[5e000] via SHM/direct/direct
a2910a560c15:124:140 [3] NCCL INFO Channel 00/0 : 3[af000] -> 0[3b000] via P2P/direct pointer
a2910a560c15:124:138 [1] NCCL INFO Channel 00/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
a2910a560c15:124:140 [3] NCCL INFO Channel 02/0 : 3[af000] -> 0[3b000] via P2P/direct pointer
a2910a560c15:124:140 [3] NCCL INFO Channel 01 : 3[af000] -> 2[86000] via SHM/direct/direct
a2910a560c15:124:138 [1] NCCL INFO Channel 02/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
a2910a560c15:124:140 [3] NCCL INFO Channel 03 : 3[af000] -> 2[86000] via SHM/direct/direct
a2910a560c15:124:138 [1] NCCL INFO Channel 01 : 1[5e000] -> 0[3b000] via SHM/direct/direct
a2910a560c15:124:138 [1] NCCL INFO Channel 03 : 1[5e000] -> 0[3b000] via SHM/direct/direct
[1679903531.027312] [a2910a560c15:124  :0]           debug.c:1289 UCX  WARN  ucs_debug_disable_signal: signal 1 was not set in ucs
[a2910a560c15:124  :1:137] Caught signal 7 (Bus error: nonexistent physical address)
[a2910a560c15:124  :0:139] Caught signal 7 (Bus error: nonexistent physical address)
==== backtrace (tid:    139) ====
 0 0x0000000000014420 __funlockfile()  ???:0
 1 0x000000000018bb41 __nss_database_lookup()  ???:0
 2 0x000000000007587d ncclGroupEnd()  ???:0
 3 0x000000000007b0ef ncclGroupEnd()  ???:0
 4 0x0000000000059e97 ncclGetUniqueId()  ???:0
 5 0x00000000000489b1 ???()  /usr/lib/x86_64-linux-gnu/libnccl.so.2:0
 6 0x000000000004a655 ???()  /usr/lib/x86_64-linux-gnu/libnccl.so.2:0
 7 0x0000000000063dcc ncclRedOpDestroy()  ???:0
 8 0x0000000000008609 start_thread()  ???:0
 9 0x000000000011f133 clone()  ???:0
=================================
Bus error (core dumped)

For 3 GPUs:

# nThread 1 nGpus 3 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
#  Rank  0 Group  0 Pid    145 on a2910a560c15 device  0 [0x3b] NVIDIA RTX A6000
#  Rank  1 Group  0 Pid    145 on a2910a560c15 device  1 [0x5e] NVIDIA RTX A6000
#  Rank  2 Group  0 Pid    145 on a2910a560c15 device  2 [0x86] NVIDIA RTX A6000
a2910a560c15:145:145 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
a2910a560c15:145:145 [0] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.
a2910a560c15:145:145 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin (v5)
a2910a560c15:145:145 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.
a2910a560c15:145:145 [0] NCCL INFO NET/Plugin: Loaded coll plugin SHARP (v5)
a2910a560c15:145:145 [2] NCCL INFO cudaDriverVersion 11080
NCCL version 2.15.5+cuda11.8
a2910a560c15:145:156 [0] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
a2910a560c15:145:156 [0] NCCL INFO P2P plugin IBext
a2910a560c15:145:156 [0] NCCL INFO NET/IB : No device found.
a2910a560c15:145:156 [0] NCCL INFO NET/IB : No device found.
a2910a560c15:145:156 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
a2910a560c15:145:156 [0] NCCL INFO Using network Socket
a2910a560c15:145:157 [1] NCCL INFO Using network Socket
a2910a560c15:145:158 [2] NCCL INFO Using network Socket
a2910a560c15:145:157 [1] NCCL INFO Setting affinity for GPU 1 to 0fffff00,000fffff
a2910a560c15:145:156 [0] NCCL INFO Setting affinity for GPU 0 to 0fffff00,000fffff
a2910a560c15:145:158 [2] NCCL INFO Setting affinity for GPU 2 to ffff,f00000ff,fff00000
a2910a560c15:145:157 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0
a2910a560c15:145:158 [2] NCCL INFO Trees [0] -1/-1/-1->2->1 [1] -1/-1/-1->2->1
a2910a560c15:145:156 [0] NCCL INFO Channel 00/02 :    0   1   2
a2910a560c15:145:156 [0] NCCL INFO Channel 01/02 :    0   1   2
a2910a560c15:145:156 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
a2910a560c15:145:158 [2] NCCL INFO Channel 00 : 2[86000] -> 0[3b000] via SHM/direct/direct
a2910a560c15:145:158 [2] NCCL INFO Channel 01 : 2[86000] -> 0[3b000] via SHM/direct/direct
a2910a560c15:145:157 [1] NCCL INFO Channel 00/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
a2910a560c15:145:156 [0] NCCL INFO Channel 00 : 0[3b000] -> 1[5e000] via SHM/direct/direct
a2910a560c15:145:156 [0] NCCL INFO Channel 01 : 0[3b000] -> 1[5e000] via SHM/direct/direct
a2910a560c15:145:157 [1] NCCL INFO Channel 01/0 : 1[5e000] -> 2[86000] via P2P/direct pointer
a2910a560c15:145:157 [1] NCCL INFO Connected all rings
a2910a560c15:145:156 [0] NCCL INFO Connected all rings
[a2910a560c15:145  :0:156] Caught signal 7 (Bus error: nonexistent physical address)
a2910a560c15:145:158 [2] NCCL INFO Connected all rings
a2910a560c15:145:158 [2] NCCL INFO Channel 00/0 : 2[86000] -> 1[5e000] via P2P/direct pointer
Bus error (core dumped)

For 2 GPUs:

# nThread 1 nGpus 2 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
#  Rank  0 Group  0 Pid    162 on a2910a560c15 device  0 [0x3b] NVIDIA RTX A6000
#  Rank  1 Group  0 Pid    162 on a2910a560c15 device  1 [0x5e] NVIDIA RTX A6000
a2910a560c15:162:162 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
a2910a560c15:162:162 [0] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.
a2910a560c15:162:162 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin (v5)
a2910a560c15:162:162 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.
a2910a560c15:162:162 [0] NCCL INFO NET/Plugin: Loaded coll plugin SHARP (v5)
a2910a560c15:162:162 [1] NCCL INFO cudaDriverVersion 11080
NCCL version 2.15.5+cuda11.8
a2910a560c15:162:171 [0] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
a2910a560c15:162:171 [0] NCCL INFO P2P plugin IBext
a2910a560c15:162:171 [0] NCCL INFO NET/IB : No device found.
a2910a560c15:162:171 [0] NCCL INFO NET/IB : No device found.
a2910a560c15:162:171 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
a2910a560c15:162:171 [0] NCCL INFO Using network Socket
a2910a560c15:162:172 [1] NCCL INFO Using network Socket
a2910a560c15:162:171 [0] NCCL INFO Setting affinity for GPU 0 to 0fffff00,000fffff
a2910a560c15:162:172 [1] NCCL INFO Setting affinity for GPU 1 to 0fffff00,000fffff
a2910a560c15:162:171 [0] NCCL INFO Channel 00/02 :    0   1
a2910a560c15:162:171 [0] NCCL INFO Channel 01/02 :    0   1
a2910a560c15:162:171 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
a2910a560c15:162:172 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
a2910a560c15:162:172 [1] NCCL INFO Channel 00 : 1[5e000] -> 0[3b000] via SHM/direct/direct
a2910a560c15:162:171 [0] NCCL INFO Channel 00 : 0[3b000] -> 1[5e000] via SHM/direct/direct
a2910a560c15:162:172 [1] NCCL INFO Channel 01 : 1[5e000] -> 0[3b000] via SHM/direct/direct
a2910a560c15:162:171 [0] NCCL INFO Channel 01 : 0[3b000] -> 1[5e000] via SHM/direct/direct
a2910a560c15:162:172 [1] NCCL INFO Connected all rings
a2910a560c15:162:171 [0] NCCL INFO Connected all rings
a2910a560c15:162:172 [1] NCCL INFO Connected all trees
a2910a560c15:162:171 [0] NCCL INFO Connected all trees
a2910a560c15:162:172 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
a2910a560c15:162:172 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
a2910a560c15:162:171 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
a2910a560c15:162:171 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
[a2910a560c15:162  :0:174] Caught signal 7 (Bus error: nonexistent physical address)
Bus error (core dumped)

For 1 GPU:

# nThread 1 nGpus 1 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
#  Rank  0 Group  0 Pid    175 on a2910a560c15 device  0 [0x3b] NVIDIA RTX A6000
a2910a560c15:175:175 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
a2910a560c15:175:175 [0] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.
a2910a560c15:175:175 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin (v5)
a2910a560c15:175:175 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.
a2910a560c15:175:175 [0] NCCL INFO NET/Plugin: Loaded coll plugin SHARP (v5)
a2910a560c15:175:175 [0] NCCL INFO cudaDriverVersion 11080
NCCL version 2.15.5+cuda11.8
a2910a560c15:175:182 [0] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
a2910a560c15:175:182 [0] NCCL INFO P2P plugin IBext
a2910a560c15:175:182 [0] NCCL INFO NET/IB : No device found.
a2910a560c15:175:182 [0] NCCL INFO NET/IB : No device found.
a2910a560c15:175:182 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
a2910a560c15:175:182 [0] NCCL INFO Using network Socket
a2910a560c15:175:182 [0] NCCL INFO Setting affinity for GPU 0 to 0fffff00,000fffff
a2910a560c15:175:182 [0] NCCL INFO Channel 00/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 01/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 02/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 03/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 04/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 05/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 06/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 07/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 08/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 09/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 10/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 11/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 12/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 13/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 14/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 15/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 16/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 17/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 18/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 19/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 20/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 21/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 22/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 23/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 24/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 25/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 26/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 27/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 28/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 29/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 30/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Channel 31/32 :    0
a2910a560c15:175:182 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1
a2910a560c15:175:182 [0] NCCL INFO Connected all rings
a2910a560c15:175:182 [0] NCCL INFO Connected all trees
a2910a560c15:175:182 [0] NCCL INFO 32 coll channels, 32 p2p channels, 32 p2p channels per peer
a2910a560c15:175:182 [0] NCCL INFO comm 0x55ecdafdfd00 rank 0 nranks 1 cudaDev 0 busId 3b000 - Init COMPLETE
#
#                                                              out-of-place                       in-place          
#       size         count      type   redop    root     time   algbw   busbw #wrong     time   algbw   busbw #wrong
#        (B)    (elements)                               (us)  (GB/s)  (GB/s)            (us)  (GB/s)  (GB/s)       
           8             2     float     sum      -1     3.01    0.00    0.00      0     0.15    0.05    0.00      0
          16             4     float     sum      -1     4.24    0.00    0.00      0     0.18    0.09    0.00      0
          32             8     float     sum      -1     3.33    0.01    0.00      0     0.18    0.18    0.00      0
          64            16     float     sum      -1     3.31    0.02    0.00      0     0.18    0.36    0.00      0
         128            32     float     sum      -1     3.31    0.04    0.00      0     0.18    0.72    0.00      0
         256            64     float     sum      -1     3.36    0.08    0.00      0     0.18    1.43    0.00      0
         512           128     float     sum      -1     3.33    0.15    0.00      0     0.18    2.87    0.00      0
        1024           256     float     sum      -1     3.30    0.31    0.00      0     0.18    5.79    0.00      0
        2048           512     float     sum      -1     3.30    0.62    0.00      0     0.18   11.57    0.00      0
        4096          1024     float     sum      -1     3.24    1.26    0.00      0     0.18   22.90    0.00      0
        8192          2048     float     sum      -1     3.27    2.51    0.00      0     0.18   46.11    0.00      0
       16384          4096     float     sum      -1     3.26    5.03    0.00      0     0.18   92.28    0.00      0
       32768          8192     float     sum      -1     3.18   10.32    0.00      0     0.17  195.81    0.00      0
       65536         16384     float     sum      -1     3.94   16.65    0.00      0     0.16  400.46    0.00      0
      131072         32768     float     sum      -1     3.09   42.40    0.00      0     0.16  843.99    0.00      0
      262144         65536     float     sum      -1     3.15   83.12    0.00      0     0.16  1602.84    0.00      0
      524288        131072     float     sum      -1     3.57  147.03    0.00      0     0.16  3379.23    0.00      0
     1048576        262144     float     sum      -1     5.84  179.60    0.00      0     0.16  6738.92    0.00      0
     2097152        524288     float     sum      -1     8.92  235.05    0.00      0     0.17  12565.32    0.00      0
     4194304       1048576     float     sum      -1    15.93  263.30    0.00      0     0.16  26990.37    0.00      0
     8388608       2097152     float     sum      -1    28.72  292.11    0.00      0     0.16  53704.28    0.00      0
    16777216       4194304     float     sum      -1    53.11  315.90    0.00      0     0.16  107649.77    0.00      0
    33554432       8388608     float     sum      -1    102.2  328.28    0.00      0     0.16  216270.91    0.00      0
    67108864      16777216     float     sum      -1    200.3  335.02    0.00      0     0.17  394874.16    0.00      0
   134217728      33554432     float     sum      -1    397.0  338.09    0.00      0     0.15  873813.33    0.00      0
a2910a560c15:175:175 [0] NCCL INFO comm 0x55ecdafdfd00 rank 0 nranks 1 cudaDev 0 busId 3b000 - Destroy COMPLETE
# Out of bounds values : 0 OK
# Avg bus bandwidth    : 0 
#

With 2 GPUs, the test fails now. But I recall that 2 GPUs ran fine in your previous result.
Can you confirm the following?
2 GPUs + 515 driver: failed
2 GPUs + 525 driver: OK

2gpus + 515 driver: failed

OK, can you run the NCCL test outside the TAO container?

2GPUs + 515 driver + nccl test + without_tao_container

How about running the NCCL test with
2 GPUs + 525 driver?

I recall that it is ok, right?

I’m trying to run the make command for nccl-tests outside the TAO container, but I get this error. What do I need to install?

make -C src build BUILDDIR=/home/amrc_cymru/nccl-tests/build
make[1]: Entering directory '/home/amrc_cymru/nccl-tests/src'
Compiling /home/amrc_cymru/nccl-tests/build/verifiable/verifiable.o
make[1]: /usr/local/cuda/bin/nvcc: Command not found
make[1]: *** [../verifiable/verifiable.mk:11: /home/amrc_cymru/nccl-tests/build/verifiable/verifiable.o] Error 127
make[1]: Leaving directory '/home/amrc_cymru/nccl-tests/src'
make: *** [Makefile:20: src.build] Error 2

Can you check whether nvcc is available in /usr/local/cuda/bin?
$ ls /usr/local/cuda/bin

I don’t have a cuda folder in the /usr/local directory.

Please install CUDA by following the CUDA Installation Guide for Linux (CUDA 12.1 documentation).