NVSHMEM fails on multi-node GPUs. My GPU is an A5000

I installed nvshmem_src_2.19 with the following commands:

export NVSHMEM_PREFIX=~/nvshmem
export UCX_HOME=~/UCX
cmake -DNVSHMEM_PREFIX=~/nvshmem -DNVSHMEM_IBRC_SUPPORT=1 -DNVSHMEM_UCX_SUPPORT=1 \
      -DNVSHMEM_IBGDA_SUPPORT=1 -DNVSHMEM_MPI_SUPPORT=1 -DNVSHMEM_MPI_IS_OMPI=1 ..
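(For completeness, a sketch of the surrounding steps, assuming the cmake command above is run from a build directory inside the extracted source tree:)

mkdir -p build && cd build   # run the cmake command above from here
make -j$(nproc)              # build the NVSHMEM libraries and tools
make install                 # install into $NVSHMEM_PREFIX (~/nvshmem)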

Then my code is below:

#include <stdio.h>
#include "mpi.h"
#include "nvshmem.h"
#include "nvshmemx.h"
#include <unistd.h>

#define CUDA_CHECK(stmt)                                  \
do {                                                      \
    cudaError_t result = (stmt);                          \
    if (cudaSuccess != result) {                          \
        fprintf(stderr, "[%s:%d] CUDA failed with %s \n", \
         __FILE__, __LINE__, cudaGetErrorString(result)); \
        exit(-1);                                         \
    }                                                     \
} while (0)

// Each PE writes its own ID into the next PE's symmetric destination buffer.
__global__ void simple_shift(int *destination) {
    int mype = nvshmem_my_pe();
    int npes = nvshmem_n_pes();
    int peer = (mype + 1) % npes;

    nvshmem_int_p(destination, mype, peer);
}

int main (int argc, char *argv[]) {
    int mype_node, msg;
    cudaStream_t stream;
    int rank, nranks;
    char hostname[256];
    gethostname(hostname, 256);
    
    MPI_Comm mpi_comm = MPI_COMM_WORLD;
    nvshmemx_init_attr_t attr;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
    printf("Rank: %d, Hostname: %s\n", rank, hostname);

    // Bootstrap NVSHMEM on top of the existing MPI communicator.
    attr.mpi_comm = &mpi_comm;
    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
    mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE);

    // Bind this PE to the GPU matching its rank within the node,
    // then allocate the destination buffer on the NVSHMEM symmetric heap.
    CUDA_CHECK(cudaSetDevice(mype_node));
    CUDA_CHECK(cudaStreamCreate(&stream));
    int *destination = (int *) nvshmem_malloc(sizeof(int));

    simple_shift<<<1, 1, 0, stream>>>(destination);
    // Ensure all PEs' puts have completed before reading the result back.
    nvshmemx_barrier_all_on_stream(stream);
    CUDA_CHECK(cudaMemcpyAsync(&msg, destination, sizeof(int),
                cudaMemcpyDeviceToHost, stream));

    CUDA_CHECK(cudaStreamSynchronize(stream));
    printf("%d: received message %d and hostname:%s\n", nvshmem_my_pe(), msg, hostname);

    nvshmem_free(destination);
    nvshmem_finalize();
    MPI_Finalize();
    return 0;
}
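(A minimal sketch of a compile line that should work for this program, assuming NVSHMEM in ~/nvshmem, Open MPI in /usr/local/ompi, and compute capability 8.6 for the A5000; NVSHMEM requires relocatable device code:)

nvcc -rdc=true -ccbin g++ -gencode arch=compute_86,code=sm_86 \
     -I ~/nvshmem/include -I /usr/local/ompi/include \
     worker.cu -o worker \
     -L ~/nvshmem/lib -lnvshmem -lnvidia-ml -lcuda \
     -L /usr/local/ompi/lib -lmpi
# on newer releases the library may instead be split into
# -lnvshmem_host -lnvshmem_device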

My execution command is:

export NVSHMEM_DEBUG=TRACE
export NVSHMEM_DEBUG_SUBSYS=ALL
nvshmrun -n 2 -ppn 1 --host ip1:1,ip2:1 ./worker > nvshmem.log 2>&1
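(Since the program initializes NVSHMEM from MPI_COMM_WORLD, an equivalent launch through Open MPI's mpirun should also work, e.g.:)

mpirun -np 2 --host ip1:1,ip2:1 ./worker > nvshmem.log 2>&1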

My log is:

Rank: 1, Hostname: xxxxx-0-11
Rank: 0, Hostname: xxxxx-0-9
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO host name: xxxxx-0-9 hash 14953616763125465901
xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO host name: xxxxx-0-11 hash 13854007266692032502
xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO PE distribution has been identified as NVSHMEMI_PE_DIST_ROUNDROBIN
xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO PE 1 (process) affinity to 128 CPUs:
     0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
    27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
    54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
    81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
    106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
    126 127 
xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO cudaDriverVersion 12030
xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO NVSHMEM symmetric heap kind = DEVICE selected
/home/yyyy/nvshmem_src_2.10.1-3/src/host/init/init.cu:816: non-zero status: 5 nvshmem get cucontext failed 
xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO nvshmemi_common_init failed, continuing
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO PE distribution has been identified as NVSHMEMI_PE_DIST_ROUNDROBIN
NVSHMEM configuration:
  CUDA API                     12010
  CUDA Runtime                 12020
  CUDA Driver                  12030
  Build Timestamp              Mar 15 2024 08:32:29
  Build Variables             
	NVSHMEM_DEBUG=OFF NVSHMEM_DEVEL=OFF NVSHMEM_DEFAULT_PMI2=OFF
	NVSHMEM_DEFAULT_PMIX=OFF NVSHMEM_DEFAULT_UCX=OFF NVSHMEM_DISABLE_COLL_POLL=ON
	NVSHMEM_ENABLE_ALL_DEVICE_INLINING=OFF NVSHMEM_ENV_ALL=OFF
	NVSHMEM_GPU_COLL_USE_LDST=OFF NVSHMEM_IBGDA_SUPPORT=1
	NVSHMEM_IBGDA_SUPPORT_GPUMEM_ONLY=OFF NVSHMEM_IBDEVX_SUPPORT=OFF
	NVSHMEM_IBRC_SUPPORT=0 NVSHMEM_LIBFABRIC_SUPPORT=OFF NVSHMEM_MPI_SUPPORT=1
	NVSHMEM_NVTX=ON NVSHMEM_PMIX_SUPPORT=OFF NVSHMEM_SHMEM_SUPPORT=OFF
	NVSHMEM_TEST_STATIC_LIB=OFF NVSHMEM_TIMEOUT_DEVICE_POLLING=OFF NVSHMEM_TRACE=OFF
	NVSHMEM_UCX_SUPPORT=1 NVSHMEM_USE_DLMALLOC=OFF NVSHMEM_USE_NCCL=OFF
	NVSHMEM_USE_GDRCOPY=ON NVSHMEM_VERBOSE=OFF CUDA_HOME=/usr/local/cuda
	GDRCOPY_HOME=/usr/local/gdrdrv LIBFABRIC_HOME=/usr/local/libfabric
	MPI_HOME=/usr/local/ompi NCCL_HOME=/usr/local/nccl
	NVSHMEM_PREFIX=/home/yyyy/nvshmem PMIX_HOME=/usr SHMEM_HOME=/usr/local/ompi
	UCX_HOME=/home/yyyy/UCX

xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO PE 0 (process) affinity to 128 CPUs:
     0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
    27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
    54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
    81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
    106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
    126 127 
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO cudaDriverVersion 12030
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO NVSHMEM symmetric heap kind = DEVICE selected
/home/yyyy/nvshmem_src_2.10.1-3/src/host/init/init.cu:816: non-zero status: 5 nvshmem get cucontext failed 
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO nvshmemi_common_init failed, continuing
xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO NVSHMEM symmetric heap kind = DEVICE selected
xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO [1] nvshmemi_get_cucontext->cuCtxSynchronize->CUDA_SUCCESS) my_stream (nil)
xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO in get_cucontext, queried and saved context for device: 0 context: 0x43bc7e0
xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO [1] nvshmemi_get_cucontext->cuCtxGetDevice->0(CUDA_ERROR_INVALID_CONTEXT 201) cuStreamCreateWithPriority my_stream 0x4efbcb0
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO NVSHMEM symmetric heap kind = DEVICE selected
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO [0] nvshmemi_get_cucontext->cuCtxSynchronize->CUDA_SUCCESS) my_stream (nil)
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO in get_cucontext, queried and saved context for device: 0 context: 0x43a5b60
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO [0] nvshmemi_get_cucontext->cuCtxGetDevice->0(CUDA_ERROR_INVALID_CONTEXT 201) cuStreamCreateWithPriority my_stream 0x4ee61a0
xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO host name: xxxxx-0-11 hash 13854007266692032502
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO host name: xxxxx-0-9 hash 14953616763125465901
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO nvshmemi_setup_local_heap, heapextra = 285225000
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO [0] mspace ptr: 0x2e18100
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO host name: xxxxx-0-9 hash 14953616763125465901
xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO nvshmemi_setup_local_heap, heapextra = 285225000
xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO [1] mspace ptr: 0x2e18100
xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO host name: xxxxx-0-11 hash 13854007266692032502
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO UCX transport skipped in favor of: ibrc

xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO IBGDA Disabled by the environment.
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO [0x57fcc40] ndev 4 pcie_devid 0 cudevice 0 peer host hash cf85eb37e96b832d p2p host hash cf85eb37e96b832d
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO [0] reach 15 to peer 0 over transport 0
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO [0x57fcc40] ndev 4 pcie_devid 0 cudevice 0 peer host hash c043523516dbe7f6 p2p host hash cf85eb37e96b832d
xxxxx-0-9:2682207:2682207 [0] NVSHMEM INFO [0] reach 0 to peer 1 over transport 0
/home/yyyy/nvshmem_src_2.10.1-3/src/host/topo/topo.cpp:420: [GPU 0] Peer GPU 1 is not accessible, exiting ... 
/home/yyyy/nvshmem_src_2.10.1-3/src/host/init/init.cu:843: non-zero status: 3 building transport map failed 

/home/yyyy/nvshmem_src_2.10.1-3/src/host/init/init.cu:nvshmemi_check_state_and_init:933: nvshmem initialization failed, exiting 

/home/yyyy/nvshmem_src_2.10.1-3/src/util/cs.cpp:23: non-zero status: 16: File exists, exiting... mutex destroy failed 

xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO UCX transport skipped in favor of: ibrc

xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO IBGDA Disabled by the environment.
xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO [0x5812750] ndev 4 pcie_devid 0 cudevice 0 peer host hash cf85eb37e96b832d p2p host hash c043523516dbe7f6
xxxxx-0-11:3667279:3667279 [0] NVSHMEM INFO [1] reach 0 to peer 0 over transport 0
/home/yyyy/nvshmem_src_2.10.1-3/src/host/topo/topo.cpp:420: [GPU 1] Peer GPU 0 is not accessible, exiting ... 
/home/yyyy/nvshmem_src_2.10.1-3/src/host/init/init.cu:843: non-zero status: 3 building transport map failed 

/home/yyyy/nvshmem_src_2.10.1-3/src/host/init/init.cu:nvshmemi_check_state_and_init:933: nvshmem initialization failed, exiting 

/home/yyyy/nvshmem_src_2.10.1-3/src/util/cs.cpp:23: non-zero status: 16: File exists, exiting... mutex destroy failed

The new log for the command nvshmrun -n 2 -ppn 1 --host ip1:1,ip2:1 ./worker > nvshmem.log 2>&1 is:

Rank: 1, Hostname: xxxx-0-11
Rank: 0, Hostname: xxxx-0-9
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO host name: xxxx-0-9 hash 14953616763125465901
xxxx-0-11:3684783:3684783 [0] NVSHMEM INFO host name: xxxx-0-11 hash 13854007266692032502
xxxx-0-11:3684783:3684783 [0] NVSHMEM INFO PE distribution has been identified as NVSHMEMI_PE_DIST_ROUNDROBIN
xxxx-0-11:3684783:3684783 [0] NVSHMEM INFO PE 1 (process) affinity to 128 CPUs:
     0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
    27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
    54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
    81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
    106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
    126 127 
xxxx-0-11:3684783:3684783 [0] NVSHMEM INFO cudaDriverVersion 12030
xxxx-0-11:3684783:3684783 [0] NVSHMEM INFO NVSHMEM symmetric heap kind = DEVICE selected
/home/yyyy/nvshmem_src_2.11.0-5/src/host/init/init.cu:948: non-zero status: 5 nvshmem get cucontext failed 
xxxx-0-11:3684783:3684783 [0] NVSHMEM INFO nvshmemi_common_init failed, continuing
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO PE distribution has been identified as NVSHMEMI_PE_DIST_ROUNDROBIN
NVSHMEM configuration:
  CUDA API                     12010
  CUDA Runtime                 12020
  CUDA Driver                  12030
  Build Timestamp              Mar 15 2024 09:55:22
  Build Variables             
	NVSHMEM_DEBUG=OFF NVSHMEM_DEVEL=OFF NVSHMEM_DEFAULT_PMI2=OFF
	NVSHMEM_DEFAULT_PMIX=OFF NVSHMEM_DEFAULT_UCX=OFF NVSHMEM_DISABLE_COLL_POLL=ON
	NVSHMEM_ENABLE_ALL_DEVICE_INLINING=OFF NVSHMEM_GPU_COLL_USE_LDST=OFF
	NVSHMEM_IBGDA_SUPPORT=1 NVSHMEM_IBGDA_SUPPORT_GPUMEM_ONLY=OFF
	NVSHMEM_IBDEVX_SUPPORT=OFF NVSHMEM_IBRC_SUPPORT=1 NVSHMEM_LIBFABRIC_SUPPORT=OFF
	NVSHMEM_MPI_SUPPORT=1 NVSHMEM_NVTX=ON NVSHMEM_PMIX_SUPPORT=OFF
	NVSHMEM_SHMEM_SUPPORT=OFF NVSHMEM_TEST_STATIC_LIB=OFF
	NVSHMEM_TIMEOUT_DEVICE_POLLING=OFF NVSHMEM_TRACE=OFF NVSHMEM_UCX_SUPPORT=1
	NVSHMEM_USE_DLMALLOC=OFF NVSHMEM_USE_NCCL=OFF NVSHMEM_USE_GDRCOPY=ON
	NVSHMEM_VERBOSE=OFF CUDA_HOME=/usr/local/cuda GDRCOPY_HOME=/usr/local/gdrdrv
	LIBFABRIC_HOME=/usr/local/libfabric MPI_HOME=/usr/local/ompi
	NCCL_HOME=/usr/local/nccl NVSHMEM_PREFIX=/home/yyyy/nvshmem PMIX_HOME=/usr
	SHMEM_HOME=/usr/local/ompi UCX_HOME=/home/yyyy/UCX

xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO PE 0 (process) affinity to 128 CPUs:
     0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
    27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
    54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
    81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
    106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
    126 127 
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO cudaDriverVersion 12030
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO NVSHMEM symmetric heap kind = DEVICE selected
/home/yyyy/nvshmem_src_2.11.0-5/src/host/init/init.cu:948: non-zero status: 5 nvshmem get cucontext failed 
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO nvshmemi_common_init failed, continuing
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO NVSHMEM symmetric heap kind = DEVICE selected
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO [0] nvshmemi_get_cucontext->cuCtxSynchronize->CUDA_SUCCESS) my_stream (nil)
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO in get_cucontext, queried and saved context for device: 0 context: 0x46d7420
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO [0] nvshmemi_get_cucontext->cuCtxGetDevice->0(CUDA_ERROR_INVALID_CONTEXT 201) cuStreamCreateWithPriority my_stream 0x5332500
xxxx-0-11:3684783:3684783 [0] NVSHMEM INFO NVSHMEM symmetric heap kind = DEVICE selected
xxxx-0-11:3684783:3684783 [0] NVSHMEM INFO [1] nvshmemi_get_cucontext->cuCtxSynchronize->CUDA_SUCCESS) my_stream (nil)
xxxx-0-11:3684783:3684783 [0] NVSHMEM INFO in get_cucontext, queried and saved context for device: 0 context: 0x3a94b80
xxxx-0-11:3684783:3684783 [0] NVSHMEM INFO [1] nvshmemi_get_cucontext->cuCtxGetDevice->0(CUDA_ERROR_INVALID_CONTEXT 201) cuStreamCreateWithPriority my_stream 0x45d5360
xxxx-0-11:3684783:3684783 [0] NVSHMEM INFO host name: xxxx-0-11 hash 13854007266692032502
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO host name: xxxx-0-9 hash 14953616763125465901
xxxx-0-11:3684783:3684783 [0] NVSHMEM INFO nvshmemi_setup_local_heap, heapextra = 285225000
xxxx-0-11:3684783:3684783 [0] NVSHMEM INFO host name: xxxx-0-11 hash 13854007266692032502
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO nvshmemi_setup_local_heap, heapextra = 285225000
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO host name: xxxx-0-9 hash 14953616763125465901
xxxx-0-11:3684783:3684783 [0] NVSHMEM INFO NVML library found. libnvidia-ml.so
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO NVML library found. libnvidia-ml.so
/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/common/transport_gdr_common.cpp 73 GDR driver version: (2, 5)
/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp 1635 Begin - Enumerating IB devices in the system ([<dev_id, device_name, num_ports>]) - 
/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp 1656 Enumerated IB devices in the system - device id=0 (of 1), name=mlx5_0, num_ports=1
/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp 1737 End - Enumerating IB devices in the system
/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp 1742 Begin - Ordered list of devices for assignment (after processing user provdied env vars (if any))  - 
/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp 1746 Ordered list of devices for assignment - idx=0 (of 1), device id=0, port_num=1
/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp 1750 End - Ordered list of devices for assignment (after processing user provdied env vars (if any))
/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp 212 /home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp:1790 Ib Alloc Size 2097152 pointer 0x7f3ee0512000
/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp:nvshmemt_init:1851: neither nv_peer_mem, or nvidia_peermem detected. Skipping transport.

WARN: init failed for remote transport: ibrc
xxxx-0-11:3684783:3684783 [0] NVSHMEM INFO IBGDA Disabled by the environment.
xxxx-0-11:3684783:3684783 [0] NVSHMEM INFO [0x4efa4a0] ndev 4 pcie_devid 0 cudevice 0 peer host hash cf85eb37e96b832d p2p host hash c043523516dbe7f6
xxxx-0-11:3684783:3684783 [0] NVSHMEM INFO [1] reach 0 to peer 0 over transport 0
/home/yyyy/nvshmem_src_2.11.0-5/src/host/topo/topo.cpp:476: [GPU 1] Peer GPU 0 is not accessible, exiting ... 
/home/yyyy/nvshmem_src_2.11.0-5/src/host/init/init.cu:972: non-zero status: 3 building transport map failed 

/home/yyyy/nvshmem_src_2.11.0-5/src/host/init/init.cu:nvshmemi_check_state_and_init:1062: nvshmem initialization failed, exiting 

/home/yyyy/nvshmem_src_2.11.0-5/src/util/cs.cpp:23: non-zero status: 16: No such file or directory, exiting... mutex destroy failed 

/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/common/transport_gdr_common.cpp 73 GDR driver version: (2, 5)
/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp 1635 Begin - Enumerating IB devices in the system ([<dev_id, device_name, num_ports>]) - 
/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp 1656 Enumerated IB devices in the system - device id=0 (of 1), name=mlx5_0, num_ports=1
/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp 1737 End - Enumerating IB devices in the system
/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp 1742 Begin - Ordered list of devices for assignment (after processing user provdied env vars (if any))  - 
/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp 1746 Ordered list of devices for assignment - idx=0 (of 1), device id=0, port_num=1
/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp 1750 End - Ordered list of devices for assignment (after processing user provdied env vars (if any))
/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp 212 /home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp:1790 Ib Alloc Size 2097152 pointer 0x7f28903d8000
/home/yyyy/nvshmem_src_2.11.0-5/src/modules/transport/ibrc/ibrc.cpp:nvshmemt_init:1851: neither nv_peer_mem, or nvidia_peermem detected. Skipping transport.

WARN: init failed for remote transport: ibrc
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO IBGDA Disabled by the environment.
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO [0x5c57760] ndev 4 pcie_devid 0 cudevice 0 peer host hash cf85eb37e96b832d p2p host hash cf85eb37e96b832d
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO [0] reach 15 to peer 0 over transport 0
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO [0x5c57760] ndev 4 pcie_devid 0 cudevice 0 peer host hash c043523516dbe7f6 p2p host hash cf85eb37e96b832d
xxxx-0-9:2690888:2690888 [0] NVSHMEM INFO [0] reach 0 to peer 1 over transport 0
/home/yyyy/nvshmem_src_2.11.0-5/src/host/topo/topo.cpp:476: [GPU 0] Peer GPU 1 is not accessible, exiting ... 
/home/yyyy/nvshmem_src_2.11.0-5/src/host/init/init.cu:972: non-zero status: 3 building transport map failed 

/home/yyyy/nvshmem_src_2.11.0-5/src/host/init/init.cu:nvshmemi_check_state_and_init:1062: nvshmem initialization failed, exiting 

/home/yyyy/nvshmem_src_2.11.0-5/src/util/cs.cpp:23: non-zero status: 16: No such file or directory, exiting... mutex destroy failed 


@lambda7xx This is an indication of the likely problem at hand, the failed initialization of the IBRC transport: you are missing the nvidia-peermem module. You can follow the instructions to install this module and retry: GPUDirect RDMA
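For example, with a recent NVIDIA driver the module ships with the driver package and can be loaded and checked roughly like this:

sudo modprobe nvidia_peermem                   # GPUDirect RDMA kernel module
lsmod | grep -E 'nvidia_peermem|nv_peer_mem'   # either the new or legacy module should appear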

In a multi-node setup where P2P is not supported between GPUs, you will need to set NVSHMEM_DISABLE_P2P=1 to bypass this error: Environment Variables — NVSHMEM 2.11.0 documentation

Please share your results after installing the peermem module and setting this environment variable in your run command.

Thanks. I have installed nv_peer_mem successfully. Let me try setting NVSHMEM_DISABLE_P2P=1.

Hello, does setting NVSHMEM_DISABLE_P2P=1 mean I should set it when I install NVSHMEM, or when I run the code? Thanks.

To answer your question, it's a runtime environment variable, so you don't need it at build time.
If you have resolved the IB initialization problem on the multi-node setup, this variable should not be needed. Sorry for the confusion on this variable; that P2P error message was a red herring in a multi-node configuration.
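In other words, if you do use it, set it at launch time rather than at build time, e.g.:

NVSHMEM_DISABLE_P2P=1 nvshmrun -n 2 -ppn 1 --host ip1:1,ip2:1 ./worker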

Let me know if you are still running into any issues.