Kernel: Linux bi-headnode01 5.4.0-137-generic
OS: Ubuntu 20.04.5 LTS
Version: MLNX_OFED_LINUX-4.9-6.0.6.0
IB Card: ConnectX-3
I’m having trouble getting InfiniBand working on our headnode after reinstalling it. Traffic doesn’t appear to be crossing the card, even though the headnode can see the switch and the other nodes.
This is the output of an ibping from a Lustre node to the headnode:
[root@bi-ddn2 log]# ibping -G 0x506b4b030073a571
ibwarn: [23083] _do_madrpc: recv failed: Connection timed out
ibwarn: [23083] mad_rpc_rmpp: _do_madrpc failed; dport (Lid 4)
ibwarn: [23083] _do_madrpc: recv failed: Connection timed out
ibwarn: [23083] mad_rpc_rmpp: _do_madrpc failed; dport (Lid 4)
ibwarn: [23083] _do_madrpc: recv failed: Connection timed out
ibwarn: [23083] mad_rpc_rmpp: _do_madrpc failed; dport (Lid 4)
ibwarn: [23083] _do_madrpc: recv failed: Connection timed out
ibwarn: [23083] mad_rpc_rmpp: _do_madrpc failed; dport (Lid 4)
ibwarn: [23083] _do_madrpc: recv failed: Connection timed out
ibwarn: [23083] mad_rpc_rmpp: _do_madrpc failed; dport (Lid 4)
ibwarn: [23083] _do_madrpc: recv failed: Connection timed out
ibwarn: [23083] mad_rpc_rmpp: _do_madrpc failed; dport (Lid 4)
ibwarn: [23083] _do_madrpc: recv failed: Connection timed out
ibwarn: [23083] mad_rpc_rmpp: _do_madrpc failed; dport (Lid 4)
The headnode can talk to the switch with no problem:
root@bi-headnode01:~# ibnodes
Ca : 0xec0d9a0300144560 ports 2 "DataDirect HCA-1"
Ca : 0x506b4b03006d6ef0 ports 1 "bi-node002 HCA-1"
Ca : 0xec0d9a0300143ac0 ports 2 "DataDirect HCA-1"
Ca : 0x506b4b030073a570 ports 1 "bi-headnode01 HCA-1"
Switch : 0x1070fd0300b9d5a2 ports 41 "MF0;bi-clusterswitch1:MQM8700/U1" enhanced port 0 lid 1 lmc 0
The kernel modules appear to be loaded:
root@bi-headnode01:~# lsmod |grep -i ib
ko2iblnd 237568 1
lnet 573440 7 osc,ko2iblnd,obdclass,ptlrpc,mgc,lmv,lustre
libcfs 475136 12 fld,lnet,osc,fid,ko2iblnd,obdclass,ptlrpc,mgc,lov,mdc,lmv,lustre
ib_ucm 20480 0
ib_umad 24576 0
mlx5_ib 401408 0
ib_uverbs 135168 3 rdma_ucm,mlx5_ib,ib_ucm
ib_ipoib 180224 0
ib_iser 53248 0
rdma_cm 61440 3 ko2iblnd,ib_iser,rdma_ucm
ib_cm 57344 3 rdma_cm,ib_ipoib,ib_ucm
libiscsi 61440 1 ib_iser
mlx5_core 1216512 2 mlx5_fpga_tools,mlx5_ib
scsi_transport_iscsi 110592 2 ib_iser,libiscsi
mlx4_ib 229376 0
ib_core 335872 12 rdma_cm,ib_ipoib,ko2iblnd,mlx4_ib,iw_cm,ib_iser,ib_umad,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm,ib_ucm
mlx4_core 352256 2 mlx4_ib,mlx4_en
mlx_compat 65536 17 rdma_cm,ib_ipoib,mlx4_core,ko2iblnd,mlx4_ib,iw_cm,mlx5_fpga_tools,ib_iser,ib_umad,mlx4_en,ib_core,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm,mlx5_core,ib_ucm