Hi all,
I’m building a stateless cluster using ConnectX-3 adapters and Warewulf for management, but I’m having a hard time bringing the adapter up. I have built a chroot image with the InfiniBand packages, but it looks like the adapter is not being initialised (this happens on all compute nodes, so it is probably not a hardware issue).
root@geomechanics fcanesin]# ssh n00 [root@n00 ~]# clear [root@n00 ~]# hca_self_test.ofed ---- Performing Adapter Device Self Test ---- Number of CAs Detected ................. 1 PCI Device Check ....................... PASS Kernel Arch ............................ x86_64 Host Driver Version .................... MLNX_OFED_LINUX-2.0-2.0.5 (OFED-2.0-2.0.5): 2.6.32-358.el6.x86_64 Host Driver RPM Check .................. PASS Firmware on CA #0 VPI .................. v2.10.4700 Firmware Check on CA #0 (VPI) .......... NA REASON: NO required fw version Host Driver Initialization ............. PASS Number of CA Ports Active .............. 0 Kernel Syslog Check .................... PASS Node GUID on CA #0 (VPI) ............... NA ------------------ DONE --------------------- [root@n00 ~]# ibstat [root@n00 ~]# ifup ib0 Device ib0 does not seem to be present, delaying initialization. [root@n00 ~]# cat /var/log/dmesg | grep ml Command line: ro initrd=bootstrap/51/initfs.gz wwhostname=n00.cluster wwkmods=ipv6,ib_addr,ib_core,ib_mad,ib_sa,ib_,ib_umad,iw_cm,rdma_cm,rdma_ucm,mlx4_core,mlx4_ib,ib_mthca,ib_ipoib wwmaster=10.0.0.254 wwipaddr=10.0.0.100 wwnetmanetdev=eth0 BOOT_IMAGE=bootstrap/51/kernel Kernel command line: ro initrd=bootstrap/51/initfs.gz wwhostname=n00.cluster wwkmods=ipv6,ib_addr,ib_core,ib_mad,ib,ib_ucm,ib_umad,iw_cm,rdma_cm,rdma_ucm,mlx4_core,mlx4_ib,ib_mthca,ib_ipoib wwmaster=10.0.0.254 wwipaddr=10.0.0.100 55.0 wwnetdev=eth0 BOOT_IMAGE=bootstrap/51/kernel Compat-mlnx-ofed backport release: gcecc987 mlx4_core: Mellanox ConnectX core driver v1.1 (Jun 12 2013) mlx4_core: Initializing 0000:04:00.0 mlx4_core 0000:04:00.0: PCI INT A -> GSI 32 (level, low) -> IRQ 32 mlx4_core 0000:04:00.0: setting latency timer to 64 mlx4_core 0000:04:00.0: command INIT_HCA (0x7) failed: fw status = 0x3 mlx4_core 0000:04:00.0: INIT_HCA returns -22 mlx4_core 0000:04:00.0: INIT_HCA command failed, aborting. 
mlx4_core 0000:04:00.0: PCI INT A disabled mlx4_core: probe of 0000:04:00.0 failed with error -22 [root@n00 ~]# lsmod | grep ib mlx4_ib 154125 0 mlx4_core 233054 2 mlx4_en,mlx4_ib libsas 74168 1 isci scsi_transport_sas 35620 2 isci,libsas ib_umad 12538 0 ib_ucm 12120 0 ib_uverbs 40038 2 rdma_ucm,ib_ucm ib_ipoib 109448 0 ib_cm 41480 3 rdma_cm,ib_ucm,ib_ipoib ib_sa 24010 5 rdma_ucm,rdma_cm,mlx4_ib,ib_ipoib,ib_cm ib_mad 43081 4 mlx4_ib,ib_umad,ib_cm,ib_sa ib_core 80859 12 rdma_ucm,rdma_cm,mlx4_ib,iw_cm,ib_umad,ib_ucm,ib_uverbs,ib_ipoib,ib_cm,ib_ ib_addr 5900 1 rdma_cm compat 18042 17 mlx4_en,rdma_ucm,rdma_cm,mlx4_ib,mlx4_core,iw_cm,ib_umad,ib_ucm,sa,ib_mad,ib_core,ib_addr ipv6 321422 88 ib_ipoib,ib_addr [root@n00 ~]# lspci | grep Mell 04:00.0 Network controller: Mellanox Technologies MT27500 Family [ConnectX-3] [root@n00 ~]#
As you can see, this is with MLNX_OFED. I have also tested using yum groupinstall “Infiniband Support”, with exactly the same problem. I’m out of ideas. Help!?