I have been trying for some time to get my Mellanox ConnectX-6 Dx (CX6-DX) working with the Mellanox drivers, because I want to enable target offloading for NVMe over RDMA, following this guide: Here.
I am having issues installing the drivers, so most recently I tried installing via doca-all. Almost everything installs correctly, but during the installation I see these messages:
Package 'mlnx-nfsrdma-dkms' is not installed, skipping module 'mlnx-nfsrdma'.
Package 'mlnx-nvme-dkms' is not installed, skipping module 'mlnx-nvme'.
Package 'fwctl-dkms' is not installed, skipping module 'fwctl'.
Package 'mlx5_fwctl-dkms' is not installed, skipping module 'mlx5_fwctl'.
I'm not really interested in NFS over RDMA, but I am concerned about mlnx-nvme.
I then try to load the modules:
modprobe nvme << works no issues
modprobe nvmet << works no issues
modprobe nvme-rdma << Fails with the following error
modprobe: ERROR: could not insert ‘nvme_rdma’: Invalid argument
modprobe nvmet-rdma << Fails with the following error
modprobe: ERROR: could not insert ‘nvmet_rdma’: Invalid argument
I'm assuming the problem is that mlnx-nvme-dkms was not installed, so I ran apt install mlnx-nvme-dkms, which fails with a build error. The DKMS make log is below:
/var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/make.log
DKMS make.log for mlnx-nvme-24.07.OFED.24.07.0.5.7.1 for kernel 6.1.0-25-amd64 (x86_64)
Fri Sep 13 03:42:25 PM PDT 2024
/bin/sh: 1: Syntax error: Unterminated quoted string
/bin/sh: 1: [: -lt: unexpected operator
grep: mlnx-nvme.spec: No such file or directory
make -C /lib/modules/6.1.0-25-amd64/build O=/lib/modules/6.1.0-25-amd64/build M=/var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build KBUILD_EXTRA_SYMBOLS=/usr/src/ofa_kernel/x86_64/6.1.0-25-amd64/Module.symvers \
CONFIG_NVME_HOST_WITHOUT_FC= CONFIG_NVME_TARGET=m CONFIG_NVME_TARGET_LOOP=m CONFIG_NVME_TARGET_RDMA=m CONFIG_NVME_TARGET_FC=m CONFIG_NVME_TARGET_FCLOOP=m CONFIG_NVME_TARGET_DUMMY= CONFIG_NVME_CORE=m CONFIG_BLK_DEV_NVME=m CONFIG_NVME_FABRICS=m CONFIG_NVME_FC=m CONFIG_NVME_RDMA=m CONFIG_NVME_MULTIPATH=y CONFIG_NVME_HOST_DUMMY= \
CONFIG_DTRACE= \
CONFIG_CTF= \
LINUXINCLUDE=' -DCONFIG_NVME_MULTIPATH=1 -include /lib/modules/6.1.0-25-amd64/build/include/generated/autoconf.h -include /usr/src/ofa_kernel/x86_64/6.1.0-25-amd64/include/linux/compat-2.6.h -I/var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build -I/usr/src/ofa_kernel/x86_64/6.1.0-25-amd64/include -I/usr/src/ofa_kernel/x86_64/6.1.0-25-amd64/include/uapi $(if $(CONFIG_XEN),-D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)) $(if $(CONFIG_XEN),-I$(srctree)/arch/x86/include/mach-xen) -I$(srctree)/arch/$(SRCARCH)/include -Iarch/$(SRCARCH)/include/generated -Iinclude -I$(srctree)/arch/$(SRCARCH)/include/uapi -Iarch/$(SRCARCH)/include/generated/uapi -I$(srctree)/include -I$(srctree)/include/uapi -Iinclude/generated/uapi $(if $(KBUILD_SRC),-Iinclude2 -I$(srctree)/include) -I$(srctree)/arch/$(SRCARCH)/include -Iarch/$(SRCARCH)/include/generated ' \
modules
make[1]: warning: jobserver unavailable: using -j1. Add '+' to parent make rule.
make[1]: Entering directory '/usr/src/linux-headers-6.1.0-25-amd64'
CC [M] /var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/core.o
CC [M] /var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/ioctl.o
CC [M] /var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/sysfs.o
CC [M] /var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/pr.o
CC [M] /var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/trace.o
CC [M] /var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/multipath.o
CC [M] /var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/zns.o
CC [M] /var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/hwmon.o
LD [M] /var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/nvme-core.o
CC [M] /var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/pci.o
In file included from /usr/src/ofa_kernel/x86_64/6.1.0-25-amd64/include/linux/blkdev.h:6,
from /var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/pci.c:12:
/var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/pci.c: In function ‘nvme_map_metadata’:
/usr/src/linux-headers-6.1.0-25-common/include/linux/blkdev.h:631:37: error: invalid type argument of ‘->’ (have ‘struct bio_vec’)
631 | dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \
| ^~
/var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/pci.c:1205:25: note: in expansion of macro ‘dma_map_bvec’
1205 | iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
| ^~~~~~~~~~~~
/usr/src/linux-headers-6.1.0-25-common/include/linux/blkdev.h:631:52: error: invalid type argument of ‘->’ (have ‘struct bio_vec’)
631 | dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \
| ^~
/var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/pci.c:1205:25: note: in expansion of macro ‘dma_map_bvec’
1205 | iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
| ^~~~~~~~~~~~
/usr/src/linux-headers-6.1.0-25-common/include/linux/blkdev.h:631:69: error: invalid type argument of ‘->’ (have ‘struct bio_vec’)
631 | dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \
| ^~
/var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/pci.c:1205:25: note: in expansion of macro ‘dma_map_bvec’
1205 | iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
| ^~~~~~~~~~~~
In file included from /usr/src/linux-headers-6.1.0-25-common/include/linux/pci.h:2538,
from /usr/src/ofa_kernel/x86_64/6.1.0-25-amd64/include/linux/pci.h:7,
from /var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/pci.c:27:
/var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/pci.c: In function ‘nvme_pci_unmap_rq’:
/var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/pci.c:1374:53: error: invalid type argument of ‘->’ (have ‘struct bio_vec’)
1374 | rq_integrity_vec(req)->bv_len, rq_dma_dir(req));
| ^~
/usr/src/linux-headers-6.1.0-25-common/include/linux/dma-mapping.h:416:63: note: in definition of macro ‘dma_unmap_page’
416 | #define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0)
| ^
make[3]: *** [/usr/src/linux-headers-6.1.0-25-common/scripts/Makefile.build:255: /var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host/pci.o] Error 1
make[2]: *** [/usr/src/linux-headers-6.1.0-25-common/scripts/Makefile.build:508: /var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build/host] Error 2
make[1]: *** [/usr/src/linux-headers-6.1.0-25-common/Makefile:2034: /var/lib/dkms/mlnx-nvme/24.07.OFED.24.07.0.5.7.1/build] Error 2
make[1]: Leaving directory '/usr/src/linux-headers-6.1.0-25-amd64'
make: *** [makefile:143: default] Error 2
I see a similar post here… and when I tried to install via non-DOCA:
Here #1 and #2
I am not sure what I should be doing to fix this issue. I would love to just get these drivers working so that I can do NVMe over RDMA, but I just can't get it to work.
I even restarted openibd, ran modprobe nvmet, then rebuilt the kernel and rebooted, on the off chance that the old module was still loaded and a reboot would help. I still get the same issues. What can I do to fix this?
Also, just for reference, this is the dmesg output after a reboot:
[ 207.556869] nvme_rdma: disagrees about version of symbol ib_mr_pool_destroy
[ 207.556878] nvme_rdma: Unknown symbol ib_mr_pool_destroy (err -22)
[ 207.556902] nvme_rdma: disagrees about version of symbol ib_unregister_client
[ 207.556905] nvme_rdma: Unknown symbol ib_unregister_client (err -22)
[ 207.556966] nvme_rdma: disagrees about version of symbol rdma_reject_msg
[ 207.556969] nvme_rdma: Unknown symbol rdma_reject_msg (err -22)
[ 207.557091] nvme_rdma: disagrees about version of symbol __ib_alloc_pd
[ 207.557095] nvme_rdma: Unknown symbol __ib_alloc_pd (err -22)
[ 207.557186] nvme_rdma: disagrees about version of symbol rdma_resolve_addr
[ 207.557189] nvme_rdma: Unknown symbol rdma_resolve_addr (err -22)
[ 207.557223] nvme_rdma: disagrees about version of symbol rdma_set_service_type
[ 207.557226] nvme_rdma: Unknown symbol rdma_set_service_type (err -22)
[ 207.557246] nvme_rdma: disagrees about version of symbol ib_map_mr_sg_pi
[ 207.557249] nvme_rdma: Unknown symbol ib_map_mr_sg_pi (err -22)
[ 207.557305] nvme_rdma: disagrees about version of symbol ib_mr_pool_init
[ 207.557308] nvme_rdma: Unknown symbol ib_mr_pool_init (err -22)
[ 207.557333] nvme_rdma: disagrees about version of symbol ib_process_cq_direct
[ 207.557336] nvme_rdma: Unknown symbol ib_process_cq_direct (err -22)
[ 207.557415] nvme_rdma: disagrees about version of symbol ib_event_msg
[ 207.557418] nvme_rdma: Unknown symbol ib_event_msg (err -22)
[ 207.557454] nvme_rdma: disagrees about version of symbol rdma_disconnect
[ 207.557457] nvme_rdma: Unknown symbol rdma_disconnect (err -22)
[ 207.557575] nvme_rdma: disagrees about version of symbol __rdma_create_kernel_id
[ 207.557579] nvme_rdma: Unknown symbol __rdma_create_kernel_id (err -22)
[ 207.557658] nvme_rdma: disagrees about version of symbol rdma_resolve_route
[ 207.557661] nvme_rdma: Unknown symbol rdma_resolve_route (err -22)
[ 207.557685] nvme_rdma: disagrees about version of symbol ib_register_client
[ 207.557688] nvme_rdma: Unknown symbol ib_register_client (err -22)
[ 207.557778] nvme_rdma: disagrees about version of symbol rdma_create_qp
[ 207.557781] nvme_rdma: Unknown symbol rdma_create_qp (err -22)
[ 207.557828] nvme_rdma: disagrees about version of symbol ib_map_mr_sg
[ 207.557831] nvme_rdma: Unknown symbol ib_map_mr_sg (err -22)
[ 207.557862] nvme_rdma: disagrees about version of symbol ib_cq_pool_put
[ 207.557865] nvme_rdma: Unknown symbol ib_cq_pool_put (err -22)
[ 207.557895] nvme_rdma: disagrees about version of symbol __ib_alloc_cq
[ 207.557898] nvme_rdma: Unknown symbol __ib_alloc_cq (err -22)
[ 207.557930] nvme_rdma: disagrees about version of symbol rdma_destroy_qp
[ 207.557933] nvme_rdma: Unknown symbol rdma_destroy_qp (err -22)
[ 207.557954] nvme_rdma: disagrees about version of symbol ib_check_mr_status
[ 207.557957] nvme_rdma: Unknown symbol ib_check_mr_status (err -22)
[ 207.558059] nvme_rdma: disagrees about version of symbol ib_destroy_qp_user
[ 207.558062] nvme_rdma: Unknown symbol ib_destroy_qp_user (err -22)
[ 207.558126] nvme_rdma: disagrees about version of symbol ib_cq_pool_get
[ 207.558129] nvme_rdma: Unknown symbol ib_cq_pool_get (err -22)
[ 207.558138] nvme_rdma: disagrees about version of symbol rdma_connect_locked
[ 207.558141] nvme_rdma: Unknown symbol rdma_connect_locked (err -22)
[ 207.558171] nvme_rdma: disagrees about version of symbol ib_wc_status_msg
[ 207.558174] nvme_rdma: Unknown symbol ib_wc_status_msg (err -22)
[ 207.558219] nvme_rdma: disagrees about version of symbol ib_dma_virt_map_sg
[ 207.558222] nvme_rdma: Unknown symbol ib_dma_virt_map_sg (err -22)
[ 207.558280] nvme_rdma: disagrees about version of symbol ib_free_cq
[ 207.558283] nvme_rdma: Unknown symbol ib_free_cq (err -22)
[ 207.558301] nvme_rdma: disagrees about version of symbol rdma_destroy_id
[ 207.558303] nvme_rdma: Unknown symbol rdma_destroy_id (err -22)
[ 207.558484] nvme_rdma: disagrees about version of symbol ib_mr_pool_get
[ 207.558486] nvme_rdma: Unknown symbol ib_mr_pool_get (err -22)
[ 207.558537] nvme_rdma: disagrees about version of symbol ib_mr_pool_put
[ 207.558540] nvme_rdma: Unknown symbol ib_mr_pool_put (err -22)
[ 207.558635] nvme_rdma: disagrees about version of symbol ib_drain_qp
[ 207.558638] nvme_rdma: Unknown symbol ib_drain_qp (err -22)
[ 207.558660] nvme_rdma: disagrees about version of symbol ib_dealloc_pd_user
[ 207.558663] nvme_rdma: Unknown symbol ib_dealloc_pd_user (err -22)
[ 207.558682] nvme_rdma: disagrees about version of symbol rdma_consumer_reject_data
[ 207.558685] nvme_rdma: Unknown symbol rdma_consumer_reject_data (err -22)
And this is the output of lsmod, filtered for the nvme modules and for the Mellanox (mlx5) modules:
root@gigabyte:/home/mihai# lsmod | grep nvme
nvme_fabrics 32768 0
nvme 57344 3
nvme_core 163840 6 nvme,nvme_fabrics
t10_pi 16384 2 sd_mod,nvme_core
root@gigabyte:/home/mihai# lsmod | grep mlx5
mlx5_ib 479232 0
ib_uverbs 184320 2 rdma_ucm,mlx5_ib
ib_core 454656 8 rdma_cm,ib_ipoib,iw_cm,ib_umad,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm
mlx5_core 2420736 1 mlx5_ib
mlxfw 36864 1 mlx5_core
psample 20480 1 mlx5_core
mlxdevm 180224 1 mlx5_core
mlx_compat 20480 11 rdma_cm,ib_ipoib,mlxdevm,iw_cm,ib_umad,ib_core,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm,mlx5_core
tls 135168 1 mlx5_core
pci_hyperv_intf 16384 1 mlx5_core