Hi wpceswpces,
Two more things that might contribute to the problem:
- xpcs_check_pcs_lock_status() uses RETRY_ONCE (1 retry = 2ms total) to wait for PCS block lock after 25G lane bringup. On a direct-attach fiber link this is far too short. The function returns -1 immediately, set_speed_work_func() reschedules itself after 1 second, and with four interfaces all failing simultaneously the workqueue is flooded at 4 retries/second indefinitely. The kworkers accumulate in D-state and stall RCU grace periods.
- The fixed 1000 ms retry interval in set_speed_work_func() provides no backoff, so a persistent link failure (e.g. while waiting for the remote board to come up) keeps hammering the workqueue.
Modification 1. Extend xpcs_check_pcs_lock_status() to allow 100ms for PCS block lock when operating in 25G mode.
Modification 2. Replace fixed 1000ms retry with exponential backoff (1s → 2s → 4s → 8s → 16s → 30s capped). Reset the retry counter on success or whenever a fresh link event cancels the pending work.
Note: another potential cause of error is that xlgpcs_init() has its full XLGPCS mode-selection sequence guarded by “#if 0 //FIXME”. That requires the correct T26X hardware sequence from NVIDIA and is not addressed here.
osd.c.txt (27.1 KB)
ether_linux.h.txt (28.0 KB)
ether_linux.c.txt (221.6 KB)
# Copy the attached files (after removing the .txt suffix) into the driver directory:
cp ether_linux.* source/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/
cp osd.c source/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/
Here’s the patch. It failed to apply cleanly, which is why I attached the full files, but it does show the edits.
diff --git a/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/nvethernetrm/osi/core/xpcs.c b/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/nvethernetrm/osi/core/xpcs.c
--- a/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/nvethernetrm/osi/core/xpcs.c
+++ b/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/nvethernetrm/osi/core/xpcs.c
@@ -546,6 +546,13 @@ static nve32_t xpcs_check_pcs_lock_status(struct osi_core_priv_data *osi_core)
};
+ /* 25G fiber/DAC requires more time for PCS block lock than the
+ * 1ms HW-team figure (measured for 10G). Allow up to 100ms.
+ */
+ if (osi_core->uphy_gbe_mode == OSI_GBE_MODE_25G)
+ retry = 100U;
+
count = 0;
while (cond == COND_NOT_MET) {
diff --git a/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/ether_linux.h b/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/ether_linux.h
--- a/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/ether_linux.h
+++ b/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/ether_linux.h
@@ -733,6 +733,8 @@ struct ether_priv_data {
/** Ref count for set_speed_work_func */
atomic_t set_speed_ref_cnt;
+ /** Retry counter for set_speed_work_func exponential backoff */
+ unsigned int set_speed_retry_cnt;
/** flag to enable logs using ethtool */
u32 msg_enable;
diff --git a/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/ether_linux.c b/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/ether_linux.c
--- a/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/ether_linux.c
+++ b/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/ether_linux.c
@@ -1306,10 +1306,23 @@ void set_speed_work_func(struct work_struct *work)
ret = osi_handle_ioctl(pdata->osi_core, &ioctl_data);
if (ret < 0) {
- netdev_dbg(dev, "Retry set speed\n");
+ unsigned int delay_ms;
+
+ /* Exponential backoff: 1s, 2s, 4s, 8s, 16s, 30s (capped).
+ * Prevents workqueue flooding when lane bringup fails on
+ * all four bonded MGBE interfaces simultaneously.
+ */
+ if (pdata->set_speed_retry_cnt >= 5U)
+ delay_ms = 30000U;
+ else
+ delay_ms = 1000U << pdata->set_speed_retry_cnt;
+
+ pdata->set_speed_retry_cnt++;
+ netdev_dbg(dev, "Retry set speed in %ums (attempt %u)\n",
+ delay_ms, pdata->set_speed_retry_cnt);
schedule_delayed_work(&pdata->set_speed_work,
- msecs_to_jiffies(1000));
+ msecs_to_jiffies(delay_ms));
atomic_set(&pdata->set_speed_ref_cnt, OSI_DISABLE);
return;
}
@@ -1346,6 +1359,7 @@ void set_speed_work_func(struct work_struct *work)
netif_carrier_on(dev);
+ pdata->set_speed_retry_cnt = 0;
atomic_set(&pdata->set_speed_ref_cnt, OSI_DISABLE);
}
@@ -1411,6 +1425,7 @@ static void ether_adjust_link(struct net_device *dev)
cancel_delayed_work_sync(&pdata->set_speed_work);
+ pdata->set_speed_retry_cnt = 0;
if (phydev->link) {
if (phydev->speed != pdata->speed) {
diff --git a/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/osd.c b/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/osd.c
--- a/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/osd.c
+++ b/nvidia-oot/drivers/net/ethernet/nvidia/nvethernet/osd.c
@@ -895,6 +895,7 @@ void ether_restart_lane_bringup_task(struct tasklet_struct *t)
netif_tx_stop_all_queues(pdata->ndev);
netif_tx_unlock(pdata->ndev);
+ pdata->set_speed_retry_cnt = 0;
schedule_delayed_work(&pdata->set_speed_work, msecs_to_jiffies(500));
if (netif_msg_drv(pdata)) {
netdev_info(pdata->ndev,