ceph/libceph: fix hung tasks and connection recovery during network disruptions

[PATCH v1 12/13] libceph: force monitor reconnect on persistent EADDRNOTAVAIL
Posted by Ionut Nechita (Wind River) 3 weeks, 5 days ago
From: Ionut Nechita <ionut.nechita@windriver.com>

When the kernel CephFS client experiences persistent EADDRNOTAVAIL
errors (e.g., because the original source address was a transient
CNI pod address that no longer exists), the monitor client may get
stuck retrying the same monitor indefinitely while in hunting mode.
The mon_fault() handler currently ignores faults when already
hunting, assuming delayed_work() will handle the retry. However,
delayed_work() simply calls reopen_session() which may pick the
same monitor again, creating an infinite loop of failed connection
attempts to the same target.

Additionally, when EADDRNOTAVAIL is persistent across all monitors,
the hunt_mult backoff grows exponentially, causing increasingly
long delays between reconnection attempts. Once the network issue
resolves (e.g., route cache expires, new address becomes available),
the client may take minutes to recover due to the accumulated
backoff.

Fix this by modifying mon_fault() to force a reopen_session() even
when already hunting, if the messenger's addr_notavail_count is
non-zero (mon_fault() does not wait for a threshold to be reached).
This lets the client re-pick a monitor immediately on each fault
rather than waiting for the delayed_work timer. Also reset hunt_mult
to 1 when forcing a reconnect due to EADDRNOTAVAIL, so that once the
network issue resolves, the client recovers quickly without
accumulated backoff delays.

Also add a safety check in delayed_work(): if addr_notavail_count
has reached ADDRNOTAVAIL_RESET_THRESHOLD and we're hunting, reset
hunt_mult to 1 to prevent accumulated backoff from delaying recovery.

Signed-off-by: Ionut Nechita <ionut.nechita@windriver.com>
---
 net/ceph/mon_client.c | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index ab66b599ac479..6e3d314fbf2b2 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1084,6 +1084,7 @@ static void delayed_work(struct work_struct *work)
 {
 	struct ceph_mon_client *monc =
 		container_of(work, struct ceph_mon_client, delayed_work.work);
+	int notavail_count;
 
 	mutex_lock(&monc->mutex);
 	dout("%s mon%d\n", __func__, monc->cur_mon);
@@ -1094,6 +1095,22 @@ static void delayed_work(struct work_struct *work)
 	if (monc->hunting) {
 		dout("%s continuing hunt\n", __func__);
 		reopen_session(monc);
+
+		/*
+		 * If we're hunting and EADDRNOTAVAIL has been persistent,
+		 * reset the backoff multiplier so we recover quickly once
+		 * the network issue resolves. Without this, hunt_mult can
+		 * grow large during extended EADDRNOTAVAIL periods, causing
+		 * the client to take minutes to reconnect even after the
+		 * underlying issue is fixed.
+		 */
+		notavail_count =
+			atomic_read(&monc->client->msgr.addr_notavail_count);
+		if (notavail_count >= ADDRNOTAVAIL_RESET_THRESHOLD) {
+			dout("%s addr_notavail_count %d, resetting hunt_mult\n",
+			     __func__, notavail_count);
+			monc->hunt_mult = 1;
+		}
 	} else {
 		int is_auth = ceph_auth_is_authenticated(monc->auth);
 
@@ -1554,6 +1571,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
 static void mon_fault(struct ceph_connection *con)
 {
 	struct ceph_mon_client *monc = con->private;
+	int notavail_count;
 
 	mutex_lock(&monc->mutex);
 	dout("%s mon%d\n", __func__, monc->cur_mon);
@@ -1563,7 +1581,26 @@ static void mon_fault(struct ceph_connection *con)
 			reopen_session(monc);
 			__schedule_delayed(monc);
 		} else {
-			dout("%s already hunting\n", __func__);
+			/*
+			 * Already hunting. Normally we just wait for
+			 * delayed_work() to retry. But if EADDRNOTAVAIL
+			 * is persistent, force an immediate reconnect to
+			 * a different monitor. This avoids getting stuck
+			 * retrying the same monitor that keeps failing.
+			 * Also reset hunt_mult so we don't accumulate
+			 * excessive backoff during the outage.
+			 */
+			notavail_count =
+				atomic_read(&con->msgr->addr_notavail_count);
+			if (notavail_count > 0) {
+				dout("%s addr_notavail %d, forcing reopen\n",
+				     __func__, notavail_count);
+				monc->hunt_mult = 1;
+				reopen_session(monc);
+				__schedule_delayed(monc);
+			} else {
+				dout("%s already hunting\n", __func__);
+			}
 		}
 	}
 	mutex_unlock(&monc->mutex);
-- 
2.53.0