ceph/libceph: fix hung tasks and connection recovery during network disruptions

[PATCH v1 01/13] libceph: handle EADDRNOTAVAIL more gracefully

Posted by Ionut Nechita (Wind River) 3 weeks, 5 days ago

From: Ionut Nechita <ionut.nechita@windriver.com>

When connecting to Ceph monitors/OSDs, kernel_connect() may return
-EADDRNOTAVAIL if the source address is unavailable. This occurs
during:
- IPv6 Duplicate Address Detection (DAD)
- IPv4/IPv6 interface state changes (link up/down events)
- Address removal or reconfiguration on the interface
- Network namespace transitions in containerized environments
- CNI reconfigurations during containerized rolling upgrades

Currently, libceph treats EADDRNOTAVAIL like any other connection error
and enters exponential backoff (BASE_DELAY_INTERVAL 250ms doubling up
to MAX_DELAY_INTERVAL 15s). Additionally, the monitor client has its
own hunt-level backoff (CEPH_MONC_HUNT_INTERVAL 3s * hunt_mult, where
hunt_mult doubles up to 10x = 30s max). These two backoff mechanisms
compound: at steady state each monitor gets ~30 seconds of attempts
with connection-level delays up to 15s, and the round-trip through
all monitors takes ~60 seconds.

In production testing (6.12.0-1-rt-amd64, Dell PowerEdge
R720, IPv6-only Ceph cluster with 2 monitors), the EADDRNOTAVAIL
condition persisted for ~36 minutes during a rolling upgrade:
  13:20:52 - mon0 session lost, hunting begins, first error -99
  13:57:03 - mon0 session finally re-established
  ~470 failed connect attempts across both monitors
  sync task blocked for 983+ seconds, triggering hung task warnings:
    "INFO: task sync:514917 blocked for more than 122 seconds"
    ...repeated at 245s, 368s, 491s, 614s, 737s, 860s, 983s

The duration of EADDRNOTAVAIL varies by environment: it can be brief
(simple DAD, 1-2s) or prolonged (complex network reconfiguration
during rolling upgrades, minutes). In both cases, the key issue is
that exponential backoff up to 15s wastes time once the address
becomes available -- the client may sit idle for up to 15 seconds
before attempting to reconnect.

This patch bypasses the exponential backoff for EADDRNOTAVAIL by
using a fixed short retry interval (ADDRNOTAVAIL_DELAY, HZ/10 =
100ms). This ensures reconnection happens within 100ms of the address
becoming available, rather than waiting up to 15 seconds.

Implementation:
- Detect EADDRNOTAVAIL in ceph_tcp_connect() for both IPv4 and IPv6
- Signal the condition to con_fault() via an addr_notavail flag
  (per-protocol: v1 and v2)
- In con_fault(), use ADDRNOTAVAIL_DELAY instead of exponential
  backoff when the flag is set
- Clear the flag on successful connection and when reopening
- Use pr_warn_ratelimited() instead of pr_err() for this case

The fast retry is appropriate because each attempt is inexpensive
(kernel_connect() fails immediately when the address is unavailable)
and quick recovery is critical for storage availability.

Fixes: 60bf8bf8815e ("libceph: fix msgr backoff")
Signed-off-by: Ionut Nechita <ionut.nechita@windriver.com>
---
 include/linux/ceph/messenger.h | 11 +++++++
 net/ceph/messenger.c           | 55 ++++++++++++++++++++++++++++++++--
 2 files changed, 63 insertions(+), 3 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 1717cc57cdacd..730a754353aed 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -320,6 +320,13 @@ struct ceph_msg {
 /* ceph connection fault delay defaults, for exponential backoff */
 #define BASE_DELAY_INTERVAL	(HZ / 4)
 #define MAX_DELAY_INTERVAL	(15 * HZ)
+/*
+ * Shorter retry delay for EADDRNOTAVAIL. This error typically indicates
+ * a transient condition (IPv6 DAD in progress, address reconfiguration,
+ * temporary route issue) that resolves in 1-2 seconds. Fast retries
+ * allow quick recovery without exponential backoff delays.
+ */
+#define ADDRNOTAVAIL_DELAY	(HZ / 10)
 
 struct ceph_connection_v1_info {
 	struct kvec out_kvec[8],         /* sending header/footer data */
@@ -360,6 +367,8 @@ struct ceph_connection_v1_info {
 	u32 connect_seq;      /* identify the most recent connection
 				 attempt for this session */
 	u32 peer_global_seq;  /* peer's global seq for this connection */
+
+	bool addr_notavail;  /* address not available (transient) */
 };
 
 #define CEPH_CRC_LEN			4
@@ -430,6 +439,8 @@ struct ceph_connection_v2_info {
 
 	int con_mode;  /* CEPH_CON_MODE_* */
 
+	bool addr_notavail;  /* address not available (transient) */
+
 	void *conn_bufs[16];
 	int conn_buf_cnt;
 	int data_len_remain;
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 9f6d860411cbd..c40c7c332e7f4 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -466,8 +466,22 @@ int ceph_tcp_connect(struct ceph_connection *con)
 		     ceph_pr_addr(&con->peer_addr),
 		     sock->sk->sk_state);
 	} else if (ret < 0) {
-		pr_err("connect %s error %d\n",
-		       ceph_pr_addr(&con->peer_addr), ret);
+		if (ret == -EADDRNOTAVAIL) {
+			/*
+			 * Address not yet available - could be IPv6 DAD in
+			 * progress, address reconfiguration, or temporary
+			 * route issue. Use shorter delay.
+			 */
+			pr_warn_ratelimited("connect %s: address not available (DAD/route issue?), will retry\n",
+					    ceph_pr_addr(&con->peer_addr));
+			if (ceph_msgr2(from_msgr(con->msgr)))
+				con->v2.addr_notavail = true;
+			else
+				con->v1.addr_notavail = true;
+		} else {
+			pr_err("connect %s error %d\n",
+			       ceph_pr_addr(&con->peer_addr), ret);
+		}
 		sock_release(sock);
 		return ret;
 	}
@@ -476,6 +490,13 @@ int ceph_tcp_connect(struct ceph_connection *con)
 		tcp_sock_set_nodelay(sock->sk);
 
 	con->sock = sock;
+
+	/* Clear addr_notavail flag on successful connection */
+	if (ceph_msgr2(from_msgr(con->msgr)))
+		con->v2.addr_notavail = false;
+	else
+		con->v1.addr_notavail = false;
+
 	return 0;
 }
 
@@ -609,6 +630,13 @@ void ceph_con_open(struct ceph_connection *con,
 
 	memcpy(&con->peer_addr, addr, sizeof(*addr));
 	con->delay = 0;      /* reset backoff memory */
+
+	/* Clear addr_notavail flag when opening/reopening connection */
+	if (ceph_msgr2(from_msgr(con->msgr)))
+		con->v2.addr_notavail = false;
+	else
+		con->v1.addr_notavail = false;
+
 	mutex_unlock(&con->mutex);
 	queue_con(con);
 }
@@ -1613,6 +1641,8 @@ static void ceph_con_workfn(struct work_struct *work)
  */
 static void con_fault(struct ceph_connection *con)
 {
+	bool addr_issue = false;
+
 	dout("fault %p state %d to peer %s\n",
 	     con, con->state, ceph_pr_addr(&con->peer_addr));
 
@@ -1620,6 +1650,19 @@ static void con_fault(struct ceph_connection *con)
 		ceph_pr_addr(&con->peer_addr), con->error_msg);
 	con->error_msg = NULL;
 
+	/* Check and reset addr_notavail flag if set */
+	if (ceph_msgr2(from_msgr(con->msgr))) {
+		if (con->v2.addr_notavail) {
+			addr_issue = true;
+			con->v2.addr_notavail = false;
+		}
+	} else {
+		if (con->v1.addr_notavail) {
+			addr_issue = true;
+			con->v1.addr_notavail = false;
+		}
+	}
+
 	WARN_ON(con->state == CEPH_CON_S_STANDBY ||
 		con->state == CEPH_CON_S_CLOSED);
 
@@ -1644,7 +1687,13 @@ static void con_fault(struct ceph_connection *con)
 	} else {
 		/* retry after a delay. */
 		con->state = CEPH_CON_S_PREOPEN;
-		if (!con->delay) {
+		if (addr_issue) {
+			/*
+			 * Address not available - use shorter delay as this
+			 * is often a transient condition.
+			 */
+			con->delay = ADDRNOTAVAIL_DELAY;
+		} else if (!con->delay) {
 			con->delay = BASE_DELAY_INTERVAL;
 		} else if (con->delay < MAX_DELAY_INTERVAL) {
 			con->delay *= 2;
-- 
2.53.0

Re: [PATCH v1 01/13] libceph: handle EADDRNOTAVAIL more gracefully

Posted by Viacheslav Dubeyko 3 weeks, 4 days ago

On Thu, 2026-03-12 at 10:16 +0200, Ionut Nechita (Wind River) wrote:
> From: Ionut Nechita <ionut.nechita@windriver.com>
> 
> When connecting to Ceph monitors/OSDs, kernel_connect() may return
> -EADDRNOTAVAIL if the source address is unavailable. This occurs
> during:
> - IPv6 Duplicate Address Detection (DAD)
> - IPv4/IPv6 interface state changes (link up/down events)
> - Address removal or reconfiguration on the interface
> - Network namespace transitions in containerized environments
> - CNI reconfigurations during containerized rolling upgrades
> Currently, libceph treats EADDRNOTAVAIL like any other connection error
> and enters exponential backoff (BASE_DELAY_INTERVAL 250ms doubling up
> to MAX_DELAY_INTERVAL 15s). Additionally, the monitor client has its
> own hunt-level backoff (CEPH_MONC_HUNT_INTERVAL 3s * hunt_mult, where
> hunt_mult doubles up to 10x = 30s max). These two backoff mechanisms
> compound: at steady state each monitor gets ~30 seconds of attempts
> with connection-level delays up to 15s, and the round-trip through
> all monitors takes ~60 seconds.
> In production testing (6.12.0-1-rt-amd64, Dell PowerEdge
> R720, IPv6-only Ceph cluster with 2 monitors), the EADDRNOTAVAIL
> condition persisted for ~36 minutes during a rolling upgrade:
>   13:20:52 - mon0 session lost, hunting begins, first error -99
>   13:57:03 - mon0 session finally re-established
>   ~470 failed connect attempts across both monitors
>   sync task blocked for 983+ seconds, triggering hung task warnings:
>     "INFO: task sync:514917 blocked for more than 122 seconds"
>     ...repeated at 245s, 368s, 491s, 614s, 737s, 860s, 983s
> The duration of EADDRNOTAVAIL varies by environment: it can be brief
> (simple DAD, 1-2s) or prolonged (complex network reconfiguration
> during rolling upgrades, minutes). In both cases, the key issue is
> that exponential backoff up to 15s wastes time once the address
> becomes available -- the client may sit idle for up to 15 seconds
> before attempting to reconnect.
> This patch bypasses the exponential backoff for EADDRNOTAVAIL by
> using a fixed short retry interval (ADDRNOTAVAIL_DELAY, HZ/10 =
> 100ms). This ensures reconnection happens within 100ms of the address

As far as I know, HZ depends on the configured tick frequency. So, HZ/10 is not
necessarily 100ms. Am I right here?

> becoming available, rather than waiting up to 15 seconds.
> Implementation:
> - Detect EADDRNOTAVAIL in ceph_tcp_connect() for both IPv4 and IPv6
> - Signal the condition to con_fault() via an addr_notavail flag
>   (per-protocol: v1 and v2)
> - In con_fault(), use ADDRNOTAVAIL_DELAY instead of exponential
>   backoff when the flag is set
> - Clear the flag on successful connection and when reopening
> - Use pr_warn_ratelimited() instead of pr_err() for this case
> The fast retry is appropriate because each attempt is inexpensive
> (kernel_connect() fails immediately when the address is unavailable)
> and quick recovery is critical for storage availability.
> Fixes: 60bf8bf8815e ("libceph: fix msgr backoff")
> Signed-off-by: Ionut Nechita <ionut.nechita@windriver.com>
> ---
>  include/linux/ceph/messenger.h | 11 +++++++
>  net/ceph/messenger.c           | 55 ++++++++++++++++++++++++++++++++--
>  2 files changed, 63 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> index 1717cc57cdacd..730a754353aed 100644
> --- a/include/linux/ceph/messenger.h
> +++ b/include/linux/ceph/messenger.h
> @@ -320,6 +320,13 @@ struct ceph_msg {
>  /* ceph connection fault delay defaults, for exponential backoff */
>  #define BASE_DELAY_INTERVAL	(HZ / 4)
>  #define MAX_DELAY_INTERVAL	(15 * HZ)
> +/*
> + * Shorter retry delay for EADDRNOTAVAIL. This error typically indicates
> + * a transient condition (IPv6 DAD in progress, address reconfiguration,
> + * temporary route issue) that resolves in 1-2 seconds. Fast retries
> + * allow quick recovery without exponential backoff delays.
> + */
> +#define ADDRNOTAVAIL_DELAY	(HZ / 10)

What's wrong with BASE_DELAY_INTERVAL? I don't see a big difference between HZ/4
and HZ/10.

>  
>  struct ceph_connection_v1_info {
>  	struct kvec out_kvec[8],         /* sending header/footer data */
> @@ -360,6 +367,8 @@ struct ceph_connection_v1_info {
>  	u32 connect_seq;      /* identify the most recent connection
>  				 attempt for this session */
>  	u32 peer_global_seq;  /* peer's global seq for this connection */
> +
> +	bool addr_notavail;  /* address not available (transient) */

You've introduced the same field for v1 and v2. But why haven't you put it in
struct ceph_connection instead? In that case, you wouldn't need to use
ceph_msgr2(from_msgr(con->msgr)) everywhere.

>  };
>  
>  #define CEPH_CRC_LEN			4
> @@ -430,6 +439,8 @@ struct ceph_connection_v2_info {
>  
>  	int con_mode;  /* CEPH_CON_MODE_* */
>  
> +	bool addr_notavail;  /* address not available (transient) */
> +
>  	void *conn_bufs[16];
>  	int conn_buf_cnt;
>  	int data_len_remain;
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index 9f6d860411cbd..c40c7c332e7f4 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -466,8 +466,22 @@ int ceph_tcp_connect(struct ceph_connection *con)
>  		     ceph_pr_addr(&con->peer_addr),
>  		     sock->sk->sk_state);
>  	} else if (ret < 0) {
> -		pr_err("connect %s error %d\n",
> -		       ceph_pr_addr(&con->peer_addr), ret);
> +		if (ret == -EADDRNOTAVAIL) {
> +			/*
> +			 * Address not yet available - could be IPv6 DAD in
> +			 * progress, address reconfiguration, or temporary
> +			 * route issue. Use shorter delay.
> +			 */
> +			pr_warn_ratelimited("connect %s: address not available (DAD/route issue?), will retry\n",
> +					    ceph_pr_addr(&con->peer_addr));
> +			if (ceph_msgr2(from_msgr(con->msgr)))
> +				con->v2.addr_notavail = true;
> +			else
> +				con->v1.addr_notavail = true;
> +		} else {
> +			pr_err("connect %s error %d\n",
> +			       ceph_pr_addr(&con->peer_addr), ret);
> +		}
>  		sock_release(sock);
>  		return ret;
>  	}
> @@ -476,6 +490,13 @@ int ceph_tcp_connect(struct ceph_connection *con)
>  		tcp_sock_set_nodelay(sock->sk);
>  
>  	con->sock = sock;
> +
> +	/* Clear addr_notavail flag on successful connection */
> +	if (ceph_msgr2(from_msgr(con->msgr)))
> +		con->v2.addr_notavail = false;
> +	else
> +		con->v1.addr_notavail = false;
> +
>  	return 0;
>  }
>  
> @@ -609,6 +630,13 @@ void ceph_con_open(struct ceph_connection *con,
>  
>  	memcpy(&con->peer_addr, addr, sizeof(*addr));
>  	con->delay = 0;      /* reset backoff memory */
> +
> +	/* Clear addr_notavail flag when opening/reopening connection */
> +	if (ceph_msgr2(from_msgr(con->msgr)))
> +		con->v2.addr_notavail = false;
> +	else
> +		con->v1.addr_notavail = false;
> +
>  	mutex_unlock(&con->mutex);
>  	queue_con(con);
>  }
> @@ -1613,6 +1641,8 @@ static void ceph_con_workfn(struct work_struct *work)
>   */
>  static void con_fault(struct ceph_connection *con)
>  {
> +	bool addr_issue = false;

What's the point of introducing this local variable? Why not use
con->v2.addr_notavail where you are using addr_issue? Any particular reason for
this?

> +
>  	dout("fault %p state %d to peer %s\n",
>  	     con, con->state, ceph_pr_addr(&con->peer_addr));
>  
> @@ -1620,6 +1650,19 @@ static void con_fault(struct ceph_connection *con)
>  		ceph_pr_addr(&con->peer_addr), con->error_msg);
>  	con->error_msg = NULL;
>  
> +	/* Check and reset addr_notavail flag if set */
> +	if (ceph_msgr2(from_msgr(con->msgr))) {
> +		if (con->v2.addr_notavail) {
> +			addr_issue = true;
> +			con->v2.addr_notavail = false;
> +		}
> +	} else {
> +		if (con->v1.addr_notavail) {
> +			addr_issue = true;
> +			con->v1.addr_notavail = false;
> +		}
> +	}
> +
>  	WARN_ON(con->state == CEPH_CON_S_STANDBY ||
>  		con->state == CEPH_CON_S_CLOSED);
>  
> @@ -1644,7 +1687,13 @@ static void con_fault(struct ceph_connection *con)
>  	} else {
>  		/* retry after a delay. */
>  		con->state = CEPH_CON_S_PREOPEN;
> -		if (!con->delay) {
> +		if (addr_issue) {
> +			/*
> +			 * Address not available - use shorter delay as this
> +			 * is often a transient condition.
> +			 */
> +			con->delay = ADDRNOTAVAIL_DELAY;

So, the main point of introducing con->v2/1.addr_notavail is to set this delay.
I am not sure that there is a big difference between HZ/4 and HZ/10. Do we really
need to change the delay here?

Thanks,
Slava.

> +		} else if (!con->delay) {
>  			con->delay = BASE_DELAY_INTERVAL;
>  		} else if (con->delay < MAX_DELAY_INTERVAL) {
>  			con->delay *= 2;