include/linux/ceph/messenger.h | 11 +++++++ net/ceph/messenger.c | 55 ++++++++++++++++++++++++++++++++-- 2 files changed, 63 insertions(+), 3 deletions(-)
From: Ionut Nechita <ionut.nechita@windriver.com>
When connecting to Ceph monitors/OSDs, kernel_connect() may return
-EADDRNOTAVAIL if the source address is unavailable. This occurs
during:
- IPv6 Duplicate Address Detection (DAD)
- IPv4/IPv6 interface state changes (link up/down events)
- Address removal or reconfiguration on the interface
- Network namespace transitions in containerized environments
- CNI reconfigurations in Kubernetes/StarlingX rolling upgrades
Currently, libceph treats EADDRNOTAVAIL like any other connection error
and enters exponential backoff (BASE_DELAY_INTERVAL 250ms doubling up
to MAX_DELAY_INTERVAL 15s). Additionally, the monitor client has its
own hunt-level backoff (CEPH_MONC_HUNT_INTERVAL 3s * hunt_mult, where
hunt_mult doubles up to 10x = 30s max). These two backoff mechanisms
compound: at steady state each monitor gets ~30 seconds of attempts
with connection-level delays up to 15s, and the round-trip through
all monitors takes ~60 seconds.
In production on a StarlingX system (6.12.0-1-rt-amd64, Dell PowerEdge
R720, IPv6-only Ceph cluster with 2 monitors), the EADDRNOTAVAIL
condition persisted for ~36 minutes during a rolling upgrade:
13:20:52 - mon0 session lost, hunting begins, first error -99
13:57:03 - mon0 session finally re-established
~470 failed connect attempts across both monitors
sync task blocked for 983+ seconds, triggering hung task warnings:
"INFO: task sync:514917 blocked for more than 122 seconds"
...repeated at 245s, 368s, 491s, 614s, 737s, 860s, 983s
The duration of EADDRNOTAVAIL varies by environment: it can be brief
(simple DAD, 1-2s) or prolonged (complex network reconfiguration
during rolling upgrades, minutes). In both cases, the key issue is
that exponential backoff up to 15s wastes time once the address
becomes available -- the client may sit idle for up to 15 seconds
before attempting to reconnect.
This patch bypasses the exponential backoff for EADDRNOTAVAIL by
using a fixed short retry interval (ADDRNOTAVAIL_DELAY, HZ/10 =
100ms). This ensures that a reconnection attempt is made within about
100ms of the address becoming available, rather than up to 15 seconds later.
Implementation:
- Detect EADDRNOTAVAIL in ceph_tcp_connect() for both IPv4 and IPv6
- Signal the condition to con_fault() via an addr_notavail flag
(per-protocol: v1 and v2)
- In con_fault(), use ADDRNOTAVAIL_DELAY instead of exponential
backoff when the flag is set
- Clear the flag on successful connection and when reopening
- Use pr_warn_ratelimited() instead of pr_err() for this case
The fast retry is appropriate because each attempt is inexpensive
(kernel_connect() fails immediately when the address is unavailable)
and quick recovery is critical for storage availability.
Fixes: 60bf8bf8815e6 ("libceph: fix msgr backoff")
Signed-off-by: Ionut Nechita <ionut.nechita@windriver.com>
---
Changes since v1:
- Corrected commit message: removed incorrect "1-2 seconds" claim,
added actual production dmesg data showing the 36-minute
EADDRNOTAVAIL duration and explained the two compounding backoff
mechanisms (connection-level + monitor hunt-level)
include/linux/ceph/messenger.h | 11 +++++++
net/ceph/messenger.c | 55 ++++++++++++++++++++++++++++++++--
2 files changed, 63 insertions(+), 3 deletions(-)
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 6aa4c6478c9f6..ec08d02a9d4bd 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -321,6 +321,13 @@ struct ceph_msg {
/* ceph connection fault delay defaults, for exponential backoff */
#define BASE_DELAY_INTERVAL (HZ / 4)
#define MAX_DELAY_INTERVAL (15 * HZ)
+/*
+ * Fixed, short retry delay for EADDRNOTAVAIL. This error usually indicates
+ * a transient condition (IPv6 DAD in progress, address reconfiguration,
+ * temporary route issue) that may clear within seconds or persist for
+ * minutes. Fast fixed-interval retries avoid exponential backoff delays.
+ */
+#define ADDRNOTAVAIL_DELAY (HZ / 10)
struct ceph_connection_v1_info {
struct kvec out_kvec[8], /* sending header/footer data */
@@ -361,6 +368,8 @@ struct ceph_connection_v1_info {
u32 connect_seq; /* identify the most recent connection
attempt for this session */
u32 peer_global_seq; /* peer's global seq for this connection */
+
+ bool addr_notavail; /* address not available (transient) */
};
#define CEPH_CRC_LEN 4
@@ -432,6 +441,8 @@ struct ceph_connection_v2_info {
int con_mode; /* CEPH_CON_MODE_* */
+ bool addr_notavail; /* address not available (transient) */
+
void *conn_bufs[16];
int conn_buf_cnt;
int data_len_remain;
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 70b25f4ecba67..d86efcfb7b87f 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -467,8 +467,22 @@ int ceph_tcp_connect(struct ceph_connection *con)
ceph_pr_addr(&con->peer_addr),
sock->sk->sk_state);
} else if (ret < 0) {
- pr_err("connect %s error %d\n",
- ceph_pr_addr(&con->peer_addr), ret);
+ if (ret == -EADDRNOTAVAIL) {
+ /*
+ * Address not yet available - could be IPv6 DAD in
+ * progress, address reconfiguration, or temporary
+ * route issue. Flag it so con_fault() uses the shorter delay.
+ */
+ pr_warn_ratelimited("connect %s: address not available (DAD/route issue?), will retry\n",
+ ceph_pr_addr(&con->peer_addr));
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ con->v2.addr_notavail = true;
+ else
+ con->v1.addr_notavail = true;
+ } else {
+ pr_err("connect %s error %d\n",
+ ceph_pr_addr(&con->peer_addr), ret);
+ }
sock_release(sock);
return ret;
}
@@ -477,6 +491,13 @@ int ceph_tcp_connect(struct ceph_connection *con)
tcp_sock_set_nodelay(sock->sk);
con->sock = sock;
+
+ /* Clear addr_notavail flag on successful connection */
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ con->v2.addr_notavail = false;
+ else
+ con->v1.addr_notavail = false;
+
return 0;
}
@@ -610,6 +631,13 @@ void ceph_con_open(struct ceph_connection *con,
memcpy(&con->peer_addr, addr, sizeof(*addr));
con->delay = 0; /* reset backoff memory */
+
+ /* Clear addr_notavail flag when opening/reopening connection */
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ con->v2.addr_notavail = false;
+ else
+ con->v1.addr_notavail = false;
+
mutex_unlock(&con->mutex);
queue_con(con);
}
@@ -1614,6 +1642,8 @@ static void ceph_con_workfn(struct work_struct *work)
*/
static void con_fault(struct ceph_connection *con)
{
+ bool addr_issue = false;
+
dout("fault %p state %d to peer %s\n",
con, con->state, ceph_pr_addr(&con->peer_addr));
@@ -1621,6 +1651,19 @@ static void con_fault(struct ceph_connection *con)
ceph_pr_addr(&con->peer_addr), con->error_msg);
con->error_msg = NULL;
+ /* Check and reset addr_notavail flag if set */
+ if (ceph_msgr2(from_msgr(con->msgr))) {
+ if (con->v2.addr_notavail) {
+ addr_issue = true;
+ con->v2.addr_notavail = false;
+ }
+ } else {
+ if (con->v1.addr_notavail) {
+ addr_issue = true;
+ con->v1.addr_notavail = false;
+ }
+ }
+
WARN_ON(con->state == CEPH_CON_S_STANDBY ||
con->state == CEPH_CON_S_CLOSED);
@@ -1645,7 +1688,13 @@ static void con_fault(struct ceph_connection *con)
} else {
/* retry after a delay. */
con->state = CEPH_CON_S_PREOPEN;
- if (!con->delay) {
+ if (addr_issue) {
+ /*
+ * Address not available - use shorter delay as this
+ * is often a transient condition.
+ */
+ con->delay = ADDRNOTAVAIL_DELAY;
+ } else if (!con->delay) {
con->delay = BASE_DELAY_INTERVAL;
} else if (con->delay < MAX_DELAY_INTERVAL) {
con->delay *= 2;
--
2.52.0
© 2016 - 2026 Red Hat, Inc.