fs/ceph/super.c                 |  4 +++-
include/linux/ceph/osd_client.h |  2 +-
net/ceph/osd_client.c           | 15 +++++++++++++--
3 files changed, 17 insertions(+), 4 deletions(-)
From: Ionut Nechita <ionut.nechita@windriver.com>
When a Ceph OSD becomes unreachable (e.g., due to IPv6 EADDRNOTAVAIL
during DAD or network transitions), the sync syscall can block
indefinitely in ceph_osdc_sync(). This function iterates over all
in-flight write requests and calls wait_for_completion() with no
timeout on each one. The hung_task detector fires repeatedly with
stack traces showing:
ceph_osdc_sync [libceph]
ceph_sync_fs [ceph]
iterate_supers
ksys_sync
Since ceph_osdc_sync() is called before ceph_mdsc_sync() in
ceph_sync_fs(), an OSD hang prevents the MDS timeout protection
from commit e789e5252fda ("ceph: add timeout protection to
ceph_mdsc_sync() path") from ever being reached.
This is particularly problematic in Kubernetes environments with
PREEMPT_RT kernels where Ceph storage pods undergo rolling updates
and IPv6 network reconfigurations cause temporary OSD unavailability.
Fix this by adding a mount_timeout-based timeout to the blocking wait,
following the existing pattern used by wait_request_timeout() in the
same file:
- ceph_osdc_sync(): use wait_for_completion_timeout() with
mount_timeout instead of an indefinite wait_for_completion()
- Change return type from void to int, return -ETIMEDOUT on timeout
- ceph_sync_fs(): propagate OSD sync error, short-circuit before
MDS sync on failure
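On timeout, pending OSD requests are NOT cancelled; they remain
in-flight and complete when the OSD reconnects. The timeout simply
unblocks the calling task. Note that the timeout is applied to each
waited-on request individually, so the total time spent in
ceph_osdc_sync() can exceed mount_timeout when several requests each
complete just before their own wait expires. If mount_timeout is set
to 0, ceph_timeout_jiffies() returns MAX_SCHEDULE_TIMEOUT, preserving
the original infinite-wait behavior.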
Signed-off-by: Ionut Nechita <ionut.nechita@windriver.com>
---
fs/ceph/super.c | 4 +++-
include/linux/ceph/osd_client.h | 2 +-
net/ceph/osd_client.c | 15 +++++++++++++--
3 files changed, 17 insertions(+), 4 deletions(-)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 00d415af9680..f5ff8f505f85 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -133,7 +133,9 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
}
doutc(cl, "(blocking)\n");
- ceph_osdc_sync(&fsc->client->osdc);
+ ret = ceph_osdc_sync(&fsc->client->osdc);
+ if (ret)
+ return ret;
ret = ceph_mdsc_sync(fsc->mdsc);
doutc(cl, "(blocking) done\n");
return ret;
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index d7941478158c..871827e2dd98 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -587,7 +587,7 @@ void ceph_osdc_start_request(struct ceph_osd_client *osdc,
extern void ceph_osdc_cancel_request(struct ceph_osd_request *req);
extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
struct ceph_osd_request *req);
-extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
+extern int ceph_osdc_sync(struct ceph_osd_client *osdc);
extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 2f1c461e0ffc..67f99579ad0c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -4744,10 +4744,13 @@ EXPORT_SYMBOL(ceph_osdc_wait_request);
/*
* sync - wait for all in-flight requests to flush. avoid starvation.
*/
-void ceph_osdc_sync(struct ceph_osd_client *osdc)
+int ceph_osdc_sync(struct ceph_osd_client *osdc)
{
+ struct ceph_options *opts = osdc->client->options;
+ unsigned long timeout = ceph_timeout_jiffies(opts->mount_timeout);
struct rb_node *n, *p;
u64 last_tid = atomic64_read(&osdc->last_tid);
+ unsigned long left;
again:
down_read(&osdc->lock);
@@ -4770,7 +4773,14 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc)
up_read(&osdc->lock);
dout("%s waiting on req %p tid %llu last_tid %llu\n",
__func__, req, req->r_tid, last_tid);
- wait_for_completion(&req->r_completion);
+ left = wait_for_completion_timeout(&req->r_completion,
+ timeout);
+ if (!left) {
+ pr_warn("ceph: osd sync request tid %llu timed out\n",
+ req->r_tid);
+ ceph_osdc_put_request(req);
+ return -ETIMEDOUT;
+ }
ceph_osdc_put_request(req);
goto again;
}
@@ -4780,6 +4790,7 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc)
up_read(&osdc->lock);
dout("%s done last_tid %llu\n", __func__, last_tid);
+ return 0;
}
EXPORT_SYMBOL(ceph_osdc_sync);
--
2.53.0
On Wed, 2026-02-18 at 21:49 +0200, Ionut Nechita (Wind River) wrote:
> From: Ionut Nechita <ionut.nechita@windriver.com>
>
> When a Ceph OSD becomes unreachable (e.g., due to IPv6 EADDRNOTAVAIL
> during DAD or network transitions), the sync syscall can block
> indefinitely in ceph_osdc_sync(). This function iterates over all
> in-flight write requests and calls wait_for_completion() with no
> timeout on each one. The hung_task detector fires repeatedly with
> stack traces showing:
>
> ceph_osdc_sync [libceph]
> ceph_sync_fs [ceph]
> iterate_supers
> ksys_sync
>
> Since ceph_osdc_sync() is called before ceph_mdsc_sync() in
> ceph_sync_fs(), an OSD hang prevents the MDS timeout protection
> from commit e789e5252fda ("ceph: add timeout protection to
> ceph_mdsc_sync() path") from ever being reached.
>
> This is particularly problematic in Kubernetes environments with
> PREEMPT_RT kernels where Ceph storage pods undergo rolling updates
> and IPv6 network reconfigurations cause temporary OSD unavailability.
>
> Fix this by adding mount_timeout-based timeout to the blocking wait,
> following the existing pattern used by wait_request_timeout() in the
> same file:
>
> - ceph_osdc_sync(): use wait_for_completion_timeout() with
> mount_timeout instead of indefinite wait_for_completion()
> - Change return type from void to int, return -ETIMEDOUT on timeout
> - ceph_sync_fs(): propagate OSD sync error, short-circuit before
> MDS sync on failure
>
> On timeout, pending OSD requests are NOT cancelled - they remain
> in-flight and complete when the OSD reconnects. The timeout simply
> unblocks the calling task. If mount_timeout is set to 0,
> ceph_timeout_jiffies() returns MAX_SCHEDULE_TIMEOUT, preserving the
> original infinite-wait behavior.
I am still not completely convinced that adding a timeout is the proper fix.
We probably have a race condition in the code that needs to be fixed. I
suspect that the suggested solution is a workaround that hides the underlying
issue.
Thanks,
Slava.
>
> Signed-off-by: Ionut Nechita <ionut.nechita@windriver.com>
> ---
> fs/ceph/super.c | 4 +++-
> include/linux/ceph/osd_client.h | 2 +-
> net/ceph/osd_client.c | 15 +++++++++++++--
> 3 files changed, 17 insertions(+), 4 deletions(-)
>
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index 00d415af9680..f5ff8f505f85 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -133,7 +133,9 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
> }
>
> doutc(cl, "(blocking)\n");
> - ceph_osdc_sync(&fsc->client->osdc);
> + ret = ceph_osdc_sync(&fsc->client->osdc);
> + if (ret)
> + return ret;
> ret = ceph_mdsc_sync(fsc->mdsc);
> doutc(cl, "(blocking) done\n");
> return ret;
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index d7941478158c..871827e2dd98 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -587,7 +587,7 @@ void ceph_osdc_start_request(struct ceph_osd_client *osdc,
> extern void ceph_osdc_cancel_request(struct ceph_osd_request *req);
> extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
> struct ceph_osd_request *req);
> -extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
> +extern int ceph_osdc_sync(struct ceph_osd_client *osdc);
>
> extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
> void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 2f1c461e0ffc..67f99579ad0c 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -4744,10 +4744,13 @@ EXPORT_SYMBOL(ceph_osdc_wait_request);
> /*
> * sync - wait for all in-flight requests to flush. avoid starvation.
> */
> -void ceph_osdc_sync(struct ceph_osd_client *osdc)
> +int ceph_osdc_sync(struct ceph_osd_client *osdc)
> {
> + struct ceph_options *opts = osdc->client->options;
> + unsigned long timeout = ceph_timeout_jiffies(opts->mount_timeout);
> struct rb_node *n, *p;
> u64 last_tid = atomic64_read(&osdc->last_tid);
> + unsigned long left;
>
> again:
> down_read(&osdc->lock);
> @@ -4770,7 +4773,14 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc)
> up_read(&osdc->lock);
> dout("%s waiting on req %p tid %llu last_tid %llu\n",
> __func__, req, req->r_tid, last_tid);
> - wait_for_completion(&req->r_completion);
> + left = wait_for_completion_timeout(&req->r_completion,
> + timeout);
> + if (!left) {
> + pr_warn("ceph: osd sync request tid %llu timed out\n",
> + req->r_tid);
> + ceph_osdc_put_request(req);
> + return -ETIMEDOUT;
> + }
> ceph_osdc_put_request(req);
> goto again;
> }
> @@ -4780,6 +4790,7 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc)
>
> up_read(&osdc->lock);
> dout("%s done last_tid %llu\n", __func__, last_tid);
> + return 0;
> }
> EXPORT_SYMBOL(ceph_osdc_sync);
>
© 2016 - 2026 Red Hat, Inc.