[PATCH] ceph: add timeout protection to ceph_osdc_sync() path

Ionut Nechita (Wind River) posted 1 patch 1 month, 3 weeks ago
fs/ceph/super.c                 |  4 +++-
include/linux/ceph/osd_client.h |  2 +-
net/ceph/osd_client.c           | 15 +++++++++++++--
3 files changed, 17 insertions(+), 4 deletions(-)
[PATCH] ceph: add timeout protection to ceph_osdc_sync() path
Posted by Ionut Nechita (Wind River) 1 month, 3 weeks ago
From: Ionut Nechita <ionut.nechita@windriver.com>

When a Ceph OSD becomes unreachable (e.g., due to IPv6 EADDRNOTAVAIL
during DAD or network transitions), the sync syscall can block
indefinitely in ceph_osdc_sync(). This function iterates over all
in-flight write requests and calls wait_for_completion() with no
timeout on each one. The hung_task detector fires repeatedly with
stack traces showing:

  ceph_osdc_sync [libceph]
  ceph_sync_fs [ceph]
  iterate_supers
  ksys_sync

Since ceph_osdc_sync() is called before ceph_mdsc_sync() in
ceph_sync_fs(), an OSD hang prevents the MDS timeout protection
from commit e789e5252fda ("ceph: add timeout protection to
ceph_mdsc_sync() path") from ever being reached.

This is particularly problematic in Kubernetes environments with
PREEMPT_RT kernels where Ceph storage pods undergo rolling updates
and IPv6 network reconfigurations cause temporary OSD unavailability.

Fix this by adding a mount_timeout-based timeout to the blocking wait,
following the existing pattern used by wait_request_timeout() in the
same file:

- ceph_osdc_sync(): use wait_for_completion_timeout() with
  mount_timeout instead of indefinite wait_for_completion()
- Change return type from void to int, return -ETIMEDOUT on timeout
- ceph_sync_fs(): propagate OSD sync error, short-circuit before
  MDS sync on failure

On timeout, pending OSD requests are NOT cancelled - they remain
in-flight and complete when the OSD reconnects. The timeout simply
unblocks the calling task. If mount_timeout is set to 0,
ceph_timeout_jiffies() returns MAX_SCHEDULE_TIMEOUT, preserving the
original infinite-wait behavior.

Signed-off-by: Ionut Nechita <ionut.nechita@windriver.com>
---
 fs/ceph/super.c                 |  4 +++-
 include/linux/ceph/osd_client.h |  2 +-
 net/ceph/osd_client.c           | 15 +++++++++++++--
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 00d415af9680..f5ff8f505f85 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -133,7 +133,9 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
 	}
 
 	doutc(cl, "(blocking)\n");
-	ceph_osdc_sync(&fsc->client->osdc);
+	ret = ceph_osdc_sync(&fsc->client->osdc);
+	if (ret)
+		return ret;
 	ret = ceph_mdsc_sync(fsc->mdsc);
 	doutc(cl, "(blocking) done\n");
 	return ret;
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index d7941478158c..871827e2dd98 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -587,7 +587,7 @@ void ceph_osdc_start_request(struct ceph_osd_client *osdc,
 extern void ceph_osdc_cancel_request(struct ceph_osd_request *req);
 extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
 				  struct ceph_osd_request *req);
-extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
+extern int ceph_osdc_sync(struct ceph_osd_client *osdc);
 
 extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
 void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 2f1c461e0ffc..67f99579ad0c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -4744,10 +4744,13 @@ EXPORT_SYMBOL(ceph_osdc_wait_request);
 /*
  * sync - wait for all in-flight requests to flush.  avoid starvation.
  */
-void ceph_osdc_sync(struct ceph_osd_client *osdc)
+int ceph_osdc_sync(struct ceph_osd_client *osdc)
 {
+	struct ceph_options *opts = osdc->client->options;
+	unsigned long timeout = ceph_timeout_jiffies(opts->mount_timeout);
 	struct rb_node *n, *p;
 	u64 last_tid = atomic64_read(&osdc->last_tid);
+	unsigned long left;
 
 again:
 	down_read(&osdc->lock);
@@ -4770,7 +4773,14 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc)
 			up_read(&osdc->lock);
 			dout("%s waiting on req %p tid %llu last_tid %llu\n",
 			     __func__, req, req->r_tid, last_tid);
-			wait_for_completion(&req->r_completion);
+			left = wait_for_completion_timeout(&req->r_completion,
+							   timeout);
+			if (!left) {
+				pr_warn("ceph: osd sync request tid %llu timed out\n",
+					req->r_tid);
+				ceph_osdc_put_request(req);
+				return -ETIMEDOUT;
+			}
 			ceph_osdc_put_request(req);
 			goto again;
 		}
@@ -4780,6 +4790,7 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc)
 
 	up_read(&osdc->lock);
 	dout("%s done last_tid %llu\n", __func__, last_tid);
+	return 0;
 }
 EXPORT_SYMBOL(ceph_osdc_sync);
 
-- 
2.53.0
Re: [PATCH] ceph: add timeout protection to ceph_osdc_sync() path
Posted by Viacheslav Dubeyko 1 month, 3 weeks ago
On Wed, 2026-02-18 at 21:49 +0200, Ionut Nechita (Wind River) wrote:
> From: Ionut Nechita <ionut.nechita@windriver.com>
> 
> When a Ceph OSD becomes unreachable (e.g., due to IPv6 EADDRNOTAVAIL
> during DAD or network transitions), the sync syscall can block
> indefinitely in ceph_osdc_sync(). This function iterates over all
> in-flight write requests and calls wait_for_completion() with no
> timeout on each one. The hung_task detector fires repeatedly with
> stack traces showing:
> 
>   ceph_osdc_sync [libceph]
>   ceph_sync_fs [ceph]
>   iterate_supers
>   ksys_sync
> 
> Since ceph_osdc_sync() is called before ceph_mdsc_sync() in
> ceph_sync_fs(), an OSD hang prevents the MDS timeout protection
> from commit e789e5252fda ("ceph: add timeout protection to
> ceph_mdsc_sync() path") from ever being reached.
> 
> This is particularly problematic in Kubernetes environments with
> PREEMPT_RT kernels where Ceph storage pods undergo rolling updates
> and IPv6 network reconfigurations cause temporary OSD unavailability.
> 
> Fix this by adding a mount_timeout-based timeout to the blocking wait,
> following the existing pattern used by wait_request_timeout() in the
> same file:
> 
> - ceph_osdc_sync(): use wait_for_completion_timeout() with
>   mount_timeout instead of indefinite wait_for_completion()
> - Change return type from void to int, return -ETIMEDOUT on timeout
> - ceph_sync_fs(): propagate OSD sync error, short-circuit before
>   MDS sync on failure
> 
> On timeout, pending OSD requests are NOT cancelled - they remain
> in-flight and complete when the OSD reconnects. The timeout simply
> unblocks the calling task. If mount_timeout is set to 0,
> ceph_timeout_jiffies() returns MAX_SCHEDULE_TIMEOUT, preserving the
> original infinite-wait behavior.

I am still not completely convinced that adding a timeout is the proper fix.
We probably have a race condition in the code that needs to be fixed. I
suspect that the suggested solution is a workaround that is hiding the issue
somehow.

Thanks,
Slava.

> 
> Signed-off-by: Ionut Nechita <ionut.nechita@windriver.com>
> ---
>  fs/ceph/super.c                 |  4 +++-
>  include/linux/ceph/osd_client.h |  2 +-
>  net/ceph/osd_client.c           | 15 +++++++++++++--
>  3 files changed, 17 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index 00d415af9680..f5ff8f505f85 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -133,7 +133,9 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
>  	}
>  
>  	doutc(cl, "(blocking)\n");
> -	ceph_osdc_sync(&fsc->client->osdc);
> +	ret = ceph_osdc_sync(&fsc->client->osdc);
> +	if (ret)
> +		return ret;
>  	ret = ceph_mdsc_sync(fsc->mdsc);
>  	doutc(cl, "(blocking) done\n");
>  	return ret;
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index d7941478158c..871827e2dd98 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -587,7 +587,7 @@ void ceph_osdc_start_request(struct ceph_osd_client *osdc,
>  extern void ceph_osdc_cancel_request(struct ceph_osd_request *req);
>  extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
>  				  struct ceph_osd_request *req);
> -extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
> +extern int ceph_osdc_sync(struct ceph_osd_client *osdc);
>  
>  extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
>  void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 2f1c461e0ffc..67f99579ad0c 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -4744,10 +4744,13 @@ EXPORT_SYMBOL(ceph_osdc_wait_request);
>  /*
>   * sync - wait for all in-flight requests to flush.  avoid starvation.
>   */
> -void ceph_osdc_sync(struct ceph_osd_client *osdc)
> +int ceph_osdc_sync(struct ceph_osd_client *osdc)
>  {
> +	struct ceph_options *opts = osdc->client->options;
> +	unsigned long timeout = ceph_timeout_jiffies(opts->mount_timeout);
>  	struct rb_node *n, *p;
>  	u64 last_tid = atomic64_read(&osdc->last_tid);
> +	unsigned long left;
>  
>  again:
>  	down_read(&osdc->lock);
> @@ -4770,7 +4773,14 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc)
>  			up_read(&osdc->lock);
>  			dout("%s waiting on req %p tid %llu last_tid %llu\n",
>  			     __func__, req, req->r_tid, last_tid);
> -			wait_for_completion(&req->r_completion);
> +			left = wait_for_completion_timeout(&req->r_completion,
> +							   timeout);
> +			if (!left) {
> +				pr_warn("ceph: osd sync request tid %llu timed out\n",
> +					req->r_tid);
> +				ceph_osdc_put_request(req);
> +				return -ETIMEDOUT;
> +			}
>  			ceph_osdc_put_request(req);
>  			goto again;
>  		}
> @@ -4780,6 +4790,7 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc)
>  
>  	up_read(&osdc->lock);
>  	dout("%s done last_tid %llu\n", __func__, last_tid);
> +	return 0;
>  }
>  EXPORT_SYMBOL(ceph_osdc_sync);
>