[PATCH v2] ceph: fix deadlock in ceph_readdir_prepopulate
Posted by Zhao Sun 1 month, 3 weeks ago
A deadlock can occur when ceph_get_inode is called outside of locks:

1) handle_reply calls ceph_get_inode, gets a new inode with I_NEW,
   and blocks on mdsc->snap_rwsem for write.

2) At the same time, ceph_readdir_prepopulate calls ceph_get_inode
   for the same inode while holding mdsc->snap_rwsem for read,
   and blocks on I_NEW.

This causes an ABBA deadlock between mdsc->snap_rwsem and the I_NEW bit.
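
To make the lock ordering concrete, below is a minimal user-space sketch of
the same ABBA pattern. This is an illustration only, not the kernel code
paths: a pthread rwlock stands in for mdsc->snap_rwsem, a condition-protected
flag stands in for the I_NEW bit, and all names (reply_thread,
readdir_thread, inode_new) are made up for the demo. Running it is expected
to hang, which is the point.

/*
 * Build with: cc -pthread abba_demo.c
 * Expected behaviour: prints both messages, then deadlocks forever.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t snap_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t new_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t new_cond = PTHREAD_COND_INITIALIZER;
static bool inode_new = true;	/* stands in for I_NEW on the shared inode */

/* Like handle_reply: already owns the I_NEW inode, now wants snap_rwsem. */
static void *reply_thread(void *arg)
{
	(void)arg;
	printf("reply: holds I_NEW, waiting for snap_rwsem (write)\n");
	pthread_rwlock_wrlock(&snap_rwsem);	/* blocks: reader never lets go */

	pthread_mutex_lock(&new_lock);		/* never reached: would clear I_NEW */
	inode_new = false;
	pthread_cond_broadcast(&new_cond);
	pthread_mutex_unlock(&new_lock);
	pthread_rwlock_unlock(&snap_rwsem);
	return NULL;
}

/* Like ceph_readdir_prepopulate: holds snap_rwsem for read, waits on I_NEW. */
static void *readdir_thread(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&snap_rwsem);
	printf("readdir: holds snap_rwsem (read), waiting for I_NEW to clear\n");

	pthread_mutex_lock(&new_lock);
	while (inode_new)			/* blocks: only reply_thread clears it */
		pthread_cond_wait(&new_cond, &new_lock);
	pthread_mutex_unlock(&new_lock);

	pthread_rwlock_unlock(&snap_rwsem);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&b, NULL, readdir_thread, NULL);	/* reader takes the lock first */
	sleep(1);
	pthread_create(&a, NULL, reply_thread, NULL);

	pthread_join(a, NULL);			/* never returns: ABBA deadlock */
	pthread_join(b, NULL);
	return 0;
}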

The issue was introduced by commit bca9fc14c70f
("ceph: when filling trace, call ceph_get_inode outside of mutexes")
which attempted to avoid a deadlock involving ceph_check_caps.

That concern is now obsolete since commit 6a92b08fdad2
("ceph: don't take s_mutex or snap_rwsem in ceph_check_caps"),
after which ceph_check_caps no longer takes those locks.

This patch primarily reverts bca9fc14c70f to resolve the new deadlock,
with a few minor adjustments to fit the current codebase.

Link: https://tracker.ceph.com/issues/72307
Signed-off-by: Zhao Sun <sunzhao03@kuaishou.com>
---
 fs/ceph/inode.c      | 26 ++++++++++++++++++++++----
 fs/ceph/mds_client.c | 29 -----------------------------
 2 files changed, 22 insertions(+), 33 deletions(-)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 06cd2963e41e..d0f0035ee117 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1623,10 +1623,28 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
 	}
 
 	if (rinfo->head->is_target) {
-		/* Should be filled in by handle_reply */
-		BUG_ON(!req->r_target_inode);
+		in = xchg(&req->r_new_inode, NULL);
+		tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+		tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+
+		/*
+		 * If we ended up opening an existing inode, discard
+		 * r_new_inode
+		 */
+		if (req->r_op == CEPH_MDS_OP_CREATE &&
+		    !req->r_reply_info.has_create_ino) {
+			/* This should never happen on an async create */
+			WARN_ON_ONCE(req->r_deleg_ino);
+			iput(in);
+			in = NULL;
+		}
+
+		in = ceph_get_inode(mdsc->fsc->sb, tvino, in);
+		if (IS_ERR(in)) {
+			err = PTR_ERR(in);
+			goto done;
+		}
 
-		in = req->r_target_inode;
 		err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
 				NULL, session,
 				(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
@@ -1636,13 +1654,13 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
 		if (err < 0) {
 			pr_err_client(cl, "badness %p %llx.%llx\n", in,
 				      ceph_vinop(in));
-			req->r_target_inode = NULL;
 			if (in->i_state & I_NEW)
 				discard_new_inode(in);
 			else
 				iput(in);
 			goto done;
 		}
+		req->r_target_inode = in;
 		if (in->i_state & I_NEW)
 			unlock_new_inode(in);
 	}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 230e0c3f341f..8b70f2b96f46 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3874,36 +3874,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 				       session->s_con.peer_features);
 	mutex_unlock(&mdsc->mutex);
 
-	/* Must find target inode outside of mutexes to avoid deadlocks */
 	rinfo = &req->r_reply_info;
-	if ((err >= 0) && rinfo->head->is_target) {
-		struct inode *in = xchg(&req->r_new_inode, NULL);
-		struct ceph_vino tvino = {
-			.ino  = le64_to_cpu(rinfo->targeti.in->ino),
-			.snap = le64_to_cpu(rinfo->targeti.in->snapid)
-		};
-
-		/*
-		 * If we ended up opening an existing inode, discard
-		 * r_new_inode
-		 */
-		if (req->r_op == CEPH_MDS_OP_CREATE &&
-		    !req->r_reply_info.has_create_ino) {
-			/* This should never happen on an async create */
-			WARN_ON_ONCE(req->r_deleg_ino);
-			iput(in);
-			in = NULL;
-		}
-
-		in = ceph_get_inode(mdsc->fsc->sb, tvino, in);
-		if (IS_ERR(in)) {
-			err = PTR_ERR(in);
-			mutex_lock(&session->s_mutex);
-			goto out_err;
-		}
-		req->r_target_inode = in;
-	}
-
 	mutex_lock(&session->s_mutex);
 	if (err < 0) {
 		pr_err_client(cl, "got corrupt reply mds%d(tid:%lld)\n",
-- 
2.39.2 (Apple Git-143)
Re: [PATCH v2] ceph: fix deadlock in ceph_readdir_prepopulate
Posted by Viacheslav Dubeyko 1 month, 3 weeks ago
On Fri, 2025-08-08 at 15:08 +0800, Zhao Sun wrote:
> A deadlock can occur when ceph_get_inode is called outside of locks:
> 
> 1) handle_reply calls ceph_get_inode, gets a new inode with I_NEW,
>    and blocks on mdsc->snap_rwsem for write.
> 

Frankly speaking, it's hard to follow your logic. Which particular
mdsc->snap_rwsem lock do you mean in handle_reply()?

> 2) At the same time, ceph_readdir_prepopulate calls ceph_get_inode
>    for the same inode while holding mdsc->snap_rwsem for read,
>    and blocks on I_NEW.
> 

Same question here. Which particular mdsc->snap_rwsem lock do you mean in
ceph_readdir_prepopulate()?

> This causes an ABBA deadlock between mdsc->snap_rwsem and the I_NEW bit.
> 
> The issue was introduced by commit bca9fc14c70f
> ("ceph: when filling trace, call ceph_get_inode outside of mutexes")
> which attempted to avoid a deadlock involving ceph_check_caps.
> 
> That concern is now obsolete since commit 6a92b08fdad2
> ("ceph: don't take s_mutex or snap_rwsem in ceph_check_caps"),
> after which ceph_check_caps no longer takes those locks.
> 
> This patch primarily reverts bca9fc14c70f to resolve the new deadlock,
> with a few minor adjustments to fit the current codebase.
> 

I assume that you hit the issue in practice. I believe it would be good to
have an explanation of which use-case/workload triggers the issue and which
symptoms you see (system log contents, for example).

Thanks,
Slava.
