A deadlock can occur when ceph_get_inode is called outside of locks:
1) handle_reply calls ceph_get_inode, gets a new inode with I_NEW,
and blocks on mdsc->snap_rwsem for write.
2) At the same time, ceph_readdir_prepopulate calls ceph_get_inode
for the same inode while holding mdsc->snap_rwsem for read,
and blocks on I_NEW.
This causes an ABBA deadlock between mdsc->snap_rwsem and the I_NEW bit.
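For illustration only, here is a minimal userspace sketch of that ordering, with a pthread rwlock standing in for mdsc->snap_rwsem and a flag/condvar standing in for the I_NEW bit (the thread names and structure are made up for the example; this is not the kernel code). Build with "cc -pthread"; running it is expected to hang, which is the deadlock:

/* Userspace illustration of the ABBA ordering described above (not kernel
 * code): a pthread rwlock stands in for mdsc->snap_rwsem and a flag guarded
 * by a mutex/condvar stands in for the inode's I_NEW bit.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t snap_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t  i_new_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t   i_new_cond = PTHREAD_COND_INITIALIZER;
static int i_new = 1;			/* the "inode" is new and still locked */

/* like handle_reply(): owns the I_NEW inode, then wants snap_rwsem for write */
static void *reply_thread(void *arg)
{
	(void)arg;
	printf("reply: holding I_NEW, waiting for snap_rwsem (write)\n");
	pthread_rwlock_wrlock(&snap_rwsem);	/* blocks: reader below holds it */

	pthread_mutex_lock(&i_new_lock);	/* never reached: would clear I_NEW */
	i_new = 0;
	pthread_cond_broadcast(&i_new_cond);
	pthread_mutex_unlock(&i_new_lock);
	pthread_rwlock_unlock(&snap_rwsem);
	return NULL;
}

/* like ceph_readdir_prepopulate(): holds snap_rwsem for read, waits on I_NEW */
static void *readdir_thread(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&snap_rwsem);
	printf("readdir: holding snap_rwsem (read), waiting for I_NEW\n");

	pthread_mutex_lock(&i_new_lock);
	while (i_new)				/* blocks: only reply_thread clears it */
		pthread_cond_wait(&i_new_cond, &i_new_lock);
	pthread_mutex_unlock(&i_new_lock);
	pthread_rwlock_unlock(&snap_rwsem);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&b, NULL, readdir_thread, NULL);
	sleep(1);		/* let the "readdir" side take snap_rwsem first */
	pthread_create(&a, NULL, reply_thread, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}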
The issue was introduced by commit bca9fc14c70f
("ceph: when filling trace, call ceph_get_inode outside of mutexes")
which attempted to avoid a deadlock involving ceph_check_caps.
That concern is now obsolete since commit 6a92b08fdad2
("ceph: don't take s_mutex or snap_rwsem in ceph_check_caps")
which made ceph_check_caps fully lock-free.
This patch primarily reverts bca9fc14c70f to resolve the new deadlock,
with a few minor adjustments to fit the current codebase.
Link: https://tracker.ceph.com/issues/72307
Signed-off-by: Zhao Sun <sunzhao03@kuaishou.com>
---
fs/ceph/inode.c | 26 ++++++++++++++++++++++----
fs/ceph/mds_client.c | 29 -----------------------------
2 files changed, 22 insertions(+), 33 deletions(-)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 06cd2963e41e..d0f0035ee117 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1623,10 +1623,28 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
}

if (rinfo->head->is_target) {
- /* Should be filled in by handle_reply */
- BUG_ON(!req->r_target_inode);
+ in = xchg(&req->r_new_inode, NULL);
+ tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+
+ /*
+ * If we ended up opening an existing inode, discard
+ * r_new_inode
+ */
+ if (req->r_op == CEPH_MDS_OP_CREATE &&
+ !req->r_reply_info.has_create_ino) {
+ /* This should never happen on an async create */
+ WARN_ON_ONCE(req->r_deleg_ino);
+ iput(in);
+ in = NULL;
+ }
+
+ in = ceph_get_inode(mdsc->fsc->sb, tvino, in);
+ if (IS_ERR(in)) {
+ err = PTR_ERR(in);
+ goto done;
+ }

- in = req->r_target_inode;
err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
NULL, session,
(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
@@ -1636,13 +1654,13 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
if (err < 0) {
pr_err_client(cl, "badness %p %llx.%llx\n", in,
ceph_vinop(in));
- req->r_target_inode = NULL;
if (in->i_state & I_NEW)
discard_new_inode(in);
else
iput(in);
goto done;
}
+ req->r_target_inode = in;
if (in->i_state & I_NEW)
unlock_new_inode(in);
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 230e0c3f341f..8b70f2b96f46 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3874,36 +3874,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
session->s_con.peer_features);
mutex_unlock(&mdsc->mutex);
- /* Must find target inode outside of mutexes to avoid deadlocks */
rinfo = &req->r_reply_info;
- if ((err >= 0) && rinfo->head->is_target) {
- struct inode *in = xchg(&req->r_new_inode, NULL);
- struct ceph_vino tvino = {
- .ino = le64_to_cpu(rinfo->targeti.in->ino),
- .snap = le64_to_cpu(rinfo->targeti.in->snapid)
- };
-
- /*
- * If we ended up opening an existing inode, discard
- * r_new_inode
- */
- if (req->r_op == CEPH_MDS_OP_CREATE &&
- !req->r_reply_info.has_create_ino) {
- /* This should never happen on an async create */
- WARN_ON_ONCE(req->r_deleg_ino);
- iput(in);
- in = NULL;
- }
-
- in = ceph_get_inode(mdsc->fsc->sb, tvino, in);
- if (IS_ERR(in)) {
- err = PTR_ERR(in);
- mutex_lock(&session->s_mutex);
- goto out_err;
- }
- req->r_target_inode = in;
- }
-
mutex_lock(&session->s_mutex);
if (err < 0) {
pr_err_client(cl, "got corrupt reply mds%d(tid:%lld)\n",
--
2.39.2 (Apple Git-143)
On Fri, 2025-08-08 at 15:08 +0800, Zhao Sun wrote:
> A deadlock can occur when ceph_get_inode is called outside of locks:
>
> 1) handle_reply calls ceph_get_inode, gets a new inode with I_NEW,
> and blocks on mdsc->snap_rwsem for write.
>
Frankly speaking, it's hard to follow your logic. Which particular
mdsc->snap_rwsem lock do you mean in handle_reply()?
> 2) At the same time, ceph_readdir_prepopulate calls ceph_get_inode
> for the same inode while holding mdsc->snap_rwsem for read,
> and blocks on I_NEW.
>
The same here. Which particular mdsc->snap_rwsem lock do you mean in
ceph_readdir_prepopulate()?
> This causes an ABBA deadlock between mdsc->snap_rwsem and the I_NEW bit.
>
> The issue was introduced by commit bca9fc14c70f
> ("ceph: when filling trace, call ceph_get_inode outside of mutexes")
> which attempted to avoid a deadlock involving ceph_check_caps.
>
> That concern is now obsolete since commit 6a92b08fdad2
> ("ceph: don't take s_mutex or snap_rwsem in ceph_check_caps")
> which made ceph_check_caps fully lock-free.
>
> This patch primarily reverts bca9fc14c70f to resolve the new deadlock,
> with a few minor adjustments to fit the current codebase.
>
I assume that you hit the issue. I believe it would be good to have an
explanation of which use-case/workload triggers the issue and which
symptoms you see (system log contents, for example).
Thanks,
Slava.
> Link: https://tracker.ceph.com/issues/72307
> Signed-off-by: Zhao Sun <sunzhao03@kuaishou.com>
> ---
> fs/ceph/inode.c | 26 ++++++++++++++++++++++----
> fs/ceph/mds_client.c | 29 -----------------------------
> 2 files changed, 22 insertions(+), 33 deletions(-)
>
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 06cd2963e41e..d0f0035ee117 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -1623,10 +1623,28 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
> }
>
> if (rinfo->head->is_target) {
> - /* Should be filled in by handle_reply */
> - BUG_ON(!req->r_target_inode);
> + in = xchg(&req->r_new_inode, NULL);
> + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
> + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
> +
> + /*
> + * If we ended up opening an existing inode, discard
> + * r_new_inode
> + */
> + if (req->r_op == CEPH_MDS_OP_CREATE &&
> + !req->r_reply_info.has_create_ino) {
> + /* This should never happen on an async create */
> + WARN_ON_ONCE(req->r_deleg_ino);
> + iput(in);
> + in = NULL;
> + }
> +
> + in = ceph_get_inode(mdsc->fsc->sb, tvino, in);
> + if (IS_ERR(in)) {
> + err = PTR_ERR(in);
> + goto done;
> + }
>
> - in = req->r_target_inode;
> err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
> NULL, session,
> (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
> @@ -1636,13 +1654,13 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
> if (err < 0) {
> pr_err_client(cl, "badness %p %llx.%llx\n", in,
> ceph_vinop(in));
> - req->r_target_inode = NULL;
> if (in->i_state & I_NEW)
> discard_new_inode(in);
> else
> iput(in);
> goto done;
> }
> + req->r_target_inode = in;
> if (in->i_state & I_NEW)
> unlock_new_inode(in);
> }
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 230e0c3f341f..8b70f2b96f46 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -3874,36 +3874,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
> session->s_con.peer_features);
> mutex_unlock(&mdsc->mutex);
>
> - /* Must find target inode outside of mutexes to avoid deadlocks */
> rinfo = &req->r_reply_info;
> - if ((err >= 0) && rinfo->head->is_target) {
> - struct inode *in = xchg(&req->r_new_inode, NULL);
> - struct ceph_vino tvino = {
> - .ino = le64_to_cpu(rinfo->targeti.in->ino),
> - .snap = le64_to_cpu(rinfo->targeti.in->snapid)
> - };
> -
> - /*
> - * If we ended up opening an existing inode, discard
> - * r_new_inode
> - */
> - if (req->r_op == CEPH_MDS_OP_CREATE &&
> - !req->r_reply_info.has_create_ino) {
> - /* This should never happen on an async create */
> - WARN_ON_ONCE(req->r_deleg_ino);
> - iput(in);
> - in = NULL;
> - }
> -
> - in = ceph_get_inode(mdsc->fsc->sb, tvino, in);
> - if (IS_ERR(in)) {
> - err = PTR_ERR(in);
> - mutex_lock(&session->s_mutex);
> - goto out_err;
> - }
> - req->r_target_inode = in;
> - }
> -
> mutex_lock(&session->s_mutex);
> if (err < 0) {
> pr_err_client(cl, "got corrupt reply mds%d(tid:%lld)\n",