The nfsd4_callback workqueue jobs exist to queue backchannel RPCs to
rpciod. Because they run in different workqueue contexts, the rpc_task
can run concurrently with the workqueue job itself, should it become
requeued. This is problematic as there is no locking when accessing the
fields in the nfsd4_callback.
Add a new unsigned long to nfsd4_callback and declare a new
NFSD4_CALLBACK_RUNNING flag to be set in it. When attempting to run a
workqueue job, do a test_and_set_bit() on that flag first, and don't
queue the workqueue job if it returns true. Clear NFSD4_CALLBACK_RUNNING
in nfsd41_destroy_cb().
This also gives us a more reliable mechanism for handling queueing
failures in codepaths where we have to take references under spinlocks.
We can now do the test_and_set_bit on NFSD4_CALLBACK_RUNNING first, and
only take references to the objects if that returns false.
Most of the nfsd4_run_cb() callers are converted to use this new flag or
the nfsd4_try_run_cb() wrapper. The main exception is the callback
channel probe, which has its own synchronization.
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
fs/nfsd/nfs4callback.c | 2 ++
fs/nfsd/nfs4layouts.c | 7 ++++---
fs/nfsd/nfs4proc.c | 2 +-
fs/nfsd/nfs4state.c | 14 ++++++++++----
fs/nfsd/state.h | 9 +++++++++
5 files changed, 26 insertions(+), 8 deletions(-)
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index ae4b7b6df47ff054db197bd2f6083905e4149bee..1f26c811e5f73c2e745ee68d0b6e668d1dd7c704 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1312,6 +1312,7 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
trace_nfsd_cb_destroy(clp, cb);
nfsd41_cb_release_slot(cb);
+ clear_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags);
if (cb->cb_ops && cb->cb_ops->release)
cb->cb_ops->release(cb);
nfsd41_cb_inflight_end(clp);
@@ -1632,6 +1633,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op];
cb->cb_msg.rpc_argp = cb;
cb->cb_msg.rpc_resp = cb;
+ cb->cb_flags = 0;
cb->cb_ops = ops;
INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
cb->cb_status = 0;
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index fbfddd3c4c943c34aa3991328eacb5759e311cc4..290271ac424540e4405a5fd0eacc8db9f47603cd 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -344,9 +344,10 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
trace_nfsd_layout_recall(&ls->ls_stid.sc_stateid);
- refcount_inc(&ls->ls_stid.sc_count);
- nfsd4_run_cb(&ls->ls_recall);
-
+ if (!test_and_set_bit(NFSD4_CALLBACK_RUNNING, &ls->ls_recall.cb_flags)) {
+ refcount_inc(&ls->ls_stid.sc_count);
+ nfsd4_run_cb(&ls->ls_recall);
+ }
out_unlock:
spin_unlock(&ls->ls_lock);
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index f6e06c779d09dacdcea81fb3b4135bf600f6cc63..b397246dae7b7e8c2a0ba436bb3813213cfe4fa2 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1847,7 +1847,7 @@ static void nfsd4_send_cb_offload(struct nfsd4_copy *copy)
NFSPROC4_CLNT_CB_OFFLOAD);
trace_nfsd_cb_offload(copy->cp_clp, &cbo->co_res.cb_stateid,
&cbo->co_fh, copy->cp_count, copy->nfserr);
- nfsd4_run_cb(&cbo->co_cb);
+ nfsd4_try_run_cb(&cbo->co_cb);
}
/**
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e806fd97cca972559d1929d37c1a24e760d9d6d8..fabcd979c40695ebcc795cfd2d8a035b7d589a37 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3232,8 +3232,10 @@ static void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf)
/* set to proper status when nfsd4_cb_getattr_done runs */
ncf->ncf_cb_status = NFS4ERR_IO;
- refcount_inc(&dp->dl_stid.sc_count);
- nfsd4_run_cb(&ncf->ncf_getattr);
+ if (!test_and_set_bit(NFSD4_CALLBACK_RUNNING, &ncf->ncf_getattr.cb_flags)) {
+ refcount_inc(&dp->dl_stid.sc_count);
+ nfsd4_run_cb(&ncf->ncf_getattr);
+ }
}
static struct nfs4_client *create_client(struct xdr_netobj name,
@@ -5422,6 +5424,10 @@ static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
{
bool queued;
+
+ if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &dp->dl_recall.cb_flags))
+ return;
+
/*
* We're assuming the state code never drops its reference
* without first removing the lease. Since we're in this lease
@@ -6910,7 +6916,7 @@ deleg_reaper(struct nfsd_net *nn)
clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG) |
BIT(RCA4_TYPE_MASK_WDATA_DLG);
trace_nfsd_cb_recall_any(clp->cl_ra);
- nfsd4_run_cb(&clp->cl_ra->ra_cb);
+ nfsd4_try_run_cb(&clp->cl_ra->ra_cb);
}
}
@@ -7839,7 +7845,7 @@ nfsd4_lm_notify(struct file_lock *fl)
if (queue) {
trace_nfsd_cb_notify_lock(lo, nbl);
- nfsd4_run_cb(&nbl->nbl_cb);
+ nfsd4_try_run_cb(&nbl->nbl_cb);
}
}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 74d2d7b42676d907bec9159b927aeed223d668c3..e6af969a03f26b1feb6768af5baa4887bf5fa5e9 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -67,6 +67,8 @@ typedef struct {
struct nfsd4_callback {
struct nfs4_client *cb_clp;
struct rpc_message cb_msg;
+#define NFSD4_CALLBACK_RUNNING BIT(0) // Callback is running
+ unsigned long cb_flags;
const struct nfsd4_callback_ops *cb_ops;
struct work_struct cb_work;
int cb_seq_status;
@@ -780,6 +782,13 @@ extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *
extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
extern bool nfsd4_run_cb(struct nfsd4_callback *cb);
+
+static inline void nfsd4_try_run_cb(struct nfsd4_callback *cb)
+{
+ if (!test_and_set_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags))
+ WARN_ON_ONCE(!nfsd4_run_cb(cb));
+}
+
extern void nfsd4_shutdown_callback(struct nfs4_client *);
extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
void nfsd4_async_copy_reaper(struct nfsd_net *nn);
--
2.48.1
在 2025/2/21 00:47, Jeff Layton 写道: > Most of the nfsd4_run_cb() callers are converted to use this new flag or > the nfsd4_try_run_cb() wrapper. The main exception is the callback > channel probe, which has its own synchronization. > Hi Jeff: We had a null-ptr-deref in nfsd4_probe_callback(): [24225.738349] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 ... [24225.803480] Call trace: [24225.804639] __queue_work+0xb4/0x558 [24225.805949] queue_work_on+0x88/0x90 [24225.807306] nfsd4_probe_callback+0x4c/0x58 [nfsd] [24225.808896] nfsd4_probe_callback_sync+0x20/0x38 [nfsd] [24225.808909] nfsd4_init_conn.isra.57+0x8c/0xa8 [nfsd] [24225.815204] nfsd4_create_session+0x5b8/0x718 [nfsd] [24225.817711] nfsd4_proc_compound+0x4c0/0x710 [nfsd] [24225.819329] nfsd_dispatch+0x104/0x248 [nfsd] [24225.820742] svc_process_common+0x348/0x808 [sunrpc] [24225.822294] svc_process+0xb0/0xc8 [sunrpc] [24225.823760] nfsd+0xf0/0x160 [nfsd] [24225.825006] kthread+0x134/0x138 [24225.826336] ret_from_fork+0x10/0x18 Is this patch or patchset can fix this issue? And I'm having trouble understanding the commit message "callback channel probe has its own synchronization", I'd appreciate it if you could explain in more detail. Thanks, ChenXiaoSong.
On Tue, 2025-06-10 at 16:49 +0800, ChenXiaoSong wrote: > 在 2025/2/21 00:47, Jeff Layton 写道: > > Most of the nfsd4_run_cb() callers are converted to use this new flag or > > the nfsd4_try_run_cb() wrapper. The main exception is the callback > > channel probe, which has its own synchronization. > > > > Hi Jeff: > > We had a null-ptr-deref in nfsd4_probe_callback(): > > [24225.738349] Unable to handle kernel NULL pointer dereference at > virtual address 0000000000000000 > ... > [24225.803480] Call trace: > [24225.804639] __queue_work+0xb4/0x558 > [24225.805949] queue_work_on+0x88/0x90 > [24225.807306] nfsd4_probe_callback+0x4c/0x58 [nfsd] > [24225.808896] nfsd4_probe_callback_sync+0x20/0x38 [nfsd] > [24225.808909] nfsd4_init_conn.isra.57+0x8c/0xa8 [nfsd] > [24225.815204] nfsd4_create_session+0x5b8/0x718 [nfsd] > [24225.817711] nfsd4_proc_compound+0x4c0/0x710 [nfsd] > [24225.819329] nfsd_dispatch+0x104/0x248 [nfsd] > [24225.820742] svc_process_common+0x348/0x808 [sunrpc] > [24225.822294] svc_process+0xb0/0xc8 [sunrpc] > [24225.823760] nfsd+0xf0/0x160 [nfsd] > [24225.825006] kthread+0x134/0x138 > [24225.826336] ret_from_fork+0x10/0x18 > > Is this patch or patchset can fix this issue? And I'm having trouble > understanding the commit message "callback channel probe has its own > synchronization", I'd appreciate it if you could explain in more detail. > Synchronization was probably too strong a word. I remember looking over this code and convincing myself that the probe callback wasn't subject to the same races as the others, but I think that was mostly because the outcome of those races was not harmful. Note that the probe itself can actually be run at the start of a completely unrelated callback to the same client. So you hit a NULL pointer in __queue_work()? The work_struct is embedded in the nfs4_client so that would probably imply that that the nfs4_client struct was corrupt? You may want to get a vmcore and analyze it if you can reproduce this. -- Jeff Layton <jlayton@kernel.org>
在 2025/6/10 19:09, Jeff Layton 写道: > > Synchronization was probably too strong a word. I remember looking over > this code and convincing myself that the probe callback wasn't subject > to the same races as the others, but I think that was mostly because > the outcome of those races was not harmful. Note that the probe itself > can actually be run at the start of a completely unrelated callback to > the same client. > > So you hit a NULL pointer in __queue_work()? The work_struct is > embedded in the nfs4_client so that would probably imply that that the > nfs4_client struct was corrupt? > > You may want to get a vmcore and analyze it if you can reproduce this. Thanks for your reply. I have already got a vmcore. Here is the link to the vmcore analysis: https://chenxiaosong.com/en/nfs/en-null-ptr-deref-in-nfsd4_probe_callback.html Please let me know if you need any more detailed information. Thanks, ChenXiaoSong.
Hi Jeff: Do you have any suggestions for this null-ptr-deref issue? Here is the link to the vmcore analysis: https://chenxiaosong.com/en/nfs/en-null-ptr-deref-in-nfsd4_probe_callback.html Thanks, ChenXiaoSong. 在 2025/6/11 15:12, ChenXiaoSong 写道: > 在 2025/6/10 19:09, Jeff Layton 写道: >> >> Synchronization was probably too strong a word. I remember looking over >> this code and convincing myself that the probe callback wasn't subject >> to the same races as the others, but I think that was mostly because >> the outcome of those races was not harmful. Note that the probe itself >> can actually be run at the start of a completely unrelated callback to >> the same client. >> >> So you hit a NULL pointer in __queue_work()? The work_struct is >> embedded in the nfs4_client so that would probably imply that that the >> nfs4_client struct was corrupt? >> >> You may want to get a vmcore and analyze it if you can reproduce this. > > Thanks for your reply. > > I have already got a vmcore. Here is the link to the vmcore analysis: > > https://chenxiaosong.com/en/nfs/en-null-ptr-deref-in- > nfsd4_probe_callback.html > > Please let me know if you need any more detailed information. > > Thanks, > ChenXiaoSong.
On Mon, 2025-06-16 at 15:49 +0800, ChenXiaoSong wrote: > Hi Jeff: > > Do you have any suggestions for this null-ptr-deref issue? > > Here is the link to the vmcore analysis: > > https://chenxiaosong.com/en/nfs/en-null-ptr-deref-in-nfsd4_probe_callback.html > > Thanks, > ChenXiaoSong. > Not right offhand. My first guess would be some sort of UAF. Maybe a nfs4_client refcounting issue? If this is reproducible then you may want to turn up KASAN which might give you more info. That said, 4.19.90 is more than 6 years old at this point. You may want to consider moving to something more recent. > 在 2025/6/11 15:12, ChenXiaoSong 写道: > > 在 2025/6/10 19:09, Jeff Layton 写道: > > > > > > Synchronization was probably too strong a word. I remember looking over > > > this code and convincing myself that the probe callback wasn't subject > > > to the same races as the others, but I think that was mostly because > > > the outcome of those races was not harmful. Note that the probe itself > > > can actually be run at the start of a completely unrelated callback to > > > the same client. > > > > > > So you hit a NULL pointer in __queue_work()? The work_struct is > > > embedded in the nfs4_client so that would probably imply that that the > > > nfs4_client struct was corrupt? > > > > > > You may want to get a vmcore and analyze it if you can reproduce this. > > > > Thanks for your reply. > > > > I have already got a vmcore. Here is the link to the vmcore analysis: > > > > https://chenxiaosong.com/en/nfs/en-null-ptr-deref-in- > > nfsd4_probe_callback.html > > > > Please let me know if you need any more detailed information. > > > > Thanks, > > ChenXiaoSong. > -- Jeff Layton <jlayton@kernel.org>
OK, I'll try to reproduce it. Thanks for your reply. 在 2025/6/16 20:09, Jeff Layton 写道: > Not right offhand. My first guess would be some sort of UAF. Maybe a > nfs4_client refcounting issue? If this is reproducible then you may > want to turn up KASAN which might give you more info. > > That said, 4.19.90 is more than 6 years old at this point. You may want > to consider moving to something more recent.
© 2016 - 2025 Red Hat, Inc.