There have been confirmed reports where a container with an NFS mount
inside it dies abruptly, along with all of its processes, but the NFS
client sticks around and keeps trying to send RPCs after the networking
is gone.
We have a reproducer where if we SIGKILL a container with an NFS mount,
the RPC clients will stick around indefinitely. The orchestrator
does a MNT_DETACH unmount on the NFS mount, and then tears down the
networking while there are still RPCs in flight.
Recently new controls were added[1] that allow shutting down an NFS
mount. That doesn't help here since the mount namespace is detached from
any tasks at this point.
Transplant shutdown_client() to the sunrpc module, and give it a more
distinct name. Add a new debugfs sunrpc/rpc_clnt/*/shutdown knob that
allows the same functionality as the one in /sys/fs/nfs, but at the
rpc_clnt level.
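As an example of how this would be used from userspace (assuming debugfs is
mounted at /sys/kernel/debug, and using client id 0 purely as an example):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* pick the rpc_clnt directory for the client you want to kill */
        int fd = open("/sys/kernel/debug/sunrpc/rpc_clnt/0/shutdown",
                      O_WRONLY);

        if (fd < 0 || write(fd, "1", 1) < 0) {
                perror("shutdown");
                return 1;
        }
        close(fd);
        return 0;
}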
[1]: commit d9615d166c7e ("NFS: add sysfs shutdown knob").
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
fs/nfs/sysfs.c | 19 ++++---------------
include/linux/sunrpc/sched.h | 1 +
net/sunrpc/clnt.c | 12 ++++++++++++
net/sunrpc/debugfs.c | 15 +++++++++++++++
4 files changed, 32 insertions(+), 15 deletions(-)
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index 7b59a40d40c061a41b0fbde91aa006314f02c1fb..c29c5fd639554461bdcd9ff612726194910d85b5 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -217,17 +217,6 @@ void nfs_netns_sysfs_destroy(struct nfs_net *netns)
}
}
-static bool shutdown_match_client(const struct rpc_task *task, const void *data)
-{
- return true;
-}
-
-static void shutdown_client(struct rpc_clnt *clnt)
-{
- clnt->cl_shutdown = 1;
- rpc_cancel_tasks(clnt, -EIO, shutdown_match_client, NULL);
-}
-
static ssize_t
shutdown_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
@@ -258,14 +247,14 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr,
goto out;
server->flags |= NFS_MOUNT_SHUTDOWN;
- shutdown_client(server->client);
- shutdown_client(server->nfs_client->cl_rpcclient);
+ rpc_clnt_shutdown(server->client);
+ rpc_clnt_shutdown(server->nfs_client->cl_rpcclient);
if (!IS_ERR(server->client_acl))
- shutdown_client(server->client_acl);
+ rpc_clnt_shutdown(server->client_acl);
if (server->nlm_host)
- shutdown_client(server->nlm_host->h_rpcclnt);
+ rpc_clnt_shutdown(server->nlm_host->h_rpcclnt);
out:
return count;
}
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index eac57914dcf3200c1a6ed39ab030e3fe8b4da3e1..fe7c39a17ce44ec68c0cf057133d0f8e7f0ae797 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -232,6 +232,7 @@ unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error,
bool (*fnmatch)(const struct rpc_task *,
const void *),
const void *data);
+void rpc_clnt_shutdown(struct rpc_clnt *clnt);
void rpc_execute(struct rpc_task *);
void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *);
void rpc_init_wait_queue(struct rpc_wait_queue *, const char *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2fe88ea79a70c134e58abfb03fca230883eddf1f..0028858b12d97e7b45f4c24cfbd761ba2a734b32 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -934,6 +934,18 @@ unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error,
}
EXPORT_SYMBOL_GPL(rpc_cancel_tasks);
+static bool shutdown_match_client(const struct rpc_task *task, const void *data)
+{
+ return true;
+}
+
+void rpc_clnt_shutdown(struct rpc_clnt *clnt)
+{
+ clnt->cl_shutdown = 1;
+ rpc_cancel_tasks(clnt, -EIO, shutdown_match_client, NULL);
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_shutdown);
+
static int rpc_clnt_disconnect_xprt(struct rpc_clnt *clnt,
struct rpc_xprt *xprt, void *dummy)
{
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index 32417db340de3775c533d0ad683b5b37800d7fe5..4df31dcca2d747db6767c12ddfa29963ed7be204 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -145,6 +145,17 @@ static int do_xprt_debugfs(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *n
return 0;
}
+static int
+clnt_shutdown(void *data, u64 value)
+{
+ struct rpc_clnt *clnt = data;
+
+ rpc_clnt_shutdown(clnt);
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(shutdown_fops, NULL, clnt_shutdown, "%llu\n");
+
void
rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
{
@@ -163,6 +174,10 @@ rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
debugfs_create_file("tasks", S_IFREG | 0400, clnt->cl_debugfs, clnt,
&tasks_fops);
+ /* make shutdown file */
+ debugfs_create_file("shutdown", S_IFREG | 0200, clnt->cl_debugfs, clnt,
+ &shutdown_fops);
+
rpc_clnt_iterate_for_each_xprt(clnt, do_xprt_debugfs, &xprtnum);
}
---
base-commit: 80e54e84911a923c40d7bee33a34c1b4be148d7a
change-id: 20250312-rpc-shutdown-ce9b6d3599da
Best regards,
--
Jeff Layton <jlayton@kernel.org>
On 12 Mar 2025, at 9:36, Jeff Layton wrote:
> There have been confirmed reports where a container with an NFS mount
> inside it dies abruptly, along with all of its processes, but the NFS
> client sticks around and keeps trying to send RPCs after the networking
> is gone.
>
> We have a reproducer where if we SIGKILL a container with an NFS mount,
> the RPC clients will stick around indefinitely. The orchestrator
> does a MNT_DETACH unmount on the NFS mount, and then tears down the
> networking while there are still RPCs in flight.
>
> Recently new controls were added[1] that allow shutting down an NFS
> mount. That doesn't help here since the mount namespace is detached from
> any tasks at this point.
That's interesting - seems like the orchestrator could just reorder its
request to shutdown before detaching the mount namespace. Not an objection,
just wondering why the MNT_DETACH must come first.
> Transplant shutdown_client() to the sunrpc module, and give it a more
> distinct name. Add a new debugfs sunrpc/rpc_clnt/*/shutdown knob that
> allows the same functionality as the one in /sys/fs/nfs, but at the
> rpc_clnt level.
>
> [1]: commit d9615d166c7e ("NFS: add sysfs shutdown knob").
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
I have a TODO to patch Documentation/ for this knob mostly to write warnings
because there are some potential "gotchas" here - for example you can have
shared RPC clients and shutting down one of those can cause problems for a
different mount (this is true today with the /sys/fs/nfs/[bdi]/shutdown
knob). Shutting down arbitrary clients will definitely break things in
weird ways; it's not a safe place to explore.
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Ben
> ---
> fs/nfs/sysfs.c | 19 ++++---------------
> include/linux/sunrpc/sched.h | 1 +
> net/sunrpc/clnt.c | 12 ++++++++++++
> net/sunrpc/debugfs.c | 15 +++++++++++++++
> 4 files changed, 32 insertions(+), 15 deletions(-)
>
> diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
> index 7b59a40d40c061a41b0fbde91aa006314f02c1fb..c29c5fd639554461bdcd9ff612726194910d85b5 100644
> --- a/fs/nfs/sysfs.c
> +++ b/fs/nfs/sysfs.c
> @@ -217,17 +217,6 @@ void nfs_netns_sysfs_destroy(struct nfs_net *netns)
> }
> }
>
> -static bool shutdown_match_client(const struct rpc_task *task, const void *data)
> -{
> - return true;
> -}
> -
> -static void shutdown_client(struct rpc_clnt *clnt)
> -{
> - clnt->cl_shutdown = 1;
> - rpc_cancel_tasks(clnt, -EIO, shutdown_match_client, NULL);
> -}
> -
> static ssize_t
> shutdown_show(struct kobject *kobj, struct kobj_attribute *attr,
> char *buf)
> @@ -258,14 +247,14 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr,
> goto out;
>
> server->flags |= NFS_MOUNT_SHUTDOWN;
> - shutdown_client(server->client);
> - shutdown_client(server->nfs_client->cl_rpcclient);
> + rpc_clnt_shutdown(server->client);
> + rpc_clnt_shutdown(server->nfs_client->cl_rpcclient);
>
> if (!IS_ERR(server->client_acl))
> - shutdown_client(server->client_acl);
> + rpc_clnt_shutdown(server->client_acl);
>
> if (server->nlm_host)
> - shutdown_client(server->nlm_host->h_rpcclnt);
> + rpc_clnt_shutdown(server->nlm_host->h_rpcclnt);
> out:
> return count;
> }
> diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
> index eac57914dcf3200c1a6ed39ab030e3fe8b4da3e1..fe7c39a17ce44ec68c0cf057133d0f8e7f0ae797 100644
> --- a/include/linux/sunrpc/sched.h
> +++ b/include/linux/sunrpc/sched.h
> @@ -232,6 +232,7 @@ unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error,
> bool (*fnmatch)(const struct rpc_task *,
> const void *),
> const void *data);
> +void rpc_clnt_shutdown(struct rpc_clnt *clnt);
> void rpc_execute(struct rpc_task *);
> void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *);
> void rpc_init_wait_queue(struct rpc_wait_queue *, const char *);
> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> index 2fe88ea79a70c134e58abfb03fca230883eddf1f..0028858b12d97e7b45f4c24cfbd761ba2a734b32 100644
> --- a/net/sunrpc/clnt.c
> +++ b/net/sunrpc/clnt.c
> @@ -934,6 +934,18 @@ unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error,
> }
> EXPORT_SYMBOL_GPL(rpc_cancel_tasks);
>
> +static bool shutdown_match_client(const struct rpc_task *task, const void *data)
> +{
> + return true;
> +}
> +
> +void rpc_clnt_shutdown(struct rpc_clnt *clnt)
> +{
> + clnt->cl_shutdown = 1;
> + rpc_cancel_tasks(clnt, -EIO, shutdown_match_client, NULL);
> +}
> +EXPORT_SYMBOL_GPL(rpc_clnt_shutdown);
> +
> static int rpc_clnt_disconnect_xprt(struct rpc_clnt *clnt,
> struct rpc_xprt *xprt, void *dummy)
> {
> diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
> index 32417db340de3775c533d0ad683b5b37800d7fe5..4df31dcca2d747db6767c12ddfa29963ed7be204 100644
> --- a/net/sunrpc/debugfs.c
> +++ b/net/sunrpc/debugfs.c
> @@ -145,6 +145,17 @@ static int do_xprt_debugfs(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *n
> return 0;
> }
>
> +static int
> +clnt_shutdown(void *data, u64 value)
> +{
> + struct rpc_clnt *clnt = data;
> +
> + rpc_clnt_shutdown(clnt);
> + return 0;
> +}
> +
> +DEFINE_DEBUGFS_ATTRIBUTE(shutdown_fops, NULL, clnt_shutdown, "%llu\n");
> +
> void
> rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
> {
> @@ -163,6 +174,10 @@ rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
> debugfs_create_file("tasks", S_IFREG | 0400, clnt->cl_debugfs, clnt,
> &tasks_fops);
>
> + /* make shutdown file */
> + debugfs_create_file("shutdown", S_IFREG | 0200, clnt->cl_debugfs, clnt,
> + &shutdown_fops);
> +
> rpc_clnt_iterate_for_each_xprt(clnt, do_xprt_debugfs, &xprtnum);
> }
>
>
> ---
> base-commit: 80e54e84911a923c40d7bee33a34c1b4be148d7a
> change-id: 20250312-rpc-shutdown-ce9b6d3599da
>
> Best regards,
> --
> Jeff Layton <jlayton@kernel.org>
On Wed, 2025-03-12 at 09:52 -0400, Benjamin Coddington wrote:
> On 12 Mar 2025, at 9:36, Jeff Layton wrote:
>
> > There have been confirmed reports where a container with an NFS mount
> > inside it dies abruptly, along with all of its processes, but the NFS
> > client sticks around and keeps trying to send RPCs after the networking
> > is gone.
> >
> > We have a reproducer where if we SIGKILL a container with an NFS mount,
> > the RPC clients will stick around indefinitely. The orchestrator
> > does a MNT_DETACH unmount on the NFS mount, and then tears down the
> > networking while there are still RPCs in flight.
> >
> > Recently new controls were added[1] that allow shutting down an NFS
> > mount. That doesn't help here since the mount namespace is detached from
> > any tasks at this point.
>
> That's interesting - seems like the orchestrator could just reorder its
> request to shutdown before detaching the mount namespace. Not an objection,
> just wondering why the MNT_DETACH must come first.
>
The reproducer we have is to systemd-nspawn a container, mount up an
NFS mount inside it, start some I/O on it with fio and then kill -9 the
systemd running inside the container. There isn't much the orchestrator
(root-level systemd) can do at that point other than clean up what's
left.
I'm still working on a way to reliably detect when this has happened.
For now, we just have to notice that some clients aren't dying.
> > Transplant shutdown_client() to the sunrpc module, and give it a more
> > distinct name. Add a new debugfs sunrpc/rpc_clnt/*/shutdown knob that
> > allows the same functionality as the one in /sys/fs/nfs, but at the
> > rpc_clnt level.
> >
> > [1]: commit d9615d166c7e ("NFS: add sysfs shutdown knob").
> >
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
>
> I have a TODO to patch Documentation/ for this knob mostly to write warnings
> because there are some potential "gotchas" here - for example you can have
> shared RPC clients and shutting down one of those can cause problems for a
> different mount (this is true today with the /sys/fs/nfs/[bdi]/shutdown
> knob). Shutting down aribitrary clients will definitely break things in
> weird ways, its not a safe place to explore.
>
Yes, you really do need to know what you're doing. 0200 permissions are
essential for this file, IOW. Thanks for the R-b!
> Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
>
> Ben
>
> > ---
> > fs/nfs/sysfs.c | 19 ++++---------------
> > include/linux/sunrpc/sched.h | 1 +
> > net/sunrpc/clnt.c | 12 ++++++++++++
> > net/sunrpc/debugfs.c | 15 +++++++++++++++
> > 4 files changed, 32 insertions(+), 15 deletions(-)
> >
> > diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
> > index 7b59a40d40c061a41b0fbde91aa006314f02c1fb..c29c5fd639554461bdcd9ff612726194910d85b5 100644
> > --- a/fs/nfs/sysfs.c
> > +++ b/fs/nfs/sysfs.c
> > @@ -217,17 +217,6 @@ void nfs_netns_sysfs_destroy(struct nfs_net *netns)
> > }
> > }
> >
> > -static bool shutdown_match_client(const struct rpc_task *task, const void *data)
> > -{
> > - return true;
> > -}
> > -
> > -static void shutdown_client(struct rpc_clnt *clnt)
> > -{
> > - clnt->cl_shutdown = 1;
> > - rpc_cancel_tasks(clnt, -EIO, shutdown_match_client, NULL);
> > -}
> > -
> > static ssize_t
> > shutdown_show(struct kobject *kobj, struct kobj_attribute *attr,
> > char *buf)
> > @@ -258,14 +247,14 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr,
> > goto out;
> >
> > server->flags |= NFS_MOUNT_SHUTDOWN;
> > - shutdown_client(server->client);
> > - shutdown_client(server->nfs_client->cl_rpcclient);
> > + rpc_clnt_shutdown(server->client);
> > + rpc_clnt_shutdown(server->nfs_client->cl_rpcclient);
> >
> > if (!IS_ERR(server->client_acl))
> > - shutdown_client(server->client_acl);
> > + rpc_clnt_shutdown(server->client_acl);
> >
> > if (server->nlm_host)
> > - shutdown_client(server->nlm_host->h_rpcclnt);
> > + rpc_clnt_shutdown(server->nlm_host->h_rpcclnt);
> > out:
> > return count;
> > }
> > diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
> > index eac57914dcf3200c1a6ed39ab030e3fe8b4da3e1..fe7c39a17ce44ec68c0cf057133d0f8e7f0ae797 100644
> > --- a/include/linux/sunrpc/sched.h
> > +++ b/include/linux/sunrpc/sched.h
> > @@ -232,6 +232,7 @@ unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error,
> > bool (*fnmatch)(const struct rpc_task *,
> > const void *),
> > const void *data);
> > +void rpc_clnt_shutdown(struct rpc_clnt *clnt);
> > void rpc_execute(struct rpc_task *);
> > void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *);
> > void rpc_init_wait_queue(struct rpc_wait_queue *, const char *);
> > diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> > index 2fe88ea79a70c134e58abfb03fca230883eddf1f..0028858b12d97e7b45f4c24cfbd761ba2a734b32 100644
> > --- a/net/sunrpc/clnt.c
> > +++ b/net/sunrpc/clnt.c
> > @@ -934,6 +934,18 @@ unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error,
> > }
> > EXPORT_SYMBOL_GPL(rpc_cancel_tasks);
> >
> > +static bool shutdown_match_client(const struct rpc_task *task, const void *data)
> > +{
> > + return true;
> > +}
> > +
> > +void rpc_clnt_shutdown(struct rpc_clnt *clnt)
> > +{
> > + clnt->cl_shutdown = 1;
> > + rpc_cancel_tasks(clnt, -EIO, shutdown_match_client, NULL);
> > +}
> > +EXPORT_SYMBOL_GPL(rpc_clnt_shutdown);
> > +
> > static int rpc_clnt_disconnect_xprt(struct rpc_clnt *clnt,
> > struct rpc_xprt *xprt, void *dummy)
> > {
> > diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
> > index 32417db340de3775c533d0ad683b5b37800d7fe5..4df31dcca2d747db6767c12ddfa29963ed7be204 100644
> > --- a/net/sunrpc/debugfs.c
> > +++ b/net/sunrpc/debugfs.c
> > @@ -145,6 +145,17 @@ static int do_xprt_debugfs(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *n
> > return 0;
> > }
> >
> > +static int
> > +clnt_shutdown(void *data, u64 value)
> > +{
> > + struct rpc_clnt *clnt = data;
> > +
> > + rpc_clnt_shutdown(clnt);
> > + return 0;
> > +}
> > +
> > +DEFINE_DEBUGFS_ATTRIBUTE(shutdown_fops, NULL, clnt_shutdown, "%llu\n");
> > +
> > void
> > rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
> > {
> > @@ -163,6 +174,10 @@ rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
> > debugfs_create_file("tasks", S_IFREG | 0400, clnt->cl_debugfs, clnt,
> > &tasks_fops);
> >
> > + /* make shutdown file */
> > + debugfs_create_file("shutdown", S_IFREG | 0200, clnt->cl_debugfs, clnt,
> > + &shutdown_fops);
> > +
> > rpc_clnt_iterate_for_each_xprt(clnt, do_xprt_debugfs, &xprtnum);
> > }
> >
> >
> > ---
> > base-commit: 80e54e84911a923c40d7bee33a34c1b4be148d7a
> > change-id: 20250312-rpc-shutdown-ce9b6d3599da
> >
> > Best regards,
> > --
> > Jeff Layton <jlayton@kernel.org>
>
--
Jeff Layton <jlayton@kernel.org>
On Wed, 2025-03-12 at 10:37 -0400, Jeff Layton wrote:
> On Wed, 2025-03-12 at 09:52 -0400, Benjamin Coddington wrote:
> > On 12 Mar 2025, at 9:36, Jeff Layton wrote:
> >
> > > There have been confirmed reports where a container with an NFS
> > > mount
> > > inside it dies abruptly, along with all of its processes, but the
> > > NFS
> > > client sticks around and keeps trying to send RPCs after the
> > > networking
> > > is gone.
> > >
> > > We have a reproducer where if we SIGKILL a container with an NFS
> > > mount,
> > > the RPC clients will stick around indefinitely. The orchestrator
> > > does a MNT_DETACH unmount on the NFS mount, and then tears down
> > > the
> > > networking while there are still RPCs in flight.
> > >
> > > Recently new controls were added[1] that allow shutting down an
> > > NFS
> > > mount. That doesn't help here since the mount namespace is
> > > detached from
> > > any tasks at this point.
> >
> > That's interesting - seems like the orchestrator could just reorder
> > its
> > request to shutdown before detaching the mount namespace. Not an
> > objection,
> > just wondering why the MNT_DETACH must come first.
> >
>
> The reproducer we have is to systemd-nspawn a container, mount up an
> NFS mount inside it, start some I/O on it with fio and then kill -9
> the
> systemd running inside the container. There isn't much the
> orchestrator
> (root-level systemd) can do to at that point other than clean up
> what's
> left.
>
> I'm still working on a way to reliably detect when this has happened.
> For now, we just have to notice that some clients aren't dying.
>
> > > Transplant shutdown_client() to the sunrpc module, and give it a
> > > more
> > > distinct name. Add a new debugfs sunrpc/rpc_clnt/*/shutdown knob
> > > that
> > > allows the same functionality as the one in /sys/fs/nfs, but at
> > > the
> > > rpc_clnt level.
> > >
> > > [1]: commit d9615d166c7e ("NFS: add sysfs shutdown knob").
> > >
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> >
> > I have a TODO to patch Documentation/ for this knob mostly to write
> > warnings
> > because there are some potential "gotchas" here - for example you
> > can have
> > shared RPC clients and shutting down one of those can cause
> > problems for a
> > different mount (this is true today with the
> > /sys/fs/nfs/[bdi]/shutdown
> > knob). Shutting down aribitrary clients will definitely break
> > things in
> > weird ways, its not a safe place to explore.
> >
>
> Yes, you really do need to know what you're doing. 0200 permissions
> are
> essential for this file, IOW. Thanks for the R-b!
Sorry, but NACK! We should not be adding control mechanisms to debugfs.
One thing that might work in situations like this is perhaps to make
use of the fact that we are monitoring whether or not rpc_pipefs is
mounted. So if the mount is containerised, and the orchestrator
unmounts everything, including rpc_pipefs, we might take that as a hint
that we should treat any future connection errors as being fatal.
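Roughly something like this (completely untested, and
rpc_clnt_mark_errors_fatal() is a made-up helper for whatever mechanism we'd
use to fail subsequent connection attempts for the clients in that netns):

static int sunrpc_pipefs_event(struct notifier_block *nb,
                               unsigned long event, void *ptr)
{
        struct super_block *sb = ptr;           /* rpc_pipefs superblock */
        struct net *net = sb->s_fs_info;        /* netns it was mounted in */

        if (event == RPC_PIPEFS_UMOUNT)
                rpc_clnt_mark_errors_fatal(net);        /* made-up helper */
        return NOTIFY_OK;
}

static struct notifier_block sunrpc_pipefs_nb = {
        .notifier_call = sunrpc_pipefs_event,
};

/* registered once with rpc_pipefs_notifier_register(&sunrpc_pipefs_nb) */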
Otherwise, we'd have to be able to monitor the root task, and check if
it is still alive in order to figure out if our containerised world has
collapsed.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com
On Wed, 2025-03-12 at 22:31 +0000, Trond Myklebust wrote:
> On Wed, 2025-03-12 at 10:37 -0400, Jeff Layton wrote:
> > On Wed, 2025-03-12 at 09:52 -0400, Benjamin Coddington wrote:
> > > On 12 Mar 2025, at 9:36, Jeff Layton wrote:
> > >
> > > > There have been confirmed reports where a container with an NFS
> > > > mount
> > > > inside it dies abruptly, along with all of its processes, but the
> > > > NFS
> > > > client sticks around and keeps trying to send RPCs after the
> > > > networking
> > > > is gone.
> > > >
> > > > We have a reproducer where if we SIGKILL a container with an NFS
> > > > mount,
> > > > the RPC clients will stick around indefinitely. The orchestrator
> > > > does a MNT_DETACH unmount on the NFS mount, and then tears down
> > > > the
> > > > networking while there are still RPCs in flight.
> > > >
> > > > Recently new controls were added[1] that allow shutting down an
> > > > NFS
> > > > mount. That doesn't help here since the mount namespace is
> > > > detached from
> > > > any tasks at this point.
> > >
> > > That's interesting - seems like the orchestrator could just reorder
> > > its
> > > request to shutdown before detaching the mount namespace. Not an
> > > objection,
> > > just wondering why the MNT_DETACH must come first.
> > >
> >
> > The reproducer we have is to systemd-nspawn a container, mount up an
> > NFS mount inside it, start some I/O on it with fio and then kill -9
> > the
> > systemd running inside the container. There isn't much the
> > orchestrator
> > (root-level systemd) can do to at that point other than clean up
> > what's
> > left.
> >
> > I'm still working on a way to reliably detect when this has happened.
> > For now, we just have to notice that some clients aren't dying.
> >
> > > > Transplant shutdown_client() to the sunrpc module, and give it a
> > > > more
> > > > distinct name. Add a new debugfs sunrpc/rpc_clnt/*/shutdown knob
> > > > that
> > > > allows the same functionality as the one in /sys/fs/nfs, but at
> > > > the
> > > > rpc_clnt level.
> > > >
> > > > [1]: commit d9615d166c7e ("NFS: add sysfs shutdown knob").
> > > >
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > >
> > > I have a TODO to patch Documentation/ for this knob mostly to write
> > > warnings
> > > because there are some potential "gotchas" here - for example you
> > > can have
> > > shared RPC clients and shutting down one of those can cause
> > > problems for a
> > > different mount (this is true today with the
> > > /sys/fs/nfs/[bdi]/shutdown
> > > knob). Shutting down aribitrary clients will definitely break
> > > things in
> > > weird ways, its not a safe place to explore.
> > >
> >
> > Yes, you really do need to know what you're doing. 0200 permissions
> > are
> > essential for this file, IOW. Thanks for the R-b!
>
> Sorry, but NACK! We should not be adding control mechanisms to debugfs.
>
Ok. Would adding sunrpc controls under sysfs be more acceptable? I do
agree that this is a potential footgun, however. It would be nicer to
clean this situation up automagically.
> One thing that might work in situations like this is perhaps to make
> use of the fact that we are monitoring whether or not rpc_pipefs is
> mounted. So if the mount is containerised, and the orchestrator
> unmounts everything, including rpc_pipefs, we might take that as a hint
> that we should treat any future connection errors as being fatal.
>
rpc_pipefs isn't being mounted at all in the container I'm using. I
think that's not going to be a reliable test for this.
> Otherwise, we'd have to be able to monitor the root task, and check if
> it is still alive in order to figure out if out containerised world has
> collapsed.
>
If by the root task, you mean the initial task in the container, then
that method seems a little sketchy too. How would we determine that
from the RPC layer?
To be clear: the situation here is that we have a container with a veth
device that is communicating with the outside world. Once all of the
processes in the container exit, the veth device in the container
disappears. The rpc_xprt holds a ref on the netns though, so that
sticks around trying to retransmit indefinitely.
I think what we really need is a lightweight reference on the netns.
Something where we can tell that there are no userland tasks that care
about it anymore, so we can be more aggressive about giving up on it.
There is a "passive" refcount inside struct net, but that's not quite
what we need as it won't keep the sunrpc_net in place.
What if instead of holding a netns reference in the xprt, we have it
hold a reference on a new refcount_t that lives in sunrpc_net? Then, we
add a pre_exit pernet_ops callback that does a shutdown_client() on all
of the rpc_clnt's attached to the xprts in that netns. The pre_exit can
then just block until the sunrpc_net refcount goes to 0.
I think that would allow everything to be cleaned up properly?
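Something like this is what I have in mind (completely untested;
xprt_net_users and xprt_net_done would be new fields in struct sunrpc_net,
with the final xprt put doing the complete()):

static void sunrpc_pernet_pre_exit(struct net *net)
{
        struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
        struct rpc_clnt *clnt;

        /* make all pending and future RPCs in this netns fail with -EIO */
        spin_lock(&sn->rpc_client_lock);
        list_for_each_entry(clnt, &sn->all_clients, cl_clients)
                rpc_clnt_shutdown(clnt);
        spin_unlock(&sn->rpc_client_lock);

        /* drop the initial ref taken at net init, wait for the last xprt */
        if (!refcount_dec_and_test(&sn->xprt_net_users))
                wait_for_completion(&sn->xprt_net_done);
}

static struct pernet_operations sunrpc_net_ops = {
        .init           = sunrpc_init_net,      /* existing */
        .exit           = sunrpc_exit_net,      /* existing */
        .pre_exit       = sunrpc_pernet_pre_exit,
        .id             = &sunrpc_net_id,
        .size           = sizeof(struct sunrpc_net),
};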
--
Jeff Layton <jlayton@kernel.org>
On 13 Mar 2025, at 9:15, Jeff Layton wrote:
> On Wed, 2025-03-12 at 22:31 +0000, Trond Myklebust wrote:
>> On Wed, 2025-03-12 at 10:37 -0400, Jeff Layton wrote:
>>> On Wed, 2025-03-12 at 09:52 -0400, Benjamin Coddington wrote:
>>>> On 12 Mar 2025, at 9:36, Jeff Layton wrote:
>>>>
>>>>> There have been confirmed reports where a container with an NFS
>>>>> mount
>>>>> inside it dies abruptly, along with all of its processes, but the
>>>>> NFS
>>>>> client sticks around and keeps trying to send RPCs after the
>>>>> networking
>>>>> is gone.
>>>>>
>>>>> We have a reproducer where if we SIGKILL a container with an NFS
>>>>> mount,
>>>>> the RPC clients will stick around indefinitely. The orchestrator
>>>>> does a MNT_DETACH unmount on the NFS mount, and then tears down
>>>>> the
>>>>> networking while there are still RPCs in flight.
>>>>>
>>>>> Recently new controls were added[1] that allow shutting down an
>>>>> NFS
>>>>> mount. That doesn't help here since the mount namespace is
>>>>> detached from
>>>>> any tasks at this point.
>>>>
>>>> That's interesting - seems like the orchestrator could just reorder
>>>> its
>>>> request to shutdown before detaching the mount namespace. Not an
>>>> objection,
>>>> just wondering why the MNT_DETACH must come first.
>>>>
>>>
>>> The reproducer we have is to systemd-nspawn a container, mount up an
>>> NFS mount inside it, start some I/O on it with fio and then kill -9
>>> the
>>> systemd running inside the container. There isn't much the
>>> orchestrator
>>> (root-level systemd) can do to at that point other than clean up
>>> what's
>>> left.
>>>
>>> I'm still working on a way to reliably detect when this has happened.
>>> For now, we just have to notice that some clients aren't dying.
>>>
>>>>> Transplant shutdown_client() to the sunrpc module, and give it a
>>>>> more
>>>>> distinct name. Add a new debugfs sunrpc/rpc_clnt/*/shutdown knob
>>>>> that
>>>>> allows the same functionality as the one in /sys/fs/nfs, but at
>>>>> the
>>>>> rpc_clnt level.
>>>>>
>>>>> [1]: commit d9615d166c7e ("NFS: add sysfs shutdown knob").
>>>>>
>>>>> Signed-off-by: Jeff Layton <jlayton@kernel.org>
>>>>
>>>> I have a TODO to patch Documentation/ for this knob mostly to write
>>>> warnings
>>>> because there are some potential "gotchas" here - for example you
>>>> can have
>>>> shared RPC clients and shutting down one of those can cause
>>>> problems for a
>>>> different mount (this is true today with the
>>>> /sys/fs/nfs/[bdi]/shutdown
>>>> knob). Shutting down aribitrary clients will definitely break
>>>> things in
>>>> weird ways, its not a safe place to explore.
>>>>
>>>
>>> Yes, you really do need to know what you're doing. 0200 permissions
>>> are
>>> essential for this file, IOW. Thanks for the R-b!
>>
>> Sorry, but NACK! We should not be adding control mechanisms to debugfs.
>>
>
> Ok. Would adding sunrpc controls under sysfs be more acceptable? I do
> agree that this is a potential footgun, however. It would be nicer to
> clean this situation up automagically.
>
>> One thing that might work in situations like this is perhaps to make
>> use of the fact that we are monitoring whether or not rpc_pipefs is
>> mounted. So if the mount is containerised, and the orchestrator
>> unmounts everything, including rpc_pipefs, we might take that as a hint
>> that we should treat any future connection errors as being fatal.
>>
>
> rpc_pipefs isn't being mounted at all in the container I'm using. I
> think that's not going to be a reliable test for this.
>
>> Otherwise, we'd have to be able to monitor the root task, and check if
>> it is still alive in order to figure out if out containerised world has
>> collapsed.
>>
>
> If by the root task, you mean the initial task in the container, then
> that method seems a little sketchy too. How would we determine that
> from the RPC layer?
>
> To be clear: the situation here is that we have a container with a veth
> device that is communicating with the outside world. Once all of the
> processes in the container exit, the veth device in the container
> disappears. The rpc_xprt holds a ref on the netns though, so that
> sticks around trying to retransmit indefinitely.
>
> I think what we really need is a lightweight reference on the netns.
> Something where we can tell that there are no userland tasks that care
> about it anymore, so we can be more aggressive about giving up on it.
>
> There is a "passive" refcount inside struct net, but that's not quite
> what we need as it won't keep the sunrpc_net in place.
>
> What if instead of holding a netns reference in the xprt, we have it
> hold a reference on a new refcount_t that lives in sunrpc_net? Then, we
> add a pre_exit pernet_ops callback that does a shutdown_client() on all
> of the rpc_clnt's attached to the xprts in that netns. The pre_exit can
> then just block until the sunrpc_net refcount goes to 0.
>
> I think that would allow everything to be cleaned up properly?
Do you think that might create unwanted behaviors for a netns that might
still be repairable? Maybe that doesn't make a lot of sense if there are no
processes in it, but I imagine a network namespace could be in this state
and we'd still want to try to use it.
I think there's an out-of-kernel (haven't tried yet) way to do it with udev,
which, if used, creates an explicit requirement for the orchestrator to
define exactly what should happen if the veth goes away. When creating the
namespace, the orchestrator should insert a rule that says "when this veth
disappears, we shutdown this fs".
Again, I'm not sure if that's even possible, but I'm willing to muck around
a bit and give it a try.
Ben
On Thu, 2025-03-13 at 09:35 -0400, Benjamin Coddington wrote:
> On 13 Mar 2025, at 9:15, Jeff Layton wrote:
>
> > On Wed, 2025-03-12 at 22:31 +0000, Trond Myklebust wrote:
> > > On Wed, 2025-03-12 at 10:37 -0400, Jeff Layton wrote:
> > > > On Wed, 2025-03-12 at 09:52 -0400, Benjamin Coddington wrote:
> > > > > On 12 Mar 2025, at 9:36, Jeff Layton wrote:
> > > > >
> > > > > > There have been confirmed reports where a container with an NFS
> > > > > > mount
> > > > > > inside it dies abruptly, along with all of its processes, but the
> > > > > > NFS
> > > > > > client sticks around and keeps trying to send RPCs after the
> > > > > > networking
> > > > > > is gone.
> > > > > >
> > > > > > We have a reproducer where if we SIGKILL a container with an NFS
> > > > > > mount,
> > > > > > the RPC clients will stick around indefinitely. The orchestrator
> > > > > > does a MNT_DETACH unmount on the NFS mount, and then tears down
> > > > > > the
> > > > > > networking while there are still RPCs in flight.
> > > > > >
> > > > > > Recently new controls were added[1] that allow shutting down an
> > > > > > NFS
> > > > > > mount. That doesn't help here since the mount namespace is
> > > > > > detached from
> > > > > > any tasks at this point.
> > > > >
> > > > > That's interesting - seems like the orchestrator could just reorder
> > > > > its
> > > > > request to shutdown before detaching the mount namespace. Not an
> > > > > objection,
> > > > > just wondering why the MNT_DETACH must come first.
> > > > >
> > > >
> > > > The reproducer we have is to systemd-nspawn a container, mount up an
> > > > NFS mount inside it, start some I/O on it with fio and then kill -9
> > > > the
> > > > systemd running inside the container. There isn't much the
> > > > orchestrator
> > > > (root-level systemd) can do to at that point other than clean up
> > > > what's
> > > > left.
> > > >
> > > > I'm still working on a way to reliably detect when this has happened.
> > > > For now, we just have to notice that some clients aren't dying.
> > > >
> > > > > > Transplant shutdown_client() to the sunrpc module, and give it a
> > > > > > more
> > > > > > distinct name. Add a new debugfs sunrpc/rpc_clnt/*/shutdown knob
> > > > > > that
> > > > > > allows the same functionality as the one in /sys/fs/nfs, but at
> > > > > > the
> > > > > > rpc_clnt level.
> > > > > >
> > > > > > [1]: commit d9615d166c7e ("NFS: add sysfs shutdown knob").
> > > > > >
> > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > >
> > > > > I have a TODO to patch Documentation/ for this knob mostly to write
> > > > > warnings
> > > > > because there are some potential "gotchas" here - for example you
> > > > > can have
> > > > > shared RPC clients and shutting down one of those can cause
> > > > > problems for a
> > > > > different mount (this is true today with the
> > > > > /sys/fs/nfs/[bdi]/shutdown
> > > > > knob). Shutting down aribitrary clients will definitely break
> > > > > things in
> > > > > weird ways, its not a safe place to explore.
> > > > >
> > > >
> > > > Yes, you really do need to know what you're doing. 0200 permissions
> > > > are
> > > > essential for this file, IOW. Thanks for the R-b!
> > >
> > > Sorry, but NACK! We should not be adding control mechanisms to debugfs.
> > >
> >
> > Ok. Would adding sunrpc controls under sysfs be more acceptable? I do
> > agree that this is a potential footgun, however. It would be nicer to
> > clean this situation up automagically.
> >
> > > One thing that might work in situations like this is perhaps to make
> > > use of the fact that we are monitoring whether or not rpc_pipefs is
> > > mounted. So if the mount is containerised, and the orchestrator
> > > unmounts everything, including rpc_pipefs, we might take that as a hint
> > > that we should treat any future connection errors as being fatal.
> > >
> >
> > rpc_pipefs isn't being mounted at all in the container I'm using. I
> > think that's not going to be a reliable test for this.
> >
> > > Otherwise, we'd have to be able to monitor the root task, and check if
> > > it is still alive in order to figure out if out containerised world has
> > > collapsed.
> > >
> >
> > If by the root task, you mean the initial task in the container, then
> > that method seems a little sketchy too. How would we determine that
> > from the RPC layer?
> >
> > To be clear: the situation here is that we have a container with a veth
> > device that is communicating with the outside world. Once all of the
> > processes in the container exit, the veth device in the container
> > disappears. The rpc_xprt holds a ref on the netns though, so that
> > sticks around trying to retransmit indefinitely.
> >
> > I think what we really need is a lightweight reference on the netns.
> > Something where we can tell that there are no userland tasks that care
> > about it anymore, so we can be more aggressive about giving up on it.
> >
> > There is a "passive" refcount inside struct net, but that's not quite
> > what we need as it won't keep the sunrpc_net in place.
> >
> > What if instead of holding a netns reference in the xprt, we have it
> > hold a reference on a new refcount_t that lives in sunrpc_net? Then, we
> > add a pre_exit pernet_ops callback that does a shutdown_client() on all
> > of the rpc_clnt's attached to the xprts in that netns. The pre_exit can
> > then just block until the sunrpc_net refcount goes to 0.
> >
> > I think that would allow everything to be cleaned up properly?
>
> Do you think that might create unwanted behaviors for a netns that might
> still be repairable? Maybe that doesn't make a lot of sense if there are no
> processes in it, but I imagine a network namespace could be in this state
> and we'd still want to try to use it.
>
I don't think so. Once there are no userland tasks holding a reference
to a namespace, there is no way to reach it from outside the kernel,
AFAICT, so there is no way to repair it.
It would actually be nice if we had a way to say "open net namespace
with this inode number". I guess we could add filehandle and
open_by_handle_at() support to nsfs...
> I think there's an out-of-kernel (haven't tried yet) way to do it with udev,
> which, if used, creates an explicit requirement for the orchestrator to
> define exactly what should happen if the veth goes away. When creating the
> namespace, the orchestrator should insert a rule that says "when this veth
> disappears, we shutdown this fs".
>
> Again, I'm not sure if that's even possible, but I'm willing to muck around
> a bit and give it a try.
>
I'd really prefer to do something that "just works" with existing
userland applications, but if we have to do something like that, then
so be it.
--
Jeff Layton <jlayton@kernel.org>