fs/fuse/file.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+)
From: "Eric W. Biederman" <ebiederm@xmission.com>
In my very light testing this resolves a hang where a thread of the
fuse server was accessing the fuse filesystem (the one the fuse server
itself is serving) when the fuse server is killed.
The practical problem is that the fuse server file descriptor was
being closed after the file descriptor into the fuse filesystem, so
the fuse filesystem operations were blocking instead of being
aborted. Simply skipping the unnecessary wait resolves this
issue.
This is just a proof of concept and someone should look to see if the
fuse max_background limit could cause a problem with this approach.
Additionally testing PF_EXITING is a very crude way to tell if someone
wants the return code from the vfs flush operation. As such in the
long run it probably makes sense to get some direct vfs support for
knowing if flush needs to block until all of the flushing is complete
and a status/return code can be returned.
Unless I have missed something this is a generic optimization that can
apply to many network filesystems.
Al, vfs folks? (igrab/iput sorted so as not to be distractions).
Perhaps a .flush_async method without a return code and a
filp_close_async function without a return code to take advantage of
this in the general sense.
Waiting potentially indefinitely for user space in do_exit seems like a
bad idea. Especially when all that the wait is for is to get a return
code that will never be examined.
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
[tycho: small fixups for releasing fuse file + nocred flag]
Signed-off-by: Tycho Andersen <tycho@tycho.pizza>
Reported-by: Tycho Andersen <tycho@tycho.pizza>
Tested-by: "Serge E. Hallyn" <serge@hallyn.com>
---
fs/fuse/file.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 64 insertions(+)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 05caa2b9272e..da45fb2dd740 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -464,6 +464,67 @@ static void fuse_sync_writes(struct inode *inode)
fuse_release_nowrite(inode);
}
+struct fuse_flush_args {
+ struct fuse_args args;
+ struct fuse_flush_in inarg;
+ struct inode *inode;
+ struct fuse_file *ff;
+};
+
+static void fuse_flush_end(struct fuse_mount *fm, struct fuse_args *args, int err)
+{
+ struct fuse_flush_args *fa = container_of(args, typeof(*fa), args);
+
+ if (err == -ENOSYS) {
+ fm->fc->no_flush = 1;
+ err = 0;
+ }
+
+ /*
+ * In memory i_blocks is not maintained by fuse, if writeback cache is
+ * enabled, i_blocks from cached attr may not be accurate.
+ */
+ if (!err && fm->fc->writeback_cache)
+ fuse_invalidate_attr_mask(fa->inode, STATX_BLOCKS);
+
+
+ iput(fa->inode);
+ fuse_file_put(fa->ff, false, false);
+ kfree(fa);
+}
+
+static int fuse_flush_async(struct file *file, fl_owner_t id)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_file *ff = file->private_data;
+ struct fuse_flush_args *fa;
+ int err;
+
+ fa = kzalloc(sizeof(*fa), GFP_KERNEL);
+ if (!fa)
+ return -ENOMEM;
+
+ fa->inarg.fh = ff->fh;
+ fa->inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
+ fa->args.opcode = FUSE_FLUSH;
+ fa->args.nodeid = get_node_id(inode);
+ fa->args.in_numargs = 1;
+ fa->args.in_args[0].size = sizeof(fa->inarg);
+ fa->args.in_args[0].value = &fa->inarg;
+ fa->args.force = true;
+ fa->args.nocreds = true;
+ fa->args.end = fuse_flush_end;
+ fa->inode = igrab(inode);
+ fa->ff = fuse_file_get(ff);
+
+ err = fuse_simple_background(fm, &fa->args, GFP_KERNEL);
+ if (err)
+ fuse_flush_end(fm, &fa->args, err);
+
+ return err;
+}
+
static int fuse_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file_inode(file);
@@ -495,6 +556,9 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (fm->fc->no_flush)
goto inval_attr_out;
+ if (current->flags & PF_EXITING)
+ return fuse_flush_async(file, id);
+
memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
base-commit: 3d7cb6b04c3f3115719235cc6866b10326de34cd
--
2.34.1
On Thu, 1 Sept 2022 at 16:07, Tycho Andersen <tycho@tycho.pizza> wrote:
>
> From: "Eric W. Biederman" <ebiederm@xmission.com>
>
> In my very light testing this resolves a hang where a thread of the
> fuse server was accessing the fuse filesystem (the fuse server is
> serving up), when the fuse server is killed.
>
> The practical problem is that the fuse server file descriptor was
> being closed after the file descriptor into the fuse filesystem so
> that the fuse filesystem operations were being blocked for instead of
> being aborted. Simply skipping the unnecessary wait resolves this
> issue.
>
> This is just a proof of concept and someone should look to see if the
> fuse max_background limit could cause a problem with this approach.
Maybe you missed my comments here:
https://lore.kernel.org/all/CAJfpegsTmiO-sKaBLgoVT4WxDXBkRES=HF1YmQN1ES7gfJEJ+w@mail.gmail.com/
I'm generally okay with this, but please write a proper changelog for
the patch, also mentioning the issues related to posix locks.
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -464,6 +464,67 @@ static void fuse_sync_writes(struct inode *inode)
> fuse_release_nowrite(inode);
> }
>
> +struct fuse_flush_args {
> + struct fuse_args args;
> + struct fuse_flush_in inarg;
> + struct inode *inode;
> + struct fuse_file *ff;
> +};
> +
> +static void fuse_flush_end(struct fuse_mount *fm, struct fuse_args *args, int err)
> +{
> + struct fuse_flush_args *fa = container_of(args, typeof(*fa), args);
> +
> + if (err == -ENOSYS) {
> + fm->fc->no_flush = 1;
> + err = 0;
> + }
> +
> + /*
> + * In memory i_blocks is not maintained by fuse, if writeback cache is
> + * enabled, i_blocks from cached attr may not be accurate.
> + */
> + if (!err && fm->fc->writeback_cache)
> + fuse_invalidate_attr_mask(fa->inode, STATX_BLOCKS);
> +
> +
> + iput(fa->inode);
> + fuse_file_put(fa->ff, false, false);
> + kfree(fa);
> +}
> +
> +static int fuse_flush_async(struct file *file, fl_owner_t id)
> +{
> + struct inode *inode = file_inode(file);
> + struct fuse_mount *fm = get_fuse_mount(inode);
> + struct fuse_file *ff = file->private_data;
> + struct fuse_flush_args *fa;
> + int err;
> +
> + fa = kzalloc(sizeof(*fa), GFP_KERNEL);
> + if (!fa)
> + return -ENOMEM;
> +
> + fa->inarg.fh = ff->fh;
> + fa->inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
> + fa->args.opcode = FUSE_FLUSH;
> + fa->args.nodeid = get_node_id(inode);
> + fa->args.in_numargs = 1;
> + fa->args.in_args[0].size = sizeof(fa->inarg);
> + fa->args.in_args[0].value = &fa->inarg;
> + fa->args.force = true;
> + fa->args.nocreds = true;
> + fa->args.end = fuse_flush_end;
> + fa->inode = igrab(inode);
Grabbing the inode should already be taken care of by fuse_file_release().
Also please try to reduce duplication in both the above functions.
Thanks,
Miklos
On Tue, Sep 27, 2022 at 11:46:44AM +0200, Miklos Szeredi wrote:
> On Thu, 1 Sept 2022 at 16:07, Tycho Andersen <tycho@tycho.pizza> wrote:
> >
> > From: "Eric W. Biederman" <ebiederm@xmission.com>
> >
> > In my very light testing this resolves a hang where a thread of the
> > fuse server was accessing the fuse filesystem (the fuse server is
> > serving up), when the fuse server is killed.
> >
> > The practical problem is that the fuse server file descriptor was
> > being closed after the file descriptor into the fuse filesystem so
> > that the fuse filesystem operations were being blocked for instead of
> > being aborted. Simply skipping the unnecessary wait resolves this
> > issue.
> >
> > This is just a proof of concept and someone should look to see if the
> > fuse max_background limit could cause a problem with this approach.
>
> Maybe you missed my comments here:
>
> https://lore.kernel.org/all/CAJfpegsTmiO-sKaBLgoVT4WxDXBkRES=HF1YmQN1ES7gfJEJ+w@mail.gmail.com/
That's odd - fwiw I too had completely missed that reply, sorry.
> I'm generally okay with this, but please write a proper changelog for
> the patch, also mentioning the issues related to posix locks.
>
> > --- a/fs/fuse/file.c
> > +++ b/fs/fuse/file.c
> > @@ -464,6 +464,67 @@ static void fuse_sync_writes(struct inode *inode)
> > fuse_release_nowrite(inode);
> > }
> >
> > +struct fuse_flush_args {
> > + struct fuse_args args;
> > + struct fuse_flush_in inarg;
> > + struct inode *inode;
> > + struct fuse_file *ff;
> > +};
> > +
> > +static void fuse_flush_end(struct fuse_mount *fm, struct fuse_args *args, int err)
> > +{
> > + struct fuse_flush_args *fa = container_of(args, typeof(*fa), args);
> > +
> > + if (err == -ENOSYS) {
> > + fm->fc->no_flush = 1;
> > + err = 0;
> > + }
> > +
> > + /*
> > + * In memory i_blocks is not maintained by fuse, if writeback cache is
> > + * enabled, i_blocks from cached attr may not be accurate.
> > + */
> > + if (!err && fm->fc->writeback_cache)
> > + fuse_invalidate_attr_mask(fa->inode, STATX_BLOCKS);
> > +
> > +
> > + iput(fa->inode);
> > + fuse_file_put(fa->ff, false, false);
> > + kfree(fa);
> > +}
> > +
> > +static int fuse_flush_async(struct file *file, fl_owner_t id)
> > +{
> > + struct inode *inode = file_inode(file);
> > + struct fuse_mount *fm = get_fuse_mount(inode);
> > + struct fuse_file *ff = file->private_data;
> > + struct fuse_flush_args *fa;
> > + int err;
> > +
> > + fa = kzalloc(sizeof(*fa), GFP_KERNEL);
> > + if (!fa)
> > + return -ENOMEM;
> > +
> > + fa->inarg.fh = ff->fh;
> > + fa->inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
> > + fa->args.opcode = FUSE_FLUSH;
> > + fa->args.nodeid = get_node_id(inode);
> > + fa->args.in_numargs = 1;
> > + fa->args.in_args[0].size = sizeof(fa->inarg);
> > + fa->args.in_args[0].value = &fa->inarg;
> > + fa->args.force = true;
> > + fa->args.nocreds = true;
> > + fa->args.end = fuse_flush_end;
> > + fa->inode = igrab(inode);
>
> Grabbing the inode should already taken care of by fuse_file_release().
>
> Also please try to reduce duplication in both the above functions.
>
> Thanks,
> Miklos
Hi,
I have some ideas about the cause of the error.
In the first message about this:
"However, there's a problem when the fuse daemon
itself spawns a thread that does a flush: since the thread has a copy of
the fd table with an fd pointing to the same fuse device, the reference
count isn't decremented to zero in fuse_dev_release(), and the task hangs
forever."
If the kernel starts to abort the filesystem (since the daemon in
userspace is terminated), and cannot do that since a file handle is
still open due to a flush, resulting in a hang, maybe the reason to
stop/abort the filesystem is wrong. The kernel should look at the fuse
device fd (which is duplicated after spawning), find there is still
one fd open, and should not go into aborting the fs.
I hope this helps,
Stef Bon
the Netherlands
Op di 27 sep. 2022 om 11:48 schreef Miklos Szeredi via fuse-devel
<fuse-devel@lists.sourceforge.net>:
>
> On Thu, 1 Sept 2022 at 16:07, Tycho Andersen <tycho@tycho.pizza> wrote:
> >
> > From: "Eric W. Biederman" <ebiederm@xmission.com>
> >
> > In my very light testing this resolves a hang where a thread of the
> > fuse server was accessing the fuse filesystem (the fuse server is
> > serving up), when the fuse server is killed.
> >
> > The practical problem is that the fuse server file descriptor was
> > being closed after the file descriptor into the fuse filesystem so
> > that the fuse filesystem operations were being blocked for instead of
> > being aborted. Simply skipping the unnecessary wait resolves this
> > issue.
> >
> > This is just a proof of concept and someone should look to see if the
> > fuse max_background limit could cause a problem with this approach.
>
> Maybe you missed my comments here:
>
> https://lore.kernel.org/all/CAJfpegsTmiO-sKaBLgoVT4WxDXBkRES=HF1YmQN1ES7gfJEJ+w@mail.gmail.com/
>
> I'm generally okay with this, but please write a proper changelog for
> the patch, also mentioning the issues related to posix locks.
>
> > --- a/fs/fuse/file.c
> > +++ b/fs/fuse/file.c
> > @@ -464,6 +464,67 @@ static void fuse_sync_writes(struct inode *inode)
> > fuse_release_nowrite(inode);
> > }
> >
> > +struct fuse_flush_args {
> > + struct fuse_args args;
> > + struct fuse_flush_in inarg;
> > + struct inode *inode;
> > + struct fuse_file *ff;
> > +};
> > +
> > +static void fuse_flush_end(struct fuse_mount *fm, struct fuse_args *args, int err)
> > +{
> > + struct fuse_flush_args *fa = container_of(args, typeof(*fa), args);
> > +
> > + if (err == -ENOSYS) {
> > + fm->fc->no_flush = 1;
> > + err = 0;
> > + }
> > +
> > + /*
> > + * In memory i_blocks is not maintained by fuse, if writeback cache is
> > + * enabled, i_blocks from cached attr may not be accurate.
> > + */
> > + if (!err && fm->fc->writeback_cache)
> > + fuse_invalidate_attr_mask(fa->inode, STATX_BLOCKS);
> > +
> > +
> > + iput(fa->inode);
> > + fuse_file_put(fa->ff, false, false);
> > + kfree(fa);
> > +}
> > +
> > +static int fuse_flush_async(struct file *file, fl_owner_t id)
> > +{
> > + struct inode *inode = file_inode(file);
> > + struct fuse_mount *fm = get_fuse_mount(inode);
> > + struct fuse_file *ff = file->private_data;
> > + struct fuse_flush_args *fa;
> > + int err;
> > +
> > + fa = kzalloc(sizeof(*fa), GFP_KERNEL);
> > + if (!fa)
> > + return -ENOMEM;
> > +
> > + fa->inarg.fh = ff->fh;
> > + fa->inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
> > + fa->args.opcode = FUSE_FLUSH;
> > + fa->args.nodeid = get_node_id(inode);
> > + fa->args.in_numargs = 1;
> > + fa->args.in_args[0].size = sizeof(fa->inarg);
> > + fa->args.in_args[0].value = &fa->inarg;
> > + fa->args.force = true;
> > + fa->args.nocreds = true;
> > + fa->args.end = fuse_flush_end;
> > + fa->inode = igrab(inode);
>
> Grabbing the inode should already taken care of by fuse_file_release().
>
> Also please try to reduce duplication in both the above functions.
>
> Thanks,
> Miklos
>
>
> --
> fuse-devel mailing list
> To unsubscribe or subscribe, visit https://lists.sourceforge.net/lists/listinfo/fuse-devel
If a fuse filesystem is mounted inside a container, there is a problem
during pid namespace destruction. The scenario is:
1. task (a thread in the fuse server, with a fuse file open) starts
exiting, does exit_signals(), goes into fuse_flush() -> wait
2. fuse daemon gets killed, tries to wake everyone up
3. task from 1 is stuck because complete_signal() doesn't wake it up, since
it has PF_EXITING.
The result is that the thread will never be woken up, and pid namespace
destruction will block indefinitely.
To add insult to injury, nobody is waiting for these return codes, since
the pid namespace is being destroyed.
To fix this, let's not block on flush operations when the current task has
PF_EXITING.
This does change the semantics slightly: the wait here is for posix locks
to be unlocked, so the task will exit before things are unlocked. To quote
Miklos: https://lore.kernel.org/all/CAJfpegsTmiO-sKaBLgoVT4WxDXBkRES=HF1YmQN1ES7gfJEJ+w@mail.gmail.com/
> "remote" posix locks are almost never used due to problems like this,
> so I think it's safe to do this.
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Tycho Andersen <tycho@tycho.pizza>
Link: https://lore.kernel.org/all/YrShFXRLtRt6T%2Fj+@risky/
---
v2: drop the fuse_flush_async() function and just re-use the already
prepared args; add a description of the problem+note about posix locks
---
fs/fuse/file.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 50 insertions(+)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 05caa2b9272e..20bbe3e1afc7 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -464,6 +464,34 @@ static void fuse_sync_writes(struct inode *inode)
fuse_release_nowrite(inode);
}
+struct fuse_flush_args {
+ struct fuse_args args;
+ struct fuse_flush_in inarg;
+ struct inode *inode;
+ struct fuse_file *ff;
+};
+
+static void fuse_flush_end(struct fuse_mount *fm, struct fuse_args *args, int err)
+{
+ struct fuse_flush_args *fa = container_of(args, typeof(*fa), args);
+
+ if (err == -ENOSYS) {
+ fm->fc->no_flush = 1;
+ err = 0;
+ }
+
+ /*
+ * In memory i_blocks is not maintained by fuse, if writeback cache is
+ * enabled, i_blocks from cached attr may not be accurate.
+ */
+ if (!err && fm->fc->writeback_cache)
+ fuse_invalidate_attr_mask(fa->inode, STATX_BLOCKS);
+
+
+ fuse_file_put(fa->ff, false, false);
+ kfree(fa);
+}
+
static int fuse_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file_inode(file);
@@ -505,6 +533,28 @@ static int fuse_flush(struct file *file, fl_owner_t id)
args.in_args[0].value = &inarg;
args.force = true;
+ if (current->flags & PF_EXITING) {
+ struct fuse_flush_args *fa;
+
+ err = -ENOMEM;
+ fa = kzalloc(sizeof(*fa), GFP_KERNEL);
+ if (!fa)
+ goto inval_attr_out;
+
+ memcpy(&fa->args, &args, sizeof(args));
+ memcpy(&fa->inarg, &inarg, sizeof(inarg));
+ fa->args.nocreds = true;
+ fa->args.end = fuse_flush_end;
+ fa->ff = fuse_file_get(ff);
+ fa->inode = inode;
+
+ err = fuse_simple_background(fm, &fa->args, GFP_KERNEL);
+ if (err)
+ fuse_flush_end(fm, &fa->args, err);
+
+ return err;
+ }
+
err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
fm->fc->no_flush = 1;
base-commit: 3d7cb6b04c3f3115719235cc6866b10326de34cd
--
2.34.1
On Thu, 29 Sept 2022 at 18:40, Tycho Andersen <tycho@tycho.pizza> wrote:
>
> If a fuse filesystem is mounted inside a container, there is a problem
> during pid namespace destruction. The scenario is:
>
> 1. task (a thread in the fuse server, with a fuse file open) starts
> exiting, does exit_signals(), goes into fuse_flush() -> wait
Can't the same happen through
fuse_flush -> fuse_sync_writes -> fuse_set_nowrite -> wait
?
> 2. fuse daemon gets killed, tries to wake everyone up
> 3. task from 1 is stuck because complete_signal() doesn't wake it up, since
> it has PF_EXITING.
>
> The result is that the thread will never be woken up, and pid namespace
> destruction will block indefinitely.
>
> To add insult to injury, nobody is waiting for these return codes, since
> the pid namespace is being destroyed.
>
> To fix this, let's not block on flush operations when the current task has
> PF_EXITING.
>
> This does change the semantics slightly: the wait here is for posix locks
> to be unlocked, so the task will exit before things are unlocked. To quote
> Miklos: https://lore.kernel.org/all/CAJfpegsTmiO-sKaBLgoVT4WxDXBkRES=HF1YmQN1ES7gfJEJ+w@mail.gmail.com/
>
> > "remote" posix locks are almost never used due to problems like this,
> > so I think it's safe to do this.
>
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> Signed-off-by: Tycho Andersen <tycho@tycho.pizza>
> Link: https://lore.kernel.org/all/YrShFXRLtRt6T%2Fj+@risky/
> ---
> v2: drop the fuse_flush_async() function and just re-use the already
> prepared args; add a description of the problem+note about posix locks
> ---
> fs/fuse/file.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 50 insertions(+)
>
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 05caa2b9272e..20bbe3e1afc7 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -464,6 +464,34 @@ static void fuse_sync_writes(struct inode *inode)
> fuse_release_nowrite(inode);
> }
>
> +struct fuse_flush_args {
> + struct fuse_args args;
> + struct fuse_flush_in inarg;
> + struct inode *inode;
> + struct fuse_file *ff;
> +};
> +
> +static void fuse_flush_end(struct fuse_mount *fm, struct fuse_args *args, int err)
> +{
> + struct fuse_flush_args *fa = container_of(args, typeof(*fa), args);
> +
> + if (err == -ENOSYS) {
> + fm->fc->no_flush = 1;
> + err = 0;
> + }
> +
> + /*
> + * In memory i_blocks is not maintained by fuse, if writeback cache is
> + * enabled, i_blocks from cached attr may not be accurate.
> + */
> + if (!err && fm->fc->writeback_cache)
> + fuse_invalidate_attr_mask(fa->inode, STATX_BLOCKS);
This is still duplicating code, can you please create a helper?
Thanks,
Miklos
On Fri, Sep 30, 2022 at 03:35:16PM +0200, Miklos Szeredi wrote:
> On Thu, 29 Sept 2022 at 18:40, Tycho Andersen <tycho@tycho.pizza> wrote:
> >
> > If a fuse filesystem is mounted inside a container, there is a problem
> > during pid namespace destruction. The scenario is:
> >
> > 1. task (a thread in the fuse server, with a fuse file open) starts
> > exiting, does exit_signals(), goes into fuse_flush() -> wait
>
> Can't the same happen through
>
> fuse_flush -> fuse_sync_writes -> fuse_set_nowrite -> wait
>
> ?
Looks like yes, though I haven't seen this in the wild, I guess
because most of the time there aren't multiple writers in the user
code that causes this.
I'm not exactly sure how to fix this. Reading through 3be5a52b30aa
("fuse: support writable mmap"), we don't want to allow multiple
writes since that may do allocations, which could cause deadlocks. But
in this case we have no reliable way to wait (besides a busy loop, I
suppose).
Maybe just a check for PF_EXITING and a pr_warn() with "echo 1 >
/sys/fs/fuse/connections/$N/abort" or something?
> > + /*
> > + * In memory i_blocks is not maintained by fuse, if writeback cache is
> > + * enabled, i_blocks from cached attr may not be accurate.
> > + */
> > + if (!err && fm->fc->writeback_cache)
> > + fuse_invalidate_attr_mask(fa->inode, STATX_BLOCKS);
>
> This is still duplicating code, can you please create a helper?
Yep, will do, pending the outcome of the above discussion.
Tycho
On Fri, 30 Sept 2022 at 16:01, Tycho Andersen <tycho@tycho.pizza> wrote:
>
> On Fri, Sep 30, 2022 at 03:35:16PM +0200, Miklos Szeredi wrote:
> > On Thu, 29 Sept 2022 at 18:40, Tycho Andersen <tycho@tycho.pizza> wrote:
> > >
> > > If a fuse filesystem is mounted inside a container, there is a problem
> > > during pid namespace destruction. The scenario is:
> > >
> > > 1. task (a thread in the fuse server, with a fuse file open) starts
> > > exiting, does exit_signals(), goes into fuse_flush() -> wait
> >
> > Can't the same happen through
> >
> > fuse_flush -> fuse_sync_writes -> fuse_set_nowrite -> wait
> >
> > ?
>
> Looks like yes, though I haven't seen this in the wild, I guess
> because there aren't multiple writers most of the time the user code
> that causes this.
>
> I'm not exactly sure how to fix this. Reading through 3be5a52b30aa
> ("fuse: support writable mmap"), we don't want to allow multiple
> writes since that may do allocations, which could cause deadlocks. But
> in this case we have no reliable way to wait (besides a busy loop, I
> suppose).
>
> Maybe just a check for PF_EXITING and a pr_warn() with "echo 1 >
> /sys/fs/fuse/connections/$N/abort" or something?
AFAICS it should be perfectly normal (and trivial to trigger) for an
exiting process to have its dirty pages flushed through fuse_flush().
We could do that asynchronously as well; generally there are no
promises about dirty pages being synced as part of the process
exiting. But ordering between dirty page flushing and sending the
FUSE_FLUSH request should be kept, which needs more complexity,
unfortunately.
Thanks,
Miklos
On Fri, Sep 30, 2022 at 04:41:37PM +0200, Miklos Szeredi wrote:
> On Fri, 30 Sept 2022 at 16:01, Tycho Andersen <tycho@tycho.pizza> wrote:
> >
> > On Fri, Sep 30, 2022 at 03:35:16PM +0200, Miklos Szeredi wrote:
> > > On Thu, 29 Sept 2022 at 18:40, Tycho Andersen <tycho@tycho.pizza> wrote:
> > > >
> > > > If a fuse filesystem is mounted inside a container, there is a problem
> > > > during pid namespace destruction. The scenario is:
> > > >
> > > > 1. task (a thread in the fuse server, with a fuse file open) starts
> > > > exiting, does exit_signals(), goes into fuse_flush() -> wait
> > >
> > > Can't the same happen through
> > >
> > > fuse_flush -> fuse_sync_writes -> fuse_set_nowrite -> wait
> > >
> > > ?
> >
> > Looks like yes, though I haven't seen this in the wild, I guess
> > because there aren't multiple writers most of the time the user code
> > that causes this.
> >
> > I'm not exactly sure how to fix this. Reading through 3be5a52b30aa
> > ("fuse: support writable mmap"), we don't want to allow multiple
> > writes since that may do allocations, which could cause deadlocks. But
> > in this case we have no reliable way to wait (besides a busy loop, I
> > suppose).
> >
> > Maybe just a check for PF_EXITING and a pr_warn() with "echo 1 >
> > /sys/fs/fuse/connections/$N/abort" or something?
>
> AFAICS it should be perfectly normal (and trivial to trigger) for an
> exiting process to have its dirty pages flushed through fuse_flush().
Agreed.
> We could do that asynchronously as well, generally there are no
> promises about dirty pages being synced as part of the process exiting
> . But ordering between dirty page flushing and sending the FUSE_FLUSH
> request should be kept. Which needs more complexity, unfortunately.
How can we wait in fuse_set_nowrite()? Or are you suggesting we just
do a fuse_flush_writepages() in the async part and hope for the best?
Thanks,
Tycho
On Fri, 30 Sept 2022 at 18:10, Tycho Andersen <tycho@tycho.pizza> wrote:
>
> On Fri, Sep 30, 2022 at 04:41:37PM +0200, Miklos Szeredi wrote:
> > On Fri, 30 Sept 2022 at 16:01, Tycho Andersen <tycho@tycho.pizza> wrote:
> > >
> > > On Fri, Sep 30, 2022 at 03:35:16PM +0200, Miklos Szeredi wrote:
> > > > On Thu, 29 Sept 2022 at 18:40, Tycho Andersen <tycho@tycho.pizza> wrote:
> > > > >
> > > > > If a fuse filesystem is mounted inside a container, there is a problem
> > > > > during pid namespace destruction. The scenario is:
> > > > >
> > > > > 1. task (a thread in the fuse server, with a fuse file open) starts
> > > > > exiting, does exit_signals(), goes into fuse_flush() -> wait
> > > >
> > > > Can't the same happen through
> > > >
> > > > fuse_flush -> fuse_sync_writes -> fuse_set_nowrite -> wait
> > > >
> > > > ?
> > >
> > > Looks like yes, though I haven't seen this in the wild, I guess
> > > because there aren't multiple writers most of the time the user code
> > > that causes this.
> > >
> > > I'm not exactly sure how to fix this. Reading through 3be5a52b30aa
> > > ("fuse: support writable mmap"), we don't want to allow multiple
> > > writes since that may do allocations, which could cause deadlocks. But
> > > in this case we have no reliable way to wait (besides a busy loop, I
> > > suppose).
> > >
> > > Maybe just a check for PF_EXITING and a pr_warn() with "echo 1 >
> > > /sys/fs/fuse/connections/$N/abort" or something?
> >
> > AFAICS it should be perfectly normal (and trivial to trigger) for an
> > exiting process to have its dirty pages flushed through fuse_flush().
>
> Agreed.
>
> > We could do that asynchronously as well, generally there are no
> > promises about dirty pages being synced as part of the process exiting
> > . But ordering between dirty page flushing and sending the FUSE_FLUSH
> > request should be kept. Which needs more complexity, unfortunately.
>
> How can we wait in fuse_set_nowrite()? Or are you suggesting we just
> do a fuse_flush_writepages() in the async part and hope for the best?
I was thinking along the lines of calling schedule_work() in the
exiting case to do the flush.
Thanks,
Miklos
If a fuse filesystem is mounted inside a container, there is a problem
during pid namespace destruction. The scenario is:
1. task (a thread in the fuse server, with a fuse file open) starts
exiting, does exit_signals(), goes into fuse_flush() -> wait
2. fuse daemon gets killed, tries to wake everyone up
3. task from 1 is stuck because complete_signal() doesn't wake it up, since
it has PF_EXITING.
The result is that the thread will never be woken up, and pid namespace
destruction will block indefinitely.
To add insult to injury, nobody is waiting for these return codes, since
the pid namespace is being destroyed.
To fix this, let's not block on flush operations when the current task has
PF_EXITING.
This does change the semantics slightly: the wait here is for posix locks
to be unlocked, so the task will exit before things are unlocked. To quote
Miklos: https://lore.kernel.org/all/CAJfpegsTmiO-sKaBLgoVT4WxDXBkRES=HF1YmQN1ES7gfJEJ+w@mail.gmail.com/
> "remote" posix locks are almost never used due to problems like this,
> so I think it's safe to do this.
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Tycho Andersen <tycho@tycho.pizza>
Link: https://lore.kernel.org/all/YrShFXRLtRt6T%2Fj+@risky/
---
v2: drop the fuse_flush_async() function and just re-use the already
prepared args; add a description of the problem+note about posix locks
v3: use schedule_work() to avoid other sleeps in write_inode_now() and
fuse_sync_writes(). Fix a UAF of the stack-based inarg.
---
fs/fuse/file.c | 106 +++++++++++++++++++++++++++++++++++++++----------
1 file changed, 84 insertions(+), 22 deletions(-)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 71bfb663aac5..10173b0e74b7 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -18,6 +18,7 @@
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/fs.h>
+#include <linux/file.h>
static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
unsigned int open_flags, int opcode,
@@ -477,20 +478,20 @@ static void fuse_sync_writes(struct inode *inode)
fuse_release_nowrite(inode);
}
-static int fuse_flush(struct file *file, fl_owner_t id)
+static void fuse_invalidate_attrs(struct fuse_mount *fm, int err, struct inode *inode)
{
- struct inode *inode = file_inode(file);
- struct fuse_mount *fm = get_fuse_mount(inode);
- struct fuse_file *ff = file->private_data;
- struct fuse_flush_in inarg;
- FUSE_ARGS(args);
- int err;
-
- if (fuse_is_bad(inode))
- return -EIO;
+ /*
+ * In memory i_blocks is not maintained by fuse, if writeback cache is
+ * enabled, i_blocks from cached attr may not be accurate.
+ */
+ if (!err && fm->fc->writeback_cache)
+ fuse_invalidate_attr_mask(inode, STATX_BLOCKS);
+}
- if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache)
- return 0;
+static int do_fuse_flush(struct fuse_mount *fm, struct inode *inode,
+ struct file *file, struct fuse_args *args)
+{
+ int err;
err = write_inode_now(inode, 1);
if (err)
@@ -504,6 +505,53 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (err)
return err;
+ err = fuse_simple_request(fm, args);
+ if (err == -ENOSYS) {
+ fm->fc->no_flush = 1;
+ err = 0;
+ }
+
+ return err;
+}
+
+struct fuse_flush_args {
+ struct fuse_args args;
+ struct fuse_flush_in inarg;
+ struct inode *inode;
+ struct fuse_file *ff;
+ struct work_struct work;
+ struct file *file;
+};
+
+static void fuse_flush_async(struct work_struct *work)
+{
+ struct fuse_flush_args *fa = container_of(work, typeof(*fa), work);
+ struct fuse_mount *fm = get_fuse_mount(fa->inode);
+ int err;
+
+ err = do_fuse_flush(fm, fa->inode, fa->file, &fa->args);
+ if (err < 0)
+ fuse_invalidate_attrs(fm, err, fa->inode);
+ fuse_file_put(fa->ff, false, false);
+ fput(fa->file);
+ kfree(fa);
+}
+
+static int fuse_flush(struct file *file, fl_owner_t id)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_file *ff = file->private_data;
+ struct fuse_flush_in inarg;
+ FUSE_ARGS(args);
+ int err;
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache)
+ return 0;
+
err = 0;
if (fm->fc->no_flush)
goto inval_attr_out;
@@ -518,19 +566,33 @@ static int fuse_flush(struct file *file, fl_owner_t id)
args.in_args[0].value = &inarg;
args.force = true;
- err = fuse_simple_request(fm, &args);
- if (err == -ENOSYS) {
- fm->fc->no_flush = 1;
- err = 0;
+ if (current->flags & PF_EXITING) {
+ struct fuse_flush_args *fa;
+
+ err = -ENOMEM;
+ fa = kzalloc(sizeof(*fa), GFP_KERNEL);
+ if (!fa)
+ goto inval_attr_out;
+
+ memcpy(&fa->args, &args, sizeof(args));
+ memcpy(&fa->inarg, &inarg, sizeof(inarg));
+ fa->args.in_args[0].value = &fa->inarg;
+ fa->args.nocreds = true;
+ fa->ff = fuse_file_get(ff);
+ fa->inode = inode;
+ fa->file = get_file(file);
+
+ INIT_WORK(&fa->work, fuse_flush_async);
+ schedule_work(&fa->work);
+ return 0;
}
+ err = do_fuse_flush(fm, inode, file, &args);
+ if (!err)
+ return 0;
+
inval_attr_out:
- /*
- * In memory i_blocks is not maintained by fuse, if writeback cache is
- * enabled, i_blocks from cached attr may not be accurate.
- */
- if (!err && fm->fc->writeback_cache)
- fuse_invalidate_attr_mask(inode, STATX_BLOCKS);
+ fuse_invalidate_attrs(fm, err, inode);
return err;
}
base-commit: f0c4d9fc9cc9462659728d168387191387e903cc
--
2.34.1
Hi Miklos, On Mon, Nov 14, 2022 at 09:02:09AM -0700, Tycho Andersen wrote: > v3: use schedule_work() to avoid other sleeps in inode_write_now() and > fuse_sync_writes(). Fix a UAF of the stack-based inarg. Thoughts on this version? Thanks, Tycho
On Mon, 28 Nov 2022 at 16:01, Tycho Andersen <tycho@tycho.pizza> wrote: > > Hi Miklos, > > On Mon, Nov 14, 2022 at 09:02:09AM -0700, Tycho Andersen wrote: > > v3: use schedule_work() to avoid other sleeps in inode_write_now() and > > fuse_sync_writes(). Fix a UAF of the stack-based inarg. > > Thoughts on this version? Skipping attr invalidation on success is wrong. And there's still too much duplication, IMO. How about the attached (untested) patch? Thanks, Miklos
On Thu, Dec 08, 2022 at 03:26:19PM +0100, Miklos Szeredi wrote: > On Mon, 28 Nov 2022 at 16:01, Tycho Andersen <tycho@tycho.pizza> wrote: > > > > Hi Miklos, > > > > On Mon, Nov 14, 2022 at 09:02:09AM -0700, Tycho Andersen wrote: > > > v3: use schedule_work() to avoid other sleeps in inode_write_now() and > > > fuse_sync_writes(). Fix a UAF of the stack-based inarg. > > > > Thoughts on this version? > > Skipping attr invalidation on success is wrong. Agreed, that looks like my mistake. > How about the attached (untested) patch? It passes my reproducer with no warnings or anything. Feel free to add: Tested-by: Tycho Andersen <tycho@tycho.pizza> if you want to commit it. Tycho
On Thu, Dec 08, 2022 at 10:49:30AM -0700, Tycho Andersen wrote: > On Thu, Dec 08, 2022 at 03:26:19PM +0100, Miklos Szeredi wrote: > > On Mon, 28 Nov 2022 at 16:01, Tycho Andersen <tycho@tycho.pizza> wrote: > > > > > > Hi Miklos, > > > > > > On Mon, Nov 14, 2022 at 09:02:09AM -0700, Tycho Andersen wrote: > > > > v3: use schedule_work() to avoid other sleeps in inode_write_now() and > > > > fuse_sync_writes(). Fix a UAF of the stack-based inarg. > > > > > > Thoughts on this version? > > > > Skipping attr invalidation on success is wrong. > > Agreed, that looks like my mistake. > > > How about the attached (untested) patch? > > It passes my reproducer with no warnings or anything. Feel free to > add: > > Tested-by: Tycho Andersen <tycho@tycho.pizza> > > if you want to commit it. Ping, thoughts on landing this? Thanks, Tycho
On Mon, Dec 19, 2022 at 12:16:50PM -0700, Tycho Andersen wrote: > On Thu, Dec 08, 2022 at 10:49:30AM -0700, Tycho Andersen wrote: > > On Thu, Dec 08, 2022 at 03:26:19PM +0100, Miklos Szeredi wrote: > > > On Mon, 28 Nov 2022 at 16:01, Tycho Andersen <tycho@tycho.pizza> wrote: > > > > > > > > Hi Miklos, > > > > > > > > On Mon, Nov 14, 2022 at 09:02:09AM -0700, Tycho Andersen wrote: > > > > > v3: use schedule_work() to avoid other sleeps in inode_write_now() and > > > > > fuse_sync_writes(). Fix a UAF of the stack-based inarg. > > > > > > > > Thoughts on this version? > > > > > > Skipping attr invalidation on success is wrong. > > > > Agreed, that looks like my mistake. > > > > > How about the attached (untested) patch? > > > > It passes my reproducer with no warnings or anything. Feel free to > > add: > > > > Tested-by: Tycho Andersen <tycho@tycho.pizza> > > > > if you want to commit it. > > Ping, thoughts on landing this? Happy new year all. Any update here? Thanks, Tycho
On Tue, 3 Jan 2023 at 15:51, Tycho Andersen <tycho@tycho.pizza> wrote: > Happy new year all. Any update here? Applied, thanks. Miklos
On Tue, Jan 03, 2023 at 07:51:22AM -0700, Tycho Andersen wrote: > On Mon, Dec 19, 2022 at 12:16:50PM -0700, Tycho Andersen wrote: > > On Thu, Dec 08, 2022 at 10:49:30AM -0700, Tycho Andersen wrote: > > > On Thu, Dec 08, 2022 at 03:26:19PM +0100, Miklos Szeredi wrote: > > > > On Mon, 28 Nov 2022 at 16:01, Tycho Andersen <tycho@tycho.pizza> wrote: > > > > > > > > > > Hi Miklos, > > > > > > > > > > On Mon, Nov 14, 2022 at 09:02:09AM -0700, Tycho Andersen wrote: > > > > > > v3: use schedule_work() to avoid other sleeps in inode_write_now() and > > > > > > fuse_sync_writes(). Fix a UAF of the stack-based inarg. > > > > > > > > > > Thoughts on this version? > > > > > > > > Skipping attr invalidation on success is wrong. > > > > > > Agreed, that looks like my mistake. > > > > > > > How about the attached (untested) patch? > > > > > > It passes my reproducer with no warnings or anything. Feel free to > > > add: > > > > > > Tested-by: Tycho Andersen <tycho@tycho.pizza> > > > > > > if you want to commit it. > > > > Ping, thoughts on landing this? > > Happy new year all. Any update here? > > Thanks, > > Tycho Thanks for pushing on this, Tycho. I'd suggest sending a clean new version incorporating Miklos' fix. -serge
Hi Miklos, On Thu, Sep 01, 2022 at 08:06:47AM -0600, Tycho Andersen wrote: > From: "Eric W. Biederman" <ebiederm@xmission.com> > > In my very light testing this resolves a hang where a thread of the > fuse server was accessing the fuse filesystem (the fuse server is > serving up), when the fuse server is killed. > > The practical problem is that the fuse server file descriptor was > being closed after the file descriptor into the fuse filesystem so > that the fuse filesystem operations were being blocked for instead of > being aborted. Simply skipping the unnecessary wait resolves this > issue. > > This is just a proof of concept and someone should look to see if the > fuse max_background limit could cause a problem with this approach. > > Additionally testing PF_EXITING is a very crude way to tell if someone > wants the return code from the vfs flush operation. As such in the > long run it probably makes sense to get some direct vfs support for > knowing if flush needs to block until all of the flushing is complete > and a status/return code can be returned. > > Unless I have missed something this is a generic optimization that can > apply to many network filesystems. > > Al, vfs folks? (igrab/iput sorted so as not to be distractions). > > Perhaps a .flush_async method without a return code and a > filp_close_async function without a return code to take advantage of > this in the general sense. > > Waiting potentially indefinitely for user space in do_exit seems like a > bad idea. Especially when all that the wait is for is to get a return > code that will never be examined. > > Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> > [tycho: small fixups for releasing fuse file + nocred flag] > Signed-off-by: Tycho Andersen <tycho@tycho.pizza> > Reported-by: Tycho Andersen <tycho@tycho.pizza> > Tested-by: "Serge E. Hallyn" <serge@hallyn.com> Any chance you're willing to take this patch? We're still seeing this a lot and it would be great to get it fixed. 
Thanks. Tycho
Hi, On Mon, Sep 19, 2022 at 09:03:47AM -0600, Tycho Andersen wrote: > Hi Miklos, > > On Thu, Sep 01, 2022 at 08:06:47AM -0600, Tycho Andersen wrote: > > From: "Eric W. Biederman" <ebiederm@xmission.com> > > > > In my very light testing this resolves a hang where a thread of the > > fuse server was accessing the fuse filesystem (the fuse server is > > serving up), when the fuse server is killed. > > > > The practical problem is that the fuse server file descriptor was > > being closed after the file descriptor into the fuse filesystem so > > that the fuse filesystem operations were being blocked for instead of > > being aborted. Simply skipping the unnecessary wait resolves this > > issue. > > > > This is just a proof of concept and someone should look to see if the > > fuse max_background limit could cause a problem with this approach. > > > > Additionally testing PF_EXITING is a very crude way to tell if someone > > wants the return code from the vfs flush operation. As such in the > > long run it probably makes sense to get some direct vfs support for > > knowing if flush needs to block until all of the flushing is complete > > and a status/return code can be returned. > > > > Unless I have missed something this is a generic optimization that can > > apply to many network filesystems. > > > > Al, vfs folks? (igrab/iput sorted so as not to be distractions). > > > > Perhaps a .flush_async method without a return code and a > > filp_close_async function without a return code to take advantage of > > this in the general sense. > > > > Waiting potentially indefinitely for user space in do_exit seems like a > > bad idea. Especially when all that the wait is for is to get a return > > code that will never be examined. > > > > Signed-off-by: "Eric W. 
Biederman" <ebiederm@xmission.com> > > [tycho: small fixups for releasing fuse file + nocred flag] > > Signed-off-by: Tycho Andersen <tycho@tycho.pizza> > > Reported-by: Tycho Andersen <tycho@tycho.pizza> > > Tested-by: "Serge E. Hallyn" <serge@hallyn.com> > > Any chance you're willing to take this patch? We're still seeing this > a lot and it would be great to get it fixed. Another ping here, can someone take this? Miklos? Thanks, Tycho
On Mon, Sep 19, 2022 at 09:03:41AM -0600, Tycho Andersen wrote: > Hi Miklos, > > On Thu, Sep 01, 2022 at 08:06:47AM -0600, Tycho Andersen wrote: > > From: "Eric W. Biederman" <ebiederm@xmission.com> > > > > In my very light testing this resolves a hang where a thread of the > > fuse server was accessing the fuse filesystem (the fuse server is > > serving up), when the fuse server is killed. > > > > The practical problem is that the fuse server file descriptor was > > being closed after the file descriptor into the fuse filesystem so > > that the fuse filesystem operations were being blocked for instead of > > being aborted. Simply skipping the unnecessary wait resolves this > > issue. > > > > This is just a proof of concept and someone should look to see if the > > fuse max_background limit could cause a problem with this approach. I tried to track this down last week, but it looks to me like since the max_background is per-connection, this should work as expected and not affect any other connections. > > Additionally testing PF_EXITING is a very crude way to tell if someone > > wants the return code from the vfs flush operation. As such in the > > long run it probably makes sense to get some direct vfs support for > > knowing if flush needs to block until all of the flushing is complete > > and a status/return code can be returned. > > > > Unless I have missed something this is a generic optimization that can > > apply to many network filesystems. > > > > Al, vfs folks? (igrab/iput sorted so as not to be distractions). > > > > Perhaps a .flush_async method without a return code and a > > filp_close_async function without a return code to take advantage of > > this in the general sense. > > > > Waiting potentially indefinitely for user space in do_exit seems like a > > bad idea. Especially when all that the wait is for is to get a return > > code that will never be examined. > > > > Signed-off-by: "Eric W. 
Biederman" <ebiederm@xmission.com> > > [tycho: small fixups for releasing fuse file + nocred flag] > > Signed-off-by: Tycho Andersen <tycho@tycho.pizza> > > Reported-by: Tycho Andersen <tycho@tycho.pizza> > > Tested-by: "Serge E. Hallyn" <serge@hallyn.com> > > Any chance you're willing to take this patch? We're still seeing this > a lot and it would be great to get it fixed. > > Thanks. > > Tycho
© 2016 - 2026 Red Hat, Inc.