When the user sends a termination signal, qemu-storage-daemon stops
the export, exits the main loop (main_loop_wait), and begins cleaning
up associated resources. At this point, some SQEs submitted via
FUSE_IO_URING_CMD_COMMIT_AND_FETCH may still be pending in the kernel,
waiting for incoming FUSE requests; when they complete, they trigger
CQE handlers in user space.
Currently, there is no way to manually cancel these pending requests in
the kernel. As a result, after export termination, the related data
structures might be freed before the pending CQEs return, causing the
CQE handler to be invoked on already-freed memory, which may lead to a
segfault.
As a workaround, when submitting an SQE to the kernel, we increment the
block export reference count (blk_exp_ref) to prevent the export, and
with it the CQE handler's data, from being deleted during export
termination. Once the CQE is received, we decrement the reference count
(blk_exp_unref).
Suggested-by: Kevin Wolf <kwolf@redhat.com>
Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Brian Song <hibriansong@gmail.com>
---
block/export/fuse.c | 75 +++++++++++++++++++++++++++++++++++++++------
1 file changed, 65 insertions(+), 10 deletions(-)
diff --git a/block/export/fuse.c b/block/export/fuse.c
index 07f74fc8ec..ab2eb895ad 100644
--- a/block/export/fuse.c
+++ b/block/export/fuse.c
@@ -39,6 +39,7 @@
#include "standard-headers/linux/fuse.h"
#include <sys/ioctl.h>
+#include <sys/sysinfo.h>
#if defined(CONFIG_FALLOCATE_ZERO_RANGE)
#include <linux/falloc.h>
@@ -321,6 +322,8 @@ static void coroutine_fn co_fuse_uring_queue_handle_cqes(void *opaque)
fuse_inc_in_flight(exp);
/* A ring entry returned */
+ blk_exp_unref(&exp->common);
+
fuse_uring_co_process_request(ent);
/* Finished processing requests */
@@ -345,6 +348,9 @@ static void fuse_uring_cqe_handler(CqeHandler *cqe_handler)
err != -ENOTCONN) {
fuse_export_halt(exp);
}
+
+ /* A ring entry returned */
+ blk_exp_unref(&exp->common);
} else {
co = qemu_coroutine_create(co_fuse_uring_queue_handle_cqes, ent);
qemu_coroutine_enter(co);
@@ -392,6 +398,8 @@ static void fuse_uring_submit_register(void *opaque)
FuseRingEnt *ent = opaque;
FuseExport *exp = ent->rq->q->exp;
+ /* Commit and fetch a ring entry */
+ blk_exp_ref(&exp->common);
aio_add_sqe(fuse_uring_prep_sqe_register, ent, &(ent->fuse_cqe_handler));
}
@@ -886,6 +894,38 @@ static void read_from_fuse_fd(void *opaque)
qemu_coroutine_enter(co);
}
+#ifdef CONFIG_LINUX_IO_URING
+static void fuse_ring_queue_manager_destroy(FuseRingQueueManager *manager)
+{
+ if (!manager) {
+ return;
+ }
+
+ for (int i = 0; i < manager->num_ring_queues; i++) {
+ FuseRingQueue *rq = &manager->ring_queues[i];
+
+ for (int j = 0; j < FUSE_DEFAULT_RING_QUEUE_DEPTH; j++) {
+ g_free(rq->ent[j].op_payload);
+ }
+ g_free(rq->ent);
+ }
+
+ g_free(manager->ring_queues);
+ g_free(manager);
+}
+
+static void fuse_export_delete_uring(FuseExport *exp)
+{
+ exp->is_uring = false;
+
+ /* Clean up ring queue manager */
+ if (exp->ring_queue_manager) {
+ fuse_ring_queue_manager_destroy(exp->ring_queue_manager);
+ exp->ring_queue_manager = NULL;
+ }
+}
+#endif
+
static void fuse_export_shutdown(BlockExport *blk_exp)
{
FuseExport *exp = container_of(blk_exp, FuseExport, common);
@@ -901,24 +941,15 @@ static void fuse_export_shutdown(BlockExport *blk_exp)
*/
g_hash_table_remove(exports, exp->mountpoint);
}
-}
-
-static void fuse_export_delete(BlockExport *blk_exp)
-{
- FuseExport *exp = container_of(blk_exp, FuseExport, common);
- for (int i = 0; i < exp->num_queues; i++) {
+ for (size_t i = 0; i < exp->num_queues; i++) {
FuseQueue *q = &exp->queues[i];
/* Queue 0's FD belongs to the FUSE session */
if (i > 0 && q->fuse_fd >= 0) {
close(q->fuse_fd);
}
- if (q->spillover_buf) {
- qemu_vfree(q->spillover_buf);
- }
}
- g_free(exp->queues);
if (exp->fuse_session) {
if (exp->mounted) {
@@ -927,8 +958,29 @@ static void fuse_export_delete(BlockExport *blk_exp)
fuse_session_destroy(exp->fuse_session);
}
+}
+
+static void fuse_export_delete(BlockExport *blk_exp)
+{
+ FuseExport *exp = container_of(blk_exp, FuseExport, common);
+
+ for (size_t i = 0; i < exp->num_queues; i++) {
+ FuseQueue *q = &exp->queues[i];
+
+ if (q->spillover_buf) {
+ qemu_vfree(q->spillover_buf);
+ }
+ }
g_free(exp->mountpoint);
+
+#ifdef CONFIG_LINUX_IO_URING
+ if (exp->is_uring) {
+ fuse_export_delete_uring(exp);
+ }
+#endif
+
+ g_free(exp->queues);
}
/**
@@ -1917,6 +1969,9 @@ fuse_uring_send_response(FuseRingEnt *ent, uint32_t req_id, ssize_t ret,
out_header->unique = req_id;
/* out_header->len = ret > 0 ? ret : 0; */
ent_in_out->payload_sz = ret > 0 ? ret : 0;
+
+ /* Commit and fetch a ring entry */
+ blk_exp_ref(&exp->common);
aio_add_sqe(fuse_uring_prep_sqe_commit, ent,
&ent->fuse_cqe_handler);
}
--
2.45.2
On Fri, Aug 29, 2025 at 10:50:24PM -0400, Brian Song wrote:
> @@ -901,24 +941,15 @@ static void fuse_export_shutdown(BlockExport *blk_exp)
>           */
>          g_hash_table_remove(exports, exp->mountpoint);
>      }
> -}
> -
> -static void fuse_export_delete(BlockExport *blk_exp)
> -{
> -    FuseExport *exp = container_of(blk_exp, FuseExport, common);
>
> -    for (int i = 0; i < exp->num_queues; i++) {
> +    for (size_t i = 0; i < exp->num_queues; i++) {
>          FuseQueue *q = &exp->queues[i];
>
>          /* Queue 0's FD belongs to the FUSE session */
>          if (i > 0 && q->fuse_fd >= 0) {
>              close(q->fuse_fd);

This changes the behavior of the non-io_uring code. Now all fuse fds and
fuse_session are closed while requests are potentially still being
processed.

There is a race condition: if an IOThread is processing a request here
then it may invoke a system call on q->fuse_fd just after it has been
closed but not set to -1. If another thread has also opened a new file
then the fd could be reused, resulting in an accidental write(2) to the
new file. I'm not sure whether there is a way to trigger this in
practice, but it looks like a problem waiting to happen.

Simply setting q->fuse_fd to -1 here doesn't fix the race. It would be
necessary to stop processing fuse_fd in the thread before closing it
here or to schedule a BH in each thread so that fuse_fd can be closed
in the thread that uses the fd.
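A minimal sketch of that BH approach, assuming each FuseQueue records
the AioContext it is processed in as a q->ctx field (an assumed name,
not necessarily the actual fuse.c layout); aio_bh_schedule_oneshot() is
the real QEMU primitive:

static void fuse_queue_close_bh(void *opaque)
{
    FuseQueue *q = opaque;

    /* Runs in the queue's own AioContext, so it cannot race with a
     * request that is mid-syscall on fuse_fd in that thread */
    if (q->fuse_fd >= 0) {
        close(q->fuse_fd);
        q->fuse_fd = -1;
    }
}

/* Called from the main loop thread; queue 0's fd belongs to the FUSE
 * session and is left for libfuse to close */
static void fuse_export_close_queue_fds(FuseExport *exp)
{
    for (size_t i = 1; i < exp->num_queues; i++) {
        FuseQueue *q = &exp->queues[i];

        aio_bh_schedule_oneshot(q->ctx, fuse_queue_close_bh, q);
    }
}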
Hi Hanna,

Stefan raised the above issue and proposed a preliminary solution: keep
closing the file descriptors in the delete section, but perform umount
separately for FUSE uring and traditional FUSE, in the shutdown and
delete sections respectively. This approach avoids the race condition
on the file descriptor.

In the case of FUSE uring, umount must be performed in the shutdown
section. The reason is that the kernel currently lacks an interface to
explicitly cancel submitted SQEs. Performing umount forces the kernel
to flush all pending SQEs and return their CQEs. Without this step,
CQEs may arrive after the export has already been deleted, and invoking
the CQE handler at that point would dereference freed memory and
trigger a segmentation fault.

I’m curious about traditional FUSE: is it strictly necessary to perform
umount in the delete section, or could it also be done in shutdown?
Additionally, what is the correct ordering between close(fd) and
umount: does one need to precede the other?

Thanks,
Brian
On 15.09.25 07:43, Brian Song wrote:
> Hi Hanna,

Hi Brian!

(Thanks for your heads-up!)

> Stefan raised the above issue and proposed a preliminary solution: keep
> closing the file descriptors in the delete section, but perform umount
> separately for FUSE uring and traditional FUSE, in the shutdown and
> delete sections respectively. This approach avoids the race condition
> on the file descriptor.
>
> In the case of FUSE uring, umount must be performed in the shutdown
> section. The reason is that the kernel currently lacks an interface to
> explicitly cancel submitted SQEs. Performing umount forces the kernel
> to flush all pending SQEs and return their CQEs. Without this step,
> CQEs may arrive after the export has already been deleted, and invoking
> the CQE handler at that point would dereference freed memory and
> trigger a segmentation fault.

The commit message says that incrementing the BB reference would be
enough to solve the problem (i.e. deleting is delayed until all
requests are done). Why isn’t it?

> I’m curious about traditional FUSE: is it strictly necessary to perform
> umount in the delete section, or could it also be done in shutdown?

Looking into libfuse, fuse_session_unmount() (in fuse_kern_unmount())
closes the FUSE FD. I can imagine that might result in the potential
problems Stefan described.

> Additionally, what is the correct ordering between close(fd) and
> umount: does one need to precede the other?

fuse_kern_unmount() closes the (queue 0) FD first before actually
unmounting, with a comment: “Need to close file descriptor, otherwise
synchronous umount would recurse into filesystem, and deadlock.”

Given that, I assume the FDs should all be closed before unmounting.

(Though to be fair, before looking into it now, I don’t think I’ve ever
given it much thought…)

Hanna
On 9/17/25 9:01 AM, Hanna Czenczek wrote:
> The commit message says that incrementing the BB reference would be
> enough to solve the problem (i.e. deleting is delayed until all
> requests are done). Why isn’t it?

Hanna:

If we place umount in the delete section instead of the shutdown
section, the kernel FUSE driver will continue waiting for user FUSE
requests and therefore won't return CQEs to userspace. As a result, the
BB reference remains held (the reference is acquired during
registration and submission and only released once the CQE returns),
preventing the delete operation from being invoked (it only runs once
the reference count drops to 0). This is why umount must be placed in
the shutdown section.

> fuse_kern_unmount() closes the (queue 0) FD first before actually
> unmounting, with a comment: “Need to close file descriptor, otherwise
> synchronous umount would recurse into filesystem, and deadlock.”
>
> Given that, I assume the FDs should all be closed before unmounting.

Stefan:

I roughly went through the umount and close system calls:

umount:
  fuse_kill_sb_anon -> fuse_sb_destroy -> fuse_abort_conn

close:
  __fput -> file->f_op->release(inode, file) -> fuse_dev_release ->
  fuse_abort_conn
  (this only runs after all /dev/fuse FDs have been closed)

And as Hanna mentioned, libfuse points out: “Need to close file
descriptor, otherwise synchronous umount would recurse into filesystem,
and deadlock.”

So ideally, we should close each queue FD first, then call umount at
the end, even though calling umount directly also works. The root issue
is that the kernel doesn't provide an interface to cancel already
submitted SQEs.

You mentioned that in fuse over io_uring mode we perform close in the
shutdown path, but at that point the server may still be processing
requests. While handling requests, it may still write to the FD, but
that FD might not be /dev/fuse. I'm not sure how this gets triggered,
since in fuse uring mode all FUSE requests are handled by io_uring, and
our FUSE requests should be completed via io_uring. After shutdown
closes the FD, it may call fuse_abort_conn, which terminates all
request processing in the kernel. There's also locking in place to
protect the termination of requests and the subsequent uring cleanup.

That's why I think the best approach for now is:

- in shutdown, handle close and umount for fuse over io_uring;
- in delete, handle close and umount for traditional FUSE.

A sketch of that split is below.
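In the sketch, fuse_export_close_queue_fds() and fuse_export_unmount()
are hypothetical helpers wrapping the close loop and
fuse_session_unmount(); this is not the actual patch:

static void fuse_export_shutdown(BlockExport *blk_exp)
{
    FuseExport *exp = container_of(blk_exp, FuseExport, common);

#ifdef CONFIG_LINUX_IO_URING
    if (exp->is_uring) {
        /* Unmounting here forces the kernel to flush pending SQEs and
         * complete their CQEs, so no CQE handler can run after delete */
        fuse_export_close_queue_fds(exp);
        fuse_export_unmount(exp);
    }
#endif
    /* Traditional FUSE: leave the fds and the mount alone until delete */
}

static void fuse_export_delete(BlockExport *blk_exp)
{
    FuseExport *exp = container_of(blk_exp, FuseExport, common);
    bool torn_down = false;

#ifdef CONFIG_LINUX_IO_URING
    torn_down = exp->is_uring;    /* already handled in shutdown */
#endif
    if (!torn_down) {
        /* Traditional FUSE: no in-flight requests remain at delete time */
        fuse_export_close_queue_fds(exp);
        fuse_export_unmount(exp);
    }
}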
On Wed, Sep 17, 2025 at 06:06:55PM -0400, Brian Song wrote:
> You mentioned that in fuse over io_uring mode we perform close in the
> shutdown path, but at that point the server may still be processing
> requests. While handling requests, it may still write to the FD, but
> that FD might not be /dev/fuse. I'm not sure how this gets triggered,
> since in fuse uring mode all FUSE requests are handled by io_uring,
> and our FUSE requests should be completed via io_uring. After shutdown
> closes the FD, it may call fuse_abort_conn, which terminates all
> request processing in the kernel.

If another thread opens a new file descriptor, the kernel will hand out
the lowest numbered available file descriptor. That fd could be the
FUSE-over-io_uring fd that was just closed by the main loop thread
while the IOThread is still waiting for CQEs or in the middle of
processing a FUSE-over-io_uring request. An IOThread must not use the
stale fd (e.g. as part of an io_uring SQE) thinking it is a FUSE fd.

> There's also locking in place to protect the termination of requests
> and the subsequent uring cleanup.
>
> That's why I think the best approach for now is:
>
> - in shutdown, handle close and umount for fuse over io_uring;
> - in delete, handle close and umount for traditional FUSE.

Yes. I would refine the FUSE-over-io_uring part like this:

I remember we discussed scheduling a BH in the IOThreads so they can
call close(2). That way there's no race between the IOThreads, which
are still using the fds, and the main loop thread, which is in
shutdown().

It sounds like the main loop thread should only umount once all
IOThreads have closed their fds. The IOThreads will need to notify the
main loop thread when they are done. An async callback in the main loop
thread will invoke umount and drop the reference to the export. Then
delete() will finally be called.

If someone can think of a way to achieve the same thing with less
synchronization, that would be simpler. But if not, then I think we
need this for correctness (to avoid the race with IOThreads still
using the fd).

Stefan
On Wed, Sep 17, 2025 at 06:06:55PM -0400, Brian Song wrote:
> So ideally, we should close each queue FD first, then call umount at
> the end, even though calling umount directly also works. The root
> issue is that the kernel doesn't provide an interface to cancel
> already submitted SQEs.

Hi Bernd,

I wanted to check with you to see if you have thought more about
ASYNC_CANCEL support for FUSE-over-io_uring SQEs? If you don't have
time to implement it, maybe you could share your thoughts on how one
would go about doing this? That would be a nice starting point if
someone else wants to try it out.

Thanks,
Stefan
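If the kernel gained ASYNC_CANCEL support for these uring commands, the
userspace side could plausibly look like the liburing snippet below.
io_uring_get_sqe(), io_uring_prep_cancel64() and io_uring_submit() are
real liburing calls; today the kernel only completes
FUSE_IO_URING_CMD_COMMIT_AND_FETCH SQEs on unmount/abort, so this is
purely illustrative:

/* Request cancellation of a previously submitted SQE, identified by
 * its user_data. The cancel SQE itself completes with 0, -ENOENT or
 * -EALREADY; the cancelled request then completes with -ECANCELED. */
struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
io_uring_prep_cancel64(sqe, pending_user_data, 0);
io_uring_submit(&ring);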
On 9/9/25 3:33 PM, Stefan Hajnoczi wrote:
> Simply setting q->fuse_fd to -1 here doesn't fix the race. It would be
> necessary to stop processing fuse_fd in the thread before closing it
> here or to schedule a BH in each thread so that fuse_fd can be closed
> in the thread that uses the fd.

I get what you mean. This newly introduced cleanup code was originally
in the deletion section, after the reference count decreased to 0, and
it was meant to cancel the pending SQEs. But now we've moved it to the
shutdown section, which may introduce a potential problem. How do you
think we should fix it? This is the last week of GSoC, and I'm already
busy on weekdays since the new term has started.
On Tue, Sep 09, 2025 at 04:51:32PM -0400, Brian Song wrote:
> I get what you mean. This newly introduced cleanup code was originally
> in the deletion section, after the reference count decreased to 0, and
> it was meant to cancel the pending SQEs. But now we've moved it to the
> shutdown section, which may introduce a potential problem. How do you
> think we should fix it? This is the last week of GSoC, and I'm already
> busy on weekdays since the new term has started.

Hi Brian,

Two issues:

1. Change of behavior for non-io_uring code. It would be safer to keep
   the old behavior for non-io_uring code.

2. The race condition. Schedule a BH in each queue's IOThread and call
   close(fuse_fd) from the BH function. That way there is no race
   between threads.

Stefan