From: Brian Song <hibriansong@gmail.com>
This patch adds a new export option for storage-export-daemon to enable
or disable FUSE-over-io_uring via the io-uring=on|off switch (disabled
by default). It also implements the protocol handshake with the Linux
kernel during the FUSE-over-io_uring initialization phase.
See: https://docs.kernel.org/filesystems/fuse-io-uring.html
The kernel documentation describes in detail how FUSE-over-io_uring
works. This patch implements the Initial SQE stage shown in the diagram:
it initializes one queue per IOThread, each currently supporting a
single submission queue entry (SQE). When the FUSE driver sends the
first FUSE request (FUSE_INIT), storage-export-daemon calls
fuse_uring_start() to complete initialization, ultimately submitting
the SQE with the FUSE_IO_URING_CMD_REGISTER command to confirm
successful initialization with the kernel.
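For illustration, an invocation that enables the option could look like
this (a sketch only: disk.img and the node/export names are placeholders,
and the FUSE mountpoint must be an existing regular file):

    qemu-storage-daemon \
        --blockdev driver=file,node-name=disk0,filename=disk.img \
        --export type=fuse,id=exp0,node-name=disk0,mountpoint=mnt-file,io-uring=on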
Suggested-by: Kevin Wolf <kwolf@redhat.com>
Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Brian Song <hibriansong@gmail.com>
---
block/export/fuse.c | 161 ++++++++++++++++++++++++---
docs/tools/qemu-storage-daemon.rst | 11 +-
qapi/block-export.json | 5 +-
storage-daemon/qemu-storage-daemon.c | 1 +
util/fdmon-io_uring.c | 5 +-
5 files changed, 159 insertions(+), 24 deletions(-)
diff --git a/block/export/fuse.c b/block/export/fuse.c
index c0ad4696ce..59fa79f486 100644
--- a/block/export/fuse.c
+++ b/block/export/fuse.c
@@ -48,6 +48,11 @@
#include <linux/fs.h>
#endif
+#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
+
+/* room needed in buffer to accommodate header */
+#define FUSE_BUFFER_HEADER_SIZE 0x1000
+
/* Prevent overly long bounce buffer allocations */
#define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 1 * 1024 * 1024))
/*
@@ -63,12 +68,31 @@
(FUSE_MAX_WRITE_BYTES - FUSE_IN_PLACE_WRITE_BYTES)
typedef struct FuseExport FuseExport;
+typedef struct FuseQueue FuseQueue;
+
+typedef struct FuseRingEnt {
+ /* back pointer */
+ FuseQueue *q;
+
+ /* commit id of a fuse request */
+ uint64_t req_commit_id;
+
+ /* fuse request header and payload */
+ struct fuse_uring_req_header req_header;
+ void *op_payload;
+ size_t req_payload_sz;
+
+ /* The vector passed to the kernel */
+ struct iovec iov[2];
+
+ CqeHandler fuse_cqe_handler;
+} FuseRingEnt;
/*
* One FUSE "queue", representing one FUSE FD from which requests are fetched
* and processed. Each queue is tied to an AioContext.
*/
-typedef struct FuseQueue {
+struct FuseQueue {
FuseExport *exp;
AioContext *ctx;
@@ -109,7 +133,12 @@ typedef struct FuseQueue {
* Free this buffer with qemu_vfree().
*/
void *spillover_buf;
-} FuseQueue;
+
+#ifdef CONFIG_LINUX_IO_URING
+ int qid;
+ FuseRingEnt ent;
+#endif
+};
/*
* Verify that FuseQueue.request_buf plus the spill-over buffer together
@@ -148,6 +177,7 @@ struct FuseExport {
bool growable;
/* Whether allow_other was used as a mount option or not */
bool allow_other;
+ bool is_uring;
mode_t st_mode;
uid_t st_uid;
@@ -257,6 +287,93 @@ static const BlockDevOps fuse_export_blk_dev_ops = {
.drained_poll = fuse_export_drained_poll,
};
+#ifdef CONFIG_LINUX_IO_URING
+
+static void fuse_uring_sqe_set_req_data(struct fuse_uring_cmd_req *req,
+ const unsigned int qid,
+ const unsigned int commit_id)
+{
+ req->qid = qid;
+ req->commit_id = commit_id;
+ req->flags = 0;
+}
+
+static void fuse_uring_sqe_prepare(struct io_uring_sqe *sqe, FuseQueue *q,
+ __u32 cmd_op)
+{
+ sqe->opcode = IORING_OP_URING_CMD;
+
+ sqe->fd = q->fuse_fd;
+ sqe->rw_flags = 0;
+ sqe->ioprio = 0;
+ sqe->off = 0;
+
+ sqe->cmd_op = cmd_op;
+ sqe->__pad1 = 0;
+}
+
+static void fuse_uring_prep_sqe_register(struct io_uring_sqe *sqe, void *opaque)
+{
+ FuseQueue *q = opaque;
+ struct fuse_uring_cmd_req *req = (void *)&sqe->cmd[0];
+
+ fuse_uring_sqe_prepare(sqe, q, FUSE_IO_URING_CMD_REGISTER);
+
+ sqe->addr = (uint64_t)(q->ent.iov);
+ sqe->len = 2;
+
+ fuse_uring_sqe_set_req_data(req, q->qid, 0);
+}
+
+static void fuse_uring_submit_register(void *opaque)
+{
+ FuseQueue *q = opaque;
+ FuseExport *exp = q->exp;
+
+ aio_add_sqe(fuse_uring_prep_sqe_register, q, &(q->ent.fuse_cqe_handler));
+}
+
+static void fuse_uring_start(FuseExport *exp, struct fuse_init_out *out)
+{
+ /*
+ * Since we didn't enable the FUSE_MAX_PAGES feature, the value of
+ * fc->max_pages should be FUSE_DEFAULT_MAX_PAGES_PER_REQ, which is set by
+ * the kernel by default. Also, max_write should not exceed
+ * FUSE_DEFAULT_MAX_PAGES_PER_REQ * PAGE_SIZE.
+ */
+ size_t bufsize = out->max_write + FUSE_BUFFER_HEADER_SIZE;
+
+ if (!(out->flags & FUSE_MAX_PAGES)) {
+ bufsize = FUSE_DEFAULT_MAX_PAGES_PER_REQ * qemu_real_host_page_size()
+ + FUSE_BUFFER_HEADER_SIZE;
+ }
+
+ for (int i = 0; i < exp->num_queues; i++) {
+ FuseQueue *q = &exp->queues[i];
+ FuseRingEnt *ent = &q->ent;
+
+ ent->q = q;
+
+ ent->req_payload_sz = bufsize - FUSE_BUFFER_HEADER_SIZE;
+ ent->op_payload = g_malloc0(ent->req_payload_sz);
+
+ ent->iov[0] = (struct iovec) {
+ &(ent->req_header),
+ sizeof(struct fuse_uring_req_header)
+ };
+ ent->iov[1] = (struct iovec) {
+ ent->op_payload,
+ ent->req_payload_sz
+ };
+
+ ent->fuse_cqe_handler.cb = fuse_uring_cqe_handler;
+
+ aio_bh_schedule_oneshot(q->ctx, fuse_uring_submit_register, q);
+ }
+}
+#endif
+
static int fuse_export_create(BlockExport *blk_exp,
BlockExportOptions *blk_exp_args,
AioContext *const *multithread,
@@ -280,6 +397,9 @@ static int fuse_export_create(BlockExport *blk_exp,
for (size_t i = 0; i < mt_count; i++) {
exp->queues[i] = (FuseQueue) {
+#ifdef CONFIG_LINUX_IO_URING
+ .qid = i,
+#endif
.exp = exp,
.ctx = multithread[i],
.fuse_fd = -1,
@@ -293,6 +413,9 @@ static int fuse_export_create(BlockExport *blk_exp,
exp->num_queues = 1;
exp->queues = g_new(FuseQueue, 1);
exp->queues[0] = (FuseQueue) {
+#ifdef CONFIG_LINUX_IO_URING
+ .qid = 0,
+#endif
.exp = exp,
.ctx = exp->common.ctx,
.fuse_fd = -1,
@@ -312,6 +435,8 @@ static int fuse_export_create(BlockExport *blk_exp,
}
}
+ exp->is_uring = args->io_uring ? true : false;
+
blk_set_dev_ops(exp->common.blk, &fuse_export_blk_dev_ops, exp);
/*
@@ -687,15 +812,22 @@ static ssize_t coroutine_fn
fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
uint32_t max_readahead, uint32_t flags)
{
- const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO;
+ const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO
+ | FUSE_INIT_EXT;
+ uint64_t outargflags = flags;
+
+#ifdef CONFIG_LINUX_IO_URING
+    if (exp->is_uring) {
+        outargflags |= FUSE_OVER_IO_URING;
+    }
+#endif
*out = (struct fuse_init_out) {
.major = FUSE_KERNEL_VERSION,
.minor = FUSE_KERNEL_MINOR_VERSION,
.max_readahead = max_readahead,
.max_write = FUSE_MAX_WRITE_BYTES,
- .flags = flags & supported_flags,
- .flags2 = 0,
+ .flags = outargflags & supported_flags,
+ .flags2 = outargflags >> 32,
/* libfuse maximum: 2^16 - 1 */
.max_background = UINT16_MAX,
@@ -1393,22 +1525,17 @@ fuse_co_process_request(FuseQueue *q, void *spillover_buf)
struct fuse_out_header *out_hdr = (struct fuse_out_header *)out_buf;
/* For read requests: Data to be returned */
void *out_data_buffer = NULL;
- ssize_t ret;
- /* Limit scope to ensure pointer is no longer used after yielding */
- {
- const struct fuse_in_header *in_hdr =
- (const struct fuse_in_header *)q->request_buf;
-
- opcode = in_hdr->opcode;
- req_id = in_hdr->unique;
- }
+ bool is_uring = exp->is_uring;
switch (opcode) {
case FUSE_INIT: {
- const struct fuse_init_in *in = FUSE_IN_OP_STRUCT(init, q);
- ret = fuse_co_init(exp, FUSE_OUT_OP_STRUCT(init, out_buf),
- in->max_readahead, in->flags);
+#ifdef CONFIG_LINUX_IO_URING
+        /* FUSE-over-io_uring enabled && FUSE_INIT arrived via the traditional path */
+ if (is_uring) {
+ fuse_uring_start(exp, out);
+ }
+#endif
break;
}
diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst
index 35ab2d7807..c5076101e0 100644
--- a/docs/tools/qemu-storage-daemon.rst
+++ b/docs/tools/qemu-storage-daemon.rst
@@ -78,7 +78,7 @@ Standard options:
.. option:: --export [type=]nbd,id=<id>,node-name=<node-name>[,name=<export-name>][,writable=on|off][,bitmap=<name>]
--export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=unix,addr.path=<socket-path>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
--export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=fd,addr.str=<fd>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
- --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto]
+ --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto][,io-uring=on|off]
--export [type=]vduse-blk,id=<id>,node-name=<node-name>,name=<vduse-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>][,serial=<serial-number>]
is a block export definition. ``node-name`` is the block node that should be
@@ -111,10 +111,11 @@ Standard options:
that enabling this option as a non-root user requires enabling the
user_allow_other option in the global fuse.conf configuration file. Setting
``allow-other`` to auto (the default) will try enabling this option, and on
- error fall back to disabling it.
-
- The ``vduse-blk`` export type takes a ``name`` (must be unique across the host)
- to create the VDUSE device.
+   error fall back to disabling it. When ``io-uring`` is enabled (off by
+   default), the export bypasses the traditional /dev/fuse communication
+   channel and instead uses io_uring to handle FUSE requests.
+   The ``vduse-blk`` export type takes a ``name`` (must be unique across
+   the host) to create the VDUSE device.
``num-queues`` sets the number of virtqueues (the default is 1).
``queue-size`` sets the virtqueue descriptor table size (the default is 256).
diff --git a/qapi/block-export.json b/qapi/block-export.json
index 9ae703ad01..37f2fc47e2 100644
--- a/qapi/block-export.json
+++ b/qapi/block-export.json
@@ -184,12 +184,15 @@
# mount the export with allow_other, and if that fails, try again
# without. (since 6.1; default: auto)
#
+# @io-uring: Use FUSE-over-io_uring. (since 10.2; default: false)
+#
# Since: 6.0
##
{ 'struct': 'BlockExportOptionsFuse',
'data': { 'mountpoint': 'str',
'*growable': 'bool',
- '*allow-other': 'FuseExportAllowOther' },
+            '*allow-other': 'FuseExportAllowOther',
+            '*io-uring': 'bool' },
'if': 'CONFIG_FUSE' }
##
diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c
index eb72561358..0cd4cd2b58 100644
--- a/storage-daemon/qemu-storage-daemon.c
+++ b/storage-daemon/qemu-storage-daemon.c
@@ -107,6 +107,7 @@ static void help(void)
#ifdef CONFIG_FUSE
" --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>\n"
" [,growable=on|off][,writable=on|off][,allow-other=on|off|auto]\n"
+" [,io-uring=on|off]"
" export the specified block node over FUSE\n"
"\n"
#endif /* CONFIG_FUSE */
diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
index d2433d1d99..68d3fe8e01 100644
--- a/util/fdmon-io_uring.c
+++ b/util/fdmon-io_uring.c
@@ -452,10 +452,13 @@ static const FDMonOps fdmon_io_uring_ops = {
void fdmon_io_uring_setup(AioContext *ctx, Error **errp)
{
int ret;
+ int flags;
ctx->io_uring_fd_tag = NULL;
+ flags = IORING_SETUP_SQE128;
- ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
+ ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES,
+ &ctx->fdmon_io_uring, flags);
if (ret != 0) {
error_setg_errno(errp, -ret, "Failed to initialize io_uring");
return;
--
2.45.2
On 8/14/25 11:46 PM, Brian Song wrote:
> [...]
>
> +static void fuse_uring_submit_register(void *opaque)
> +{
> + FuseQueue *q = opaque;
> + FuseExport *exp = q->exp;
> +
> +
> + aio_add_sqe(fuse_uring_prep_sqe_register, q, &(q->ent.fuse_cqe_handler));
I think there might be a tricky issue with the io_uring integration in
QEMU. Currently, when the number of IOThreads goes above ~6 or 7,
there's a pretty high chance of a hang. I added some debug logging to
the registration part of the kernel's fuse_uring_cmd(), and noticed that
the number of register calls is less than the total number of entries
across the queues. In theory, we should be registering every entry of
every queue.

On the userspace side, everything seems normal: the number of
aio_add_sqe() calls matches the number of IOThreads. But here's the
weird part: if I add a printf inside the while loop in
fdmon-io_uring.c::fdmon_io_uring_wait(), suddenly everything works fine,
and the kernel receives registration requests for all entries as expected.

    do {
        ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
        fprintf(stderr, "io_uring_submit_and_wait ret: %d\n", ret);
    } while (ret == -EINTR);

My guess is that the printf is just slowing down the loop, or maybe
there's some implicit memory barrier happening. Obviously, the right fix
isn't to sprinkle fprintfs around; I suspect there is a subtle
synchronization/race issue here.
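One way to separate "the printf slows the loop down" from "the printf
acts as a barrier" might be to substitute an explicit barrier for the
fprintf(). A minimal sketch, assuming QEMU's smp_mb() from qemu/atomic.h:

    do {
        ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
        /* full barrier in place of the fprintf(); if this also "fixes"
         * the hang, ordering rather than timing is the likelier culprit */
        smp_mb();
    } while (ret == -EINTR);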
Brian
On 8/17/25 01:13, Brian Song wrote:
>
>
> On 8/14/25 11:46 PM, Brian Song wrote:
>> [...]
>>
>> +static void fuse_uring_submit_register(void *opaque)
>> +{
>> + FuseQueue *q = opaque;
>> + FuseExport *exp = q->exp;
>> +
>> +
>> + aio_add_sqe(fuse_uring_prep_sqe_register, q, &(q->ent.fuse_cqe_handler));
>
> I think there might be a tricky issue with the io_uring integration in
> QEMU. Currently, when the number of IOThreads goes above ~6 or 7,
> there’s a pretty high chance of a hang. I added some debug logging in
> the kernel’s fuse_uring_cmd() registration part, and noticed that the
> number of register calls is less than the total number of entries in the
> queue. In theory, we should be registering each entry for each queue.
Did you also try to add logging at the top of fuse_uring_cmd()? I wonder
if there is a start-up race and if initial commands are just getting
refused. I had run into the issues you are describing in some versions of
the -rfc patches, but thought that everything was fixed for that.
I.e., it's not excluded that there is still a kernel issue left.
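E.g. a minimal probe at the very top of fuse_uring_cmd() in
fs/fuse/dev_uring.c (a sketch; the exact surrounding context may differ):

    int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
    {
        u32 cmd_op = cmd->cmd_op;

        pr_info("%s: cmd_op=%u\n", __func__, cmd_op);
        /* ... existing body unchanged ... */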
Thanks,
Bernd
On 8/18/25 7:04 PM, Bernd Schubert wrote:
>
>
> On 8/17/25 01:13, Brian Song wrote:
>>
>>
>> On 8/14/25 11:46 PM, Brian Song wrote:
>>> [...]
>>>
>>> +static void fuse_uring_submit_register(void *opaque)
>>> +{
>>> + FuseQueue *q = opaque;
>>> + FuseExport *exp = q->exp;
>>> +
>>> +
>>> + aio_add_sqe(fuse_uring_prep_sqe_register, q, &(q->ent.fuse_cqe_handler));
>>
>> I think there might be a tricky issue with the io_uring integration in
>> QEMU. Currently, when the number of IOThreads goes above ~6 or 7,
>> there’s a pretty high chance of a hang. I added some debug logging in
>> the kernel’s fuse_uring_cmd() registration part, and noticed that the
>> number of register calls is less than the total number of entries in the
>> queue. In theory, we should be registering each entry for each queue.
>
> Did you also try to add logging at the top of fuse_uring_cmd()? I wonder
> if there is a start up race and if initial commands are just getting
> refused. I had run into issues you are describing in some versions of
> the -rfc patches, but thought that everything was fixed for that.
> I.e. not excluded that there is still a kernel issue left.
>
> Thanks,
> Bernd
>
>
Yes. I added a printk at the beginning of fuse_uring_cmd(), another at
the beginning of fuse_uring_register(), and one more at the end of
fuse_uring_do_register(). Then I created and registered 20 queues, each
with a single ring entry. The first probe fired 37 times (the count
differs on every run) with opcode FUSE_IO_URING_CMD_REGISTER (I would
expect 20), and only 6 queues were registered successfully. The
remaining 31 calls to fuse_uring_cmd() exited inside its
if (!fc->initialized) branch.
dmesg: https://gist.github.com/hibriansong/4eda6e7e92601df497282dcd56fd5470
Thanks,
Brian
On 8/19/25 03:15, Brian Song wrote:
>
>
> On 8/18/25 7:04 PM, Bernd Schubert wrote:
>>
>>
>> On 8/17/25 01:13, Brian Song wrote:
>>>
>>>
>>> On 8/14/25 11:46 PM, Brian Song wrote:
>>>> [...]
>>>>
>>>> +static void fuse_uring_submit_register(void *opaque)
>>>> +{
>>>> + FuseQueue *q = opaque;
>>>> + FuseExport *exp = q->exp;
>>>> +
>>>> +
>>>> + aio_add_sqe(fuse_uring_prep_sqe_register, q, &(q->ent.fuse_cqe_handler));
>>>
>>> I think there might be a tricky issue with the io_uring integration in
>>> QEMU. Currently, when the number of IOThreads goes above ~6 or 7,
>>> there’s a pretty high chance of a hang. I added some debug logging in
>>> the kernel’s fuse_uring_cmd() registration part, and noticed that the
>>> number of register calls is less than the total number of entries in the
>>> queue. In theory, we should be registering each entry for each queue.
>>
>> Did you also try to add logging at the top of fuse_uring_cmd()? I wonder
>> if there is a start up race and if initial commands are just getting
>> refused. I had run into issues you are describing in some versions of
>> the -rfc patches, but thought that everything was fixed for that.
>> I.e. not excluded that there is still a kernel issue left.
>>
>> Thanks,
>> Bernd
>>
>>
>
> Yes. I added a printk at the beginning of fuse_uring_cmd(), another at
> the beginning of fuse_uring_register(), and one more at the end of
> fuse_uring_do_register(). Then I created and registered 20 queues, each
> with a single ring entry. It printed 37 times(diff every time) with
> opcode FUSE_IO_URING_CMD_REGISTER (would expect 20), and only 6 queues
> were registered successfully. The rest of fuse_uring_cmd (x31) exited
> inside the if (!fc->initialized) branch in fuse_uring_cmd()
>
> dmesg: https://gist.github.com/hibriansong/4eda6e7e92601df497282dcd56fd5470
Thank you for the logs, could you try this?
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index 2aa20707f40b..cea57ad5d3ab 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -1324,6 +1324,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
if (!fc->connected)
return -ENOTCONN;
+ /* Matches smp_wmb() in fuse_set_initialized() */
+ smp_rmb();
+
/*
* fuse_uring_register() needs the ring to be initialized,
* we need to know the max payload size
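(For context, the write side this pairs with is fuse_set_initialized()
in fs/fuse/dev.c, which looks like this:)

    static void fuse_set_initialized(struct fuse_conn *fc)
    {
            /* Make sure stores before this are seen on another CPU */
            smp_wmb();
            fc->initialized = 1;
    }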
Thanks,
Bernd
On 8/19/25 6:26 PM, Bernd Schubert wrote:
>
>
> On 8/19/25 03:15, Brian Song wrote:
>>
>>
>> On 8/18/25 7:04 PM, Bernd Schubert wrote:
>>>
>>>
>>> On 8/17/25 01:13, Brian Song wrote:
>>>>
>>>>
>>>> On 8/14/25 11:46 PM, Brian Song wrote:
>>>>> [...]
>>>>>
>>>>> +static void fuse_uring_submit_register(void *opaque)
>>>>> +{
>>>>> + FuseQueue *q = opaque;
>>>>> + FuseExport *exp = q->exp;
>>>>> +
>>>>> +
>>>>> + aio_add_sqe(fuse_uring_prep_sqe_register, q, &(q->ent.fuse_cqe_handler));
>>>>
>>>> I think there might be a tricky issue with the io_uring integration in
>>>> QEMU. Currently, when the number of IOThreads goes above ~6 or 7,
>>>> there’s a pretty high chance of a hang. I added some debug logging in
>>>> the kernel’s fuse_uring_cmd() registration part, and noticed that the
>>>> number of register calls is less than the total number of entries in the
>>>> queue. In theory, we should be registering each entry for each queue.
>>>
>>> Did you also try to add logging at the top of fuse_uring_cmd()? I wonder
>>> if there is a start up race and if initial commands are just getting
>>> refused. I had run into issues you are describing in some versions of
>>> the -rfc patches, but thought that everything was fixed for that.
>>> I.e. not excluded that there is still a kernel issue left.
>>>
>>> Thanks,
>>> Bernd
>>>
>>>
>>
>> Yes. I added a printk at the beginning of fuse_uring_cmd(), another at
>> the beginning of fuse_uring_register(), and one more at the end of
>> fuse_uring_do_register(). Then I created and registered 20 queues, each
>> with a single ring entry. It printed 37 times(diff every time) with
>> opcode FUSE_IO_URING_CMD_REGISTER (would expect 20), and only 6 queues
>> were registered successfully. The rest of fuse_uring_cmd (x31) exited
>> inside the if (!fc->initialized) branch in fuse_uring_cmd()
>>
>> dmesg: https://gist.github.com/hibriansong/4eda6e7e92601df497282dcd56fd5470
>
> Thank you for the logs, could you try this?
>
> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
> index 2aa20707f40b..cea57ad5d3ab 100644
> --- a/fs/fuse/dev_uring.c
> +++ b/fs/fuse/dev_uring.c
> @@ -1324,6 +1324,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
> if (!fc->connected)
> return -ENOTCONN;
>
> + /* Matches smp_wmb() in fuse_set_initialized() */
> + smp_rmb();
> +
> /*
> * fuse_uring_register() needs the ring to be initialized,
> * we need to know the max payload size
>
>
>
> Thanks,
> Bernd
I realized the issue actually comes from how QEMU handles the FUSE_INIT
request: after processing outargs, I didn't send the response back to
the kernel before starting the FUSE-over-io_uring initialization. So
it's possible that the 20 registration requests submitted via
io_uring_cmd() reach the kernel before process_init_reply() has run and
set fc->initialized = 1, which causes fuse_uring_cmd() to bail out
repeatedly.

I also noticed that in libfuse, they first send the init request
response, then allocate queues and submit the register SQEs. But even
there, during the FUSE-over-io_uring init after sending the response, if
the kernel hasn't finished process_init_reply() and set fc->initialized
= 1, wouldn't they run into a similar issue, with fuse_uring_cmd()
repeatedly bailing on register requests because fc->initialized isn't
set yet?
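(In other words, the ordering the FUSE_INIT path needs is roughly the
following. This is only a sketch against this patch's
fuse_co_process_request(); reply_to_kernel() is a placeholder name for
the export's real reply path, not an actual function in fuse.c:)

    case FUSE_INIT: {
        const struct fuse_init_in *in = FUSE_IN_OP_STRUCT(init, q);
        ret = fuse_co_init(exp, FUSE_OUT_OP_STRUCT(init, out_buf),
                           in->max_readahead, in->flags);
        reply_to_kernel(q, out_hdr, ret);  /* 1. answer FUSE_INIT first */
    #ifdef CONFIG_LINUX_IO_URING
        if (exp->is_uring) {
            /* 2. only then submit the REGISTER SQEs */
            fuse_uring_start(exp, FUSE_OUT_OP_STRUCT(init, out_buf));
        }
    #endif
        break;
    }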
On 8/19/25 7:23 PM, Brian Song wrote:
>
>
> On 8/19/25 6:26 PM, Bernd Schubert wrote:
>>
>>
>> On 8/19/25 03:15, Brian Song wrote:
>>>
>>>
>>> On 8/18/25 7:04 PM, Bernd Schubert wrote:
>>>>
>>>>
>>>> On 8/17/25 01:13, Brian Song wrote:
>>>>>
>>>>>
>>>>> On 8/14/25 11:46 PM, Brian Song wrote:
>>>>>> [...]
>>>>>>
>>>>>> +static void fuse_uring_submit_register(void *opaque)
>>>>>> +{
>>>>>> + FuseQueue *q = opaque;
>>>>>> + FuseExport *exp = q->exp;
>>>>>> +
>>>>>> +
>>>>>> +    aio_add_sqe(fuse_uring_prep_sqe_register, q, &(q->ent.fuse_cqe_handler));
>>>>>
>>>>> I think there might be a tricky issue with the io_uring integration in
>>>>> QEMU. Currently, when the number of IOThreads goes above ~6 or 7,
>>>>> there’s a pretty high chance of a hang. I added some debug logging in
>>>>> the kernel’s fuse_uring_cmd() registration part, and noticed that the
>>>>> number of register calls is less than the total number of entries
>>>>> in the
>>>>> queue. In theory, we should be registering each entry for each queue.
>>>>
>>>> Did you also try to add logging at the top of fuse_uring_cmd()? I
>>>> wonder
>>>> if there is a start up race and if initial commands are just getting
>>>> refused. I had run into issues you are describing in some versions of
>>>> the -rfc patches, but thought that everything was fixed for that.
>>>> I.e. not excluded that there is still a kernel issue left.
>>>>
>>>> Thanks,
>>>> Bernd
>>>>
>>>>
>>>
>>> Yes. I added a printk at the beginning of fuse_uring_cmd(), another at
>>> the beginning of fuse_uring_register(), and one more at the end of
>>> fuse_uring_do_register(). Then I created and registered 20 queues, each
>>> with a single ring entry. It printed 37 times(diff every time) with
>>> opcode FUSE_IO_URING_CMD_REGISTER (would expect 20), and only 6 queues
>>> were registered successfully. The rest of fuse_uring_cmd (x31) exited
>>> inside the if (!fc->initialized) branch in fuse_uring_cmd()
>>>
>>> dmesg: https://gist.github.com/
>>> hibriansong/4eda6e7e92601df497282dcd56fd5470
>>
>> Thank you for the logs, could you try this?
>>
>> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
>> index 2aa20707f40b..cea57ad5d3ab 100644
>> --- a/fs/fuse/dev_uring.c
>> +++ b/fs/fuse/dev_uring.c
>> @@ -1324,6 +1324,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd,
>> unsigned int issue_flags)
>> if (!fc->connected)
>> return -ENOTCONN;
>> + /* Matches smp_wmb() in fuse_set_initialized() */
>> + smp_rmb();
>> +
>> /*
>> * fuse_uring_register() needs the ring to be initialized,
>> * we need to know the max payload size
>>
>>
>>
>> Thanks,
>> Bernd
>
> I realized the issue actually comes from QEMU handling the FUSE_INIT
> request. After I processed outargs, I didn't send the response back to
> the kernel before starting the fuse-over-io_uring initialization. So
> it's possible that the 20 registration requests submitted via
> io_uring_cmd() reach the kernel before process_init_reply() has run and
> set fc->initialized = 1, which causes fuse_uring_cmd to bail out
> repeatedly.
>
> I also noticed that in libfuse, they first send the init request
> response, then allocate queues and submit the register SQEs. But even
> there, during the fuse-over-io_uring init after sending the response, if
> the kernel hasn't finished process_init_reply() and set fc->initialized
> = 1, wouldn't they run into a similar issue fuse_uring_cmd repeatedly
> bailing on register requests because fc->initialized isn't set yet?
Hi Bernd,
Never mind, I think writing to the /dev/fuse fd is blocking.
Thanks so much for your feedback!
Best,
Brian
On Sat, Aug 16, 2025 at 07:13:53PM -0400, Brian Song wrote:
>
>
> On 8/14/25 11:46 PM, Brian Song wrote:
> > [...]
> >
> > +static void fuse_uring_submit_register(void *opaque)
> > +{
> > + FuseQueue *q = opaque;
> > + FuseExport *exp = q->exp;
> > +
> > +
> > + aio_add_sqe(fuse_uring_prep_sqe_register, q, &(q->ent.fuse_cqe_handler));
>
> I think there might be a tricky issue with the io_uring integration in QEMU.
> Currently, when the number of IOThreads goes above ~6 or 7, there’s a pretty
> high chance of a hang. I added some debug logging in the kernel’s
> fuse_uring_cmd() registration part, and noticed that the number of register
> calls is less than the total number of entries in the queue. In theory, we
> should be registering each entry for each queue.
>
> On the userspace side, everything seems normal, the number of aio_add_sqe()
> calls matches the number of IOThreads. But here’s the weird part: if I add a
> printf inside the while loop in fdmon-io_uring.c::fdmon_io_uring_wait(),
> suddenly everything works fine, and the kernel receives registration
> requests for all entries as expected.
>
> do {
> ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
> fprintf(stderr, "io_uring_submit_and_wait ret: %d\n", ret);
> } while (ret == -EINTR);
>
> My guess is that printf is just slowing down the loop, or maybe there’s some
> implicit memory barrier happening. Obviously, the right fix isn’t to
> sprinkle fprintfs around. I suspect there might be a subtle
> synchronization/race issue here.
Strange, your fprintf(3) is after io_uring_submit_and_wait(3). I'm not
sure how that would influence timing because there should be num_cpus
IOThreads independently submitting 1 REGISTER uring_cmd.
Debugging ideas:
- When QEMU hangs, cat /proc/<pid>/fdinfo/<fd> for each IOThread's
io_uring file descriptor. That shows you what the kernel sees,
including the state of the SQ/CQ rings. If userspace has filled in the
SQE then the output should reflect that.
- Replace the REGISTER uring_cmd SQE with an IORING_OP_NOP SQE. This way
you eliminate FUSE and can focus purely on testing io_uring. If the
CQE is still missing then there is probably a bug in QEMU's
aio_add_sqe() API.
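A sketch of that second idea, swapping in a NOP prep callback so only
QEMU's SQE/CQE plumbing is exercised (fuse_uring_prep_sqe_nop is a
hypothetical name; io_uring_prep_nop() is the standard liburing helper):

    static void fuse_uring_prep_sqe_nop(struct io_uring_sqe *sqe, void *opaque)
    {
        /* IORING_OP_NOP completes immediately with res == 0; FUSE is not involved */
        io_uring_prep_nop(sqe);
    }

    /* in fuse_uring_submit_register(): */
    aio_add_sqe(fuse_uring_prep_sqe_nop, q, &q->ent.fuse_cqe_handler);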
Stefan