export/fuse: Add FUSE-over-io_uring for Storage Exports

[PATCH 1/4] export/fuse: add opt to enable FUSE-over-io_uring

Posted by Brian Song 4 weeks, 1 day ago

This patch adds a new export option for storage-export-daemon to enable
FUSE-over-io_uring via the switch io-uring=on|off (disabled by default).
It also implements the protocol handshake with the Linux kernel
during the FUSE-over-io_uring initialization phase.

See: https://docs.kernel.org/filesystems/fuse-io-uring.html

The kernel documentation describes in detail how FUSE-over-io_uring
works. This patch implements the Initial SQE stage shown in the diagram:
it initializes one queue per IOThread, each currently supporting a
single submission queue entry (SQE). When the FUSE driver sends the
first FUSE request (FUSE_INIT), storage-export-daemon calls
fuse_uring_start() to complete initialization, ultimately submitting
the SQE with the FUSE_IO_URING_CMD_REGISTER command to confirm
successful initialization with the kernel.

We also added support for multiple IOThreads. The current Linux kernel
requires registering $(nproc) queues when setting up FUSE-over-io_uring.
To let users customize the number of FUSE Queues (i.e., IOThreads),
we first create nproc Ring Queues as required by the kernel, then
distribute them in a round-robin manner to the FUSE Queues for
registration. In addition, to support multiple in-flight requests,
we configure each Ring Queue with FUSE_DEFAULT_RING_QUEUE_DEPTH
entries/requests.

Suggested-by: Kevin Wolf <kwolf@redhat.com>
Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Brian Song <hibriansong@gmail.com>
---
 block/export/fuse.c                  | 310 +++++++++++++++++++++++++--
 docs/tools/qemu-storage-daemon.rst   |  11 +-
 qapi/block-export.json               |   5 +-
 storage-daemon/qemu-storage-daemon.c |   1 +
 util/fdmon-io_uring.c                |   5 +-
 5 files changed, 309 insertions(+), 23 deletions(-)

diff --git a/block/export/fuse.c b/block/export/fuse.c
index c0ad4696ce..19bf9e5f74 100644
--- a/block/export/fuse.c
+++ b/block/export/fuse.c
@@ -48,6 +48,9 @@
 #include <linux/fs.h>
 #endif
 
+/* room needed in buffer to accommodate header */
+#define FUSE_BUFFER_HEADER_SIZE 0x1000
+
 /* Prevent overly long bounce buffer allocations */
 #define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 1 * 1024 * 1024))
 /*
@@ -63,12 +66,59 @@
     (FUSE_MAX_WRITE_BYTES - FUSE_IN_PLACE_WRITE_BYTES)
 
 typedef struct FuseExport FuseExport;
+typedef struct FuseQueue FuseQueue;
+
+#ifdef CONFIG_LINUX_IO_URING
+#define FUSE_DEFAULT_RING_QUEUE_DEPTH 64
+#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
+
+typedef struct FuseRingQueue FuseRingQueue;
+typedef struct FuseRingEnt {
+    /* back pointer */
+    FuseRingQueue *rq;
+
+    /* commit id of a fuse request */
+    uint64_t req_commit_id;
+
+    /* fuse request header and payload */
+    struct fuse_uring_req_header req_header;
+    void *op_payload;
+    size_t req_payload_sz;
+
+    /* The vector passed to the kernel */
+    struct iovec iov[2];
+
+    CqeHandler fuse_cqe_handler;
+} FuseRingEnt;
+
+struct FuseRingQueue {
+    int rqid;
+
+    /* back pointer */
+    FuseQueue *q;
+    FuseRingEnt *ent;
+
+    /* List entry for ring_queues */
+    QLIST_ENTRY(FuseRingQueue) next;
+};
+
+/*
+ * Round-robin distribution of ring queues across FUSE queues.
+ * This structure manages the mapping between kernel ring queues and user
+ * FUSE queues.
+ */
+typedef struct FuseRingQueueManager {
+    FuseRingQueue *ring_queues;
+    int num_ring_queues;
+    int num_fuse_queues;
+} FuseRingQueueManager;
+#endif
 
 /*
  * One FUSE "queue", representing one FUSE FD from which requests are fetched
  * and processed.  Each queue is tied to an AioContext.
  */
-typedef struct FuseQueue {
+struct FuseQueue {
     FuseExport *exp;
 
     AioContext *ctx;
@@ -109,15 +159,11 @@ typedef struct FuseQueue {
      * Free this buffer with qemu_vfree().
      */
     void *spillover_buf;
-} FuseQueue;
 
-/*
- * Verify that FuseQueue.request_buf plus the spill-over buffer together
- * are big enough to be accepted by the FUSE kernel driver.
- */
-QEMU_BUILD_BUG_ON(sizeof(((FuseQueue *)0)->request_buf) +
-                  FUSE_SPILLOVER_BUF_SIZE <
-                  FUSE_MIN_READ_BUFFER);
+#ifdef CONFIG_LINUX_IO_URING
+    QLIST_HEAD(, FuseRingQueue) ring_queue_list;
+#endif
+};
 
 struct FuseExport {
     BlockExport common;
@@ -133,7 +179,7 @@ struct FuseExport {
      */
     bool halted;
 
-    int num_queues;
+    size_t num_queues;
     FuseQueue *queues;
     /*
      * True if this export should follow the generic export's AioContext.
@@ -149,6 +195,12 @@ struct FuseExport {
     /* Whether allow_other was used as a mount option or not */
     bool allow_other;
 
+#ifdef CONFIG_LINUX_IO_URING
+    bool is_uring;
+    size_t ring_queue_depth;
+    FuseRingQueueManager *ring_queue_manager;
+#endif
+
     mode_t st_mode;
     uid_t st_uid;
     gid_t st_gid;
@@ -205,7 +257,7 @@ static void fuse_attach_handlers(FuseExport *exp)
         return;
     }
 
-    for (int i = 0; i < exp->num_queues; i++) {
+    for (size_t i = 0; i < exp->num_queues; i++) {
         aio_set_fd_handler(exp->queues[i].ctx, exp->queues[i].fuse_fd,
                            read_from_fuse_fd, NULL, NULL, NULL,
                            &exp->queues[i]);
@@ -257,6 +309,189 @@ static const BlockDevOps fuse_export_blk_dev_ops = {
     .drained_poll  = fuse_export_drained_poll,
 };
 
+#ifdef CONFIG_LINUX_IO_URING
+static void fuse_uring_sqe_set_req_data(struct fuse_uring_cmd_req *req,
+                    const unsigned int rqid,
+                    const unsigned int commit_id)
+{
+    req->qid = rqid;
+    req->commit_id = commit_id;
+    req->flags = 0;
+}
+
+static void fuse_uring_sqe_prepare(struct io_uring_sqe *sqe, FuseQueue *q,
+               __u32 cmd_op)
+{
+    sqe->opcode = IORING_OP_URING_CMD;
+
+    sqe->fd = q->fuse_fd;
+    sqe->rw_flags = 0;
+    sqe->ioprio = 0;
+    sqe->off = 0;
+
+    sqe->cmd_op = cmd_op;
+    sqe->__pad1 = 0;
+}
+
+static void fuse_uring_prep_sqe_register(struct io_uring_sqe *sqe, void *opaque)
+{
+    FuseRingEnt *ent = opaque;
+    struct fuse_uring_cmd_req *req = (void *)&sqe->cmd[0];
+
+    fuse_uring_sqe_prepare(sqe, ent->rq->q, FUSE_IO_URING_CMD_REGISTER);
+
+    sqe->addr = (uint64_t)(ent->iov);
+    sqe->len = 2;
+
+    fuse_uring_sqe_set_req_data(req, ent->rq->rqid, 0);
+}
+
+static void fuse_uring_submit_register(void *opaque)
+{
+    FuseRingEnt *ent = opaque;
+    FuseExport *exp = ent->rq->q->exp;
+
+
+    aio_add_sqe(fuse_uring_prep_sqe_register, ent, &(ent->fuse_cqe_handler));
+}
+
+/**
+ * Distribute ring queues across FUSE queues using round-robin algorithm.
+ * This ensures even distribution of kernel ring queues across user-specified
+ * FUSE queues.
+ */
+static
+FuseRingQueueManager *fuse_ring_queue_manager_create(int num_fuse_queues,
+                                                    size_t ring_queue_depth,
+                                                    size_t bufsize)
+{
+    int num_ring_queues = get_nprocs();
+    FuseRingQueueManager *manager = g_new(FuseRingQueueManager, 1);
+
+    if (!manager) {
+        return NULL;
+    }
+
+    manager->ring_queues = g_new(FuseRingQueue, num_ring_queues);
+    manager->num_ring_queues = num_ring_queues;
+    manager->num_fuse_queues = num_fuse_queues;
+
+    if (!manager->ring_queues) {
+        g_free(manager);
+        return NULL;
+    }
+
+    for (int i = 0; i < num_ring_queues; i++) {
+        FuseRingQueue *rq = &manager->ring_queues[i];
+        rq->rqid = i;
+        rq->ent = g_new(FuseRingEnt, ring_queue_depth);
+
+        if (!rq->ent) {
+            for (int j = 0; j < i; j++) {
+                g_free(manager->ring_queues[j].ent);
+            }
+            g_free(manager->ring_queues);
+            g_free(manager);
+            return NULL;
+        }
+
+        for (size_t j = 0; j < ring_queue_depth; j++) {
+            FuseRingEnt *ent = &rq->ent[j];
+            ent->rq = rq;
+            ent->req_payload_sz = bufsize - FUSE_BUFFER_HEADER_SIZE;
+            ent->op_payload = g_malloc0(ent->req_payload_sz);
+
+            if (!ent->op_payload) {
+                for (size_t k = 0; k < j; k++) {
+                    g_free(rq->ent[k].op_payload);
+                }
+                g_free(rq->ent);
+                for (int k = 0; k < i; k++) {
+                    g_free(manager->ring_queues[k].ent);
+                }
+                g_free(manager->ring_queues);
+                g_free(manager);
+                return NULL;
+            }
+
+            ent->iov[0] = (struct iovec) {
+                &(ent->req_header),
+                sizeof(struct fuse_uring_req_header)
+            };
+            ent->iov[1] = (struct iovec) {
+                ent->op_payload,
+                ent->req_payload_sz
+            };
+
+            ent->fuse_cqe_handler.cb = fuse_uring_cqe_handler;
+        }
+    }
+
+    return manager;
+}
+
+static
+void fuse_distribute_ring_queues(FuseExport *exp, FuseRingQueueManager *manager)
+{
+    int queue_index = 0;
+
+    for (int i = 0; i < manager->num_ring_queues; i++) {
+        FuseRingQueue *rq = &manager->ring_queues[i];
+
+        rq->q = &exp->queues[queue_index];
+        QLIST_INSERT_HEAD(&(rq->q->ring_queue_list), rq, next);
+
+        queue_index = (queue_index + 1) % manager->num_fuse_queues;
+    }
+}
+
+static
+void fuse_schedule_ring_queue_registrations(FuseExport *exp,
+                                            FuseRingQueueManager *manager)
+{
+    for (int i = 0; i < manager->num_fuse_queues; i++) {
+        FuseQueue *q = &exp->queues[i];
+        FuseRingQueue *rq;
+
+        QLIST_FOREACH(rq, &q->ring_queue_list, next) {
+            for (int j = 0; j < exp->ring_queue_depth; j++) {
+                aio_bh_schedule_oneshot(q->ctx, fuse_uring_submit_register,
+                                        &(rq->ent[j]));
+            }
+        }
+    }
+}
+
+static void fuse_uring_start(FuseExport *exp, struct fuse_init_out *out)
+{
+    /*
+     * Since we didn't enable the FUSE_MAX_PAGES feature, the value of
+     * fc->max_pages should be FUSE_DEFAULT_MAX_PAGES_PER_REQ, which is set by
+     * the kernel by default. Also, max_write should not exceed
+     * FUSE_DEFAULT_MAX_PAGES_PER_REQ * PAGE_SIZE.
+     */
+    size_t bufsize = out->max_write + FUSE_BUFFER_HEADER_SIZE;
+
+    if (!(out->flags & FUSE_MAX_PAGES)) {
+        bufsize = FUSE_DEFAULT_MAX_PAGES_PER_REQ * qemu_real_host_page_size()
+                         + FUSE_BUFFER_HEADER_SIZE;
+    }
+
+    exp->ring_queue_manager = fuse_ring_queue_manager_create(
+        exp->num_queues, exp->ring_queue_depth, bufsize);
+
+    if (!exp->ring_queue_manager) {
+        error_report("Failed to create ring queue manager");
+        return;
+    }
+
+    /* Distribute ring queues across FUSE queues using round-robin */
+    fuse_distribute_ring_queues(exp, exp->ring_queue_manager);
+
+    fuse_schedule_ring_queue_registrations(exp, exp->ring_queue_manager);
+}
+#endif
+
 static int fuse_export_create(BlockExport *blk_exp,
                               BlockExportOptions *blk_exp_args,
                               AioContext *const *multithread,
@@ -270,6 +505,11 @@ static int fuse_export_create(BlockExport *blk_exp,
 
     assert(blk_exp_args->type == BLOCK_EXPORT_TYPE_FUSE);
 
+#ifdef CONFIG_LINUX_IO_URING
+    exp->is_uring = args->io_uring;
+    exp->ring_queue_depth = FUSE_DEFAULT_RING_QUEUE_DEPTH;
+#endif
+
     if (multithread) {
         /* Guaranteed by common export code */
         assert(mt_count >= 1);
@@ -283,6 +523,10 @@ static int fuse_export_create(BlockExport *blk_exp,
                 .exp = exp,
                 .ctx = multithread[i],
                 .fuse_fd = -1,
+#ifdef CONFIG_LINUX_IO_URING
+                .ring_queue_list =
+                    QLIST_HEAD_INITIALIZER(exp->queues[i].ring_queue_list),
+#endif
             };
         }
     } else {
@@ -296,6 +540,10 @@ static int fuse_export_create(BlockExport *blk_exp,
             .exp = exp,
             .ctx = exp->common.ctx,
             .fuse_fd = -1,
+#ifdef CONFIG_LINUX_IO_URING
+            .ring_queue_list =
+                QLIST_HEAD_INITIALIZER(exp->queues[0].ring_queue_list),
+#endif
         };
     }
 
@@ -685,17 +933,39 @@ static bool is_regular_file(const char *path, Error **errp)
  */
 static ssize_t coroutine_fn
 fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
-             uint32_t max_readahead, uint32_t flags)
+             uint32_t max_readahead, const struct fuse_init_in *in)
 {
-    const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO;
+    uint64_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO
+                                     | FUSE_INIT_EXT;
+    uint64_t outargflags = 0;
+    uint64_t inargflags = in->flags;
+
+    ssize_t ret = 0;
+
+    if (inargflags & FUSE_INIT_EXT) {
+        inargflags = inargflags | (uint64_t) in->flags2 << 32;
+    }
+
+#ifdef CONFIG_LINUX_IO_URING
+    if (exp->is_uring) {
+        if (inargflags & FUSE_OVER_IO_URING) {
+            supported_flags |= FUSE_OVER_IO_URING;
+        } else {
+            exp->is_uring = false;
+            ret = -ENODEV;
+        }
+    }
+#endif
+
+    outargflags = inargflags & supported_flags;
 
     *out = (struct fuse_init_out) {
         .major = FUSE_KERNEL_VERSION,
         .minor = FUSE_KERNEL_MINOR_VERSION,
         .max_readahead = max_readahead,
         .max_write = FUSE_MAX_WRITE_BYTES,
-        .flags = flags & supported_flags,
-        .flags2 = 0,
+        .flags = outargflags,
+        .flags2 = outargflags >> 32,
 
         /* libfuse maximum: 2^16 - 1 */
         .max_background = UINT16_MAX,
@@ -717,7 +987,7 @@ fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
         .map_alignment = 0,
     };
 
-    return sizeof(*out);
+    return ret < 0 ? ret : sizeof(*out);
 }
 
 /**
@@ -1506,6 +1776,14 @@ fuse_co_process_request(FuseQueue *q, void *spillover_buf)
         fuse_write_buf_response(q->fuse_fd, req_id, out_hdr,
                                 out_data_buffer, ret);
         qemu_vfree(out_data_buffer);
+#ifdef CONFIG_LINUX_IO_URING
+    /* Handle FUSE-over-io_uring initialization */
+    if (unlikely(opcode == FUSE_INIT && exp->is_uring)) {
+        struct fuse_init_out *out =
+            (struct fuse_init_out *)FUSE_OUT_OP_STRUCT(out_buf);
+        fuse_uring_start(exp, out);
+    }
+#endif
     } else {
         fuse_write_response(q->fuse_fd, req_id, out_hdr,
                             ret < 0 ? ret : 0,
diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst
index 35ab2d7807..c5076101e0 100644
--- a/docs/tools/qemu-storage-daemon.rst
+++ b/docs/tools/qemu-storage-daemon.rst
@@ -78,7 +78,7 @@ Standard options:
 .. option:: --export [type=]nbd,id=<id>,node-name=<node-name>[,name=<export-name>][,writable=on|off][,bitmap=<name>]
   --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=unix,addr.path=<socket-path>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
   --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=fd,addr.str=<fd>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
-  --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto]
+  --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto][,io-uring=on|off]
   --export [type=]vduse-blk,id=<id>,node-name=<node-name>,name=<vduse-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>][,serial=<serial-number>]
 
   is a block export definition. ``node-name`` is the block node that should be
@@ -111,10 +111,11 @@ Standard options:
   that enabling this option as a non-root user requires enabling the
   user_allow_other option in the global fuse.conf configuration file.  Setting
   ``allow-other`` to auto (the default) will try enabling this option, and on
-  error fall back to disabling it.
-
-  The ``vduse-blk`` export type takes a ``name`` (must be unique across the host)
-  to create the VDUSE device.
+  error fall back to disabling it. Once ``io-uring`` is enabled (off by default),
+  the FUSE-over-io_uring-related settings will be initialized to bypass the
+  traditional /dev/fuse communication mechanism and instead use io_uring to
+  handle FUSE operations. The ``vduse-blk`` export type takes a ``name``
+  (must be unique across the host) to create the VDUSE device.
   ``num-queues`` sets the number of virtqueues (the default is 1).
   ``queue-size`` sets the virtqueue descriptor table size (the default is 256).
 
diff --git a/qapi/block-export.json b/qapi/block-export.json
index 9ae703ad01..37f2fc47e2 100644
--- a/qapi/block-export.json
+++ b/qapi/block-export.json
@@ -184,12 +184,15 @@
 #     mount the export with allow_other, and if that fails, try again
 #     without.  (since 6.1; default: auto)
 #
+# @io-uring: Use FUSE-over-io-uring.  (since 10.2; default: false)
+#
 # Since: 6.0
 ##
 { 'struct': 'BlockExportOptionsFuse',
   'data': { 'mountpoint': 'str',
             '*growable': 'bool',
-            '*allow-other': 'FuseExportAllowOther' },
+            '*allow-other': 'FuseExportAllowOther',
+            '*io-uring': 'bool' },
   'if': 'CONFIG_FUSE' }
 
 ##
diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c
index eb72561358..0cd4cd2b58 100644
--- a/storage-daemon/qemu-storage-daemon.c
+++ b/storage-daemon/qemu-storage-daemon.c
@@ -107,6 +107,7 @@ static void help(void)
 #ifdef CONFIG_FUSE
 "  --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>\n"
 "           [,growable=on|off][,writable=on|off][,allow-other=on|off|auto]\n"
+"           [,io-uring=on|off]"
 "                         export the specified block node over FUSE\n"
 "\n"
 #endif /* CONFIG_FUSE */
diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
index d2433d1d99..68d3fe8e01 100644
--- a/util/fdmon-io_uring.c
+++ b/util/fdmon-io_uring.c
@@ -452,10 +452,13 @@ static const FDMonOps fdmon_io_uring_ops = {
 void fdmon_io_uring_setup(AioContext *ctx, Error **errp)
 {
     int ret;
+    int flags;
 
     ctx->io_uring_fd_tag = NULL;
+    flags = IORING_SETUP_SQE128;
 
-    ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
+    ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES,
+                            &ctx->fdmon_io_uring, flags);
     if (ret != 0) {
         error_setg_errno(errp, -ret, "Failed to initialize io_uring");
         return;
-- 
2.45.2

Re: [PATCH 1/4] export/fuse: add opt to enable FUSE-over-io_uring

Posted by Kevin Wolf 1 week, 4 days ago

Am 30.08.2025 um 04:50 hat Brian Song geschrieben:
> This patch adds a new export option for storage-export-daemon to enable
> FUSE-over-io_uring via the switch io-uring=on|off (disableby default).
> It also implements the protocol handshake with the Linux kernel
> during the FUSE-over-io_uring initialization phase.
> 
> See: https://docs.kernel.org/filesystems/fuse-io-uring.html
> 
> The kernel documentation describes in detail how FUSE-over-io_uring
> works. This patch implements the Initial SQE stage shown in thediagram:
> it initializes one queue per IOThread, each currently supporting a
> single submission queue entry (SQE). When the FUSE driver sends the
> first FUSE request (FUSE_INIT), storage-export-daemon calls
> fuse_uring_start() to complete initialization, ultimately submitting
> the SQE with the FUSE_IO_URING_CMD_REGISTER command to confirm
> successful initialization with the kernel.
> 
> We also added support for multiple IOThreads. The current Linux kernel
> requires registering $(nproc) queues when setting up FUSE-over-io_uring
> To let users customize the number of FUSE Queues (i.e., IOThreads),
> we first create nproc Ring Queues as required by the kernel, then
> distribute them in a round-robin manner to the FUSE Queues for
> registration. In addition, to support multiple in-flight requests,
> we configure each Ring Queue with FUSE_DEFAULT_RING_QUEUE_DEPTH
> entries/requests.
> 
> Suggested-by: Kevin Wolf <kwolf@redhat.com>
> Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Brian Song <hibriansong@gmail.com>
> ---
>  block/export/fuse.c                  | 310 +++++++++++++++++++++++++--
>  docs/tools/qemu-storage-daemon.rst   |  11 +-
>  qapi/block-export.json               |   5 +-
>  storage-daemon/qemu-storage-daemon.c |   1 +
>  util/fdmon-io_uring.c                |   5 +-
>  5 files changed, 309 insertions(+), 23 deletions(-)
> 
> diff --git a/block/export/fuse.c b/block/export/fuse.c
> index c0ad4696ce..19bf9e5f74 100644
> --- a/block/export/fuse.c
> +++ b/block/export/fuse.c
> @@ -48,6 +48,9 @@
>  #include <linux/fs.h>
>  #endif
>  
> +/* room needed in buffer to accommodate header */
> +#define FUSE_BUFFER_HEADER_SIZE 0x1000
> +
>  /* Prevent overly long bounce buffer allocations */
>  #define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 1 * 1024 * 1024))
>  /*
> @@ -63,12 +66,59 @@
>      (FUSE_MAX_WRITE_BYTES - FUSE_IN_PLACE_WRITE_BYTES)
>  
>  typedef struct FuseExport FuseExport;
> +typedef struct FuseQueue FuseQueue;
> +
> +#ifdef CONFIG_LINUX_IO_URING
> +#define FUSE_DEFAULT_RING_QUEUE_DEPTH 64
> +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32

Maybe it would be a little clearer if the next few types had URing in
their names instead of just Ring.

> +typedef struct FuseRingQueue FuseRingQueue;
> +typedef struct FuseRingEnt {
> +    /* back pointer */
> +    FuseRingQueue *rq;
> +
> +    /* commit id of a fuse request */
> +    uint64_t req_commit_id;
> +
> +    /* fuse request header and payload */
> +    struct fuse_uring_req_header req_header;
> +    void *op_payload;
> +    size_t req_payload_sz;
> +
> +    /* The vector passed to the kernel */
> +    struct iovec iov[2];
> +
> +    CqeHandler fuse_cqe_handler;
> +} FuseRingEnt;
> +
> +struct FuseRingQueue {

It would be good to have a comment here that explains the difference
between FuseQueue and FuseRingQueue.

Is this a distinction that should remain in the long run or would we
always have a 1:1 mapping between FuseQueue and FuseRingQueue once the
pending kernel changes are merged that allow a number of uring queues
different from the number of CPUs?

> +    int rqid;
> +
> +    /* back pointer */
> +    FuseQueue *q;
> +    FuseRingEnt *ent;
> +
> +    /* List entry for ring_queues */
> +    QLIST_ENTRY(FuseRingQueue) next;
> +};
> +
> +/*
> + * Round-robin distribution of ring queues across FUSE queues.
> + * This structure manages the mapping between kernel ring queues and user
> + * FUSE queues.
> + */
> +typedef struct FuseRingQueueManager {
> +    FuseRingQueue *ring_queues;
> +    int num_ring_queues;
> +    int num_fuse_queues;
> +} FuseRingQueueManager;

This isn't a manager, it's just the set of queues the export uses.

num_fuse_queues duplicates exp->num_queues, there is no reason for it to
exist. All users also have access to the FuseExport itself.

The other two fields can just be merged directly into FuseExport,
preferably renamed to uring_queues and num_uring_queues.

> +#endif
>  
>  /*
>   * One FUSE "queue", representing one FUSE FD from which requests are fetched
>   * and processed.  Each queue is tied to an AioContext.
>   */
> -typedef struct FuseQueue {
> +struct FuseQueue {
>      FuseExport *exp;
>  
>      AioContext *ctx;
> @@ -109,15 +159,11 @@ typedef struct FuseQueue {
>       * Free this buffer with qemu_vfree().
>       */
>      void *spillover_buf;
> -} FuseQueue;
>  
> -/*
> - * Verify that FuseQueue.request_buf plus the spill-over buffer together
> - * are big enough to be accepted by the FUSE kernel driver.
> - */
> -QEMU_BUILD_BUG_ON(sizeof(((FuseQueue *)0)->request_buf) +
> -                  FUSE_SPILLOVER_BUF_SIZE <
> -                  FUSE_MIN_READ_BUFFER);
> +#ifdef CONFIG_LINUX_IO_URING
> +    QLIST_HEAD(, FuseRingQueue) ring_queue_list;
> +#endif
> +};
>  
>  struct FuseExport {
>      BlockExport common;
> @@ -133,7 +179,7 @@ struct FuseExport {
>       */
>      bool halted;
>  
> -    int num_queues;
> +    size_t num_queues;

I'm not sure why this change is needed. If it is, can it be a separate
patch before this one, with a commit message describing the reason?

>      FuseQueue *queues;
>      /*
>       * True if this export should follow the generic export's AioContext.
> @@ -149,6 +195,12 @@ struct FuseExport {
>      /* Whether allow_other was used as a mount option or not */
>      bool allow_other;
>  
> +#ifdef CONFIG_LINUX_IO_URING
> +    bool is_uring;
> +    size_t ring_queue_depth;
> +    FuseRingQueueManager *ring_queue_manager;
> +#endif
> +
>      mode_t st_mode;
>      uid_t st_uid;
>      gid_t st_gid;
> @@ -205,7 +257,7 @@ static void fuse_attach_handlers(FuseExport *exp)
>          return;
>      }
>  
> -    for (int i = 0; i < exp->num_queues; i++) {
> +    for (size_t i = 0; i < exp->num_queues; i++) {
>          aio_set_fd_handler(exp->queues[i].ctx, exp->queues[i].fuse_fd,
>                             read_from_fuse_fd, NULL, NULL, NULL,
>                             &exp->queues[i]);
> @@ -257,6 +309,189 @@ static const BlockDevOps fuse_export_blk_dev_ops = {
>      .drained_poll  = fuse_export_drained_poll,
>  };
>  
> +#ifdef CONFIG_LINUX_IO_URING
> +static void fuse_uring_sqe_set_req_data(struct fuse_uring_cmd_req *req,
> +                    const unsigned int rqid,
> +                    const unsigned int commit_id)

Indentation is off here. There are two accepted styles for indentation
after breaking a long line in QEMU (see docs/devel/style.rst):

1. Indent the next line by exactly four spaces:

    do_something(x, y,
        z);

2. Align the next line with the first character after the opening
   parenthesis:

    do_something(x, y,
                 z);

The second one is the preferred one. The first one is generally only
used when the parenthesis is already too far right and we can't do much
about it.

> +{
> +    req->qid = rqid;
> +    req->commit_id = commit_id;
> +    req->flags = 0;
> +}
> +
> +static void fuse_uring_sqe_prepare(struct io_uring_sqe *sqe, FuseQueue *q,
> +               __u32 cmd_op)

Indentation.

Another option here is to keep everything before the function name on a
separate line, like this:

static void
fuse_uring_sqe_prepare(struct io_uring_sqe *sqe, FuseQueue *q, __u32 cmd_op)

This would allow the second line to stay under 80 characters.

> +{
> +    sqe->opcode = IORING_OP_URING_CMD;
> +
> +    sqe->fd = q->fuse_fd;
> +    sqe->rw_flags = 0;
> +    sqe->ioprio = 0;
> +    sqe->off = 0;
> +
> +    sqe->cmd_op = cmd_op;
> +    sqe->__pad1 = 0;
> +}
> +
> +static void fuse_uring_prep_sqe_register(struct io_uring_sqe *sqe, void *opaque)
> +{
> +    FuseRingEnt *ent = opaque;
> +    struct fuse_uring_cmd_req *req = (void *)&sqe->cmd[0];
> +
> +    fuse_uring_sqe_prepare(sqe, ent->rq->q, FUSE_IO_URING_CMD_REGISTER);
> +
> +    sqe->addr = (uint64_t)(ent->iov);
> +    sqe->len = 2;
> +
> +    fuse_uring_sqe_set_req_data(req, ent->rq->rqid, 0);
> +}
> +
> +static void fuse_uring_submit_register(void *opaque)
> +{
> +    FuseRingEnt *ent = opaque;
> +    FuseExport *exp = ent->rq->q->exp;
> +
> +

Extra empty line.

> +    aio_add_sqe(fuse_uring_prep_sqe_register, ent, &(ent->fuse_cqe_handler));

The parentheses around ent->fuse_cqe_handler are unnecessary.

> +}
> +
> +/**
> + * Distribute ring queues across FUSE queues using round-robin algorithm.

Hm, if this function distributes (u)ring queues, then what is
fuse_distribute_ring_queues() doing? Is the term overloaded with two
meanings?

> + * This ensures even distribution of kernel ring queues across user-specified
> + * FUSE queues.
> + */
> +static
> +FuseRingQueueManager *fuse_ring_queue_manager_create(int num_fuse_queues,
> +                                                    size_t ring_queue_depth,
> +                                                    size_t bufsize)

The right style here would be something like:

static FuseRingQueueManager *
fuse_ring_queue_manager_create(int num_fuse_queues,
                               size_t ring_queue_depth,
                               size_t bufsize)

Given that I said that there is no reason to call the set of all queues
a manager, or to even have it separate from FuseExport, this probably
becomes fuse_uring_setup_queues() or something.

> +{
> +    int num_ring_queues = get_nprocs();

This could use a comment saying that this is a kernel requirement at the
moment.

> +    FuseRingQueueManager *manager = g_new(FuseRingQueueManager, 1);
> +
> +    if (!manager) {
> +        return NULL;
> +    }

g_new() never returns NULL, it aborts on error instead, so no reason to
have a NULL check here.

> +
> +    manager->ring_queues = g_new(FuseRingQueue, num_ring_queues);
> +    manager->num_ring_queues = num_ring_queues;
> +    manager->num_fuse_queues = num_fuse_queues;
> +
> +    if (!manager->ring_queues) {
> +        g_free(manager);
> +        return NULL;
> +    }

This check is unnecessary for the same reason.

> +
> +    for (int i = 0; i < num_ring_queues; i++) {
> +        FuseRingQueue *rq = &manager->ring_queues[i];
> +        rq->rqid = i;
> +        rq->ent = g_new(FuseRingEnt, ring_queue_depth);
> +
> +        if (!rq->ent) {
> +            for (int j = 0; j < i; j++) {
> +                g_free(manager->ring_queues[j].ent);
> +            }
> +            g_free(manager->ring_queues);
> +            g_free(manager);
> +            return NULL;
> +        }

This one, too.

> +
> +        for (size_t j = 0; j < ring_queue_depth; j++) {
> +            FuseRingEnt *ent = &rq->ent[j];
> +            ent->rq = rq;
> +            ent->req_payload_sz = bufsize - FUSE_BUFFER_HEADER_SIZE;
> +            ent->op_payload = g_malloc0(ent->req_payload_sz);
> +
> +            if (!ent->op_payload) {
> +                for (size_t k = 0; k < j; k++) {
> +                    g_free(rq->ent[k].op_payload);
> +                }
> +                g_free(rq->ent);
> +                for (int k = 0; k < i; k++) {
> +                    g_free(manager->ring_queues[k].ent);
> +                }
> +                g_free(manager->ring_queues);
> +                g_free(manager);
> +                return NULL;
> +            }

And this one.

Removing all of them will make the function a lot more readable.

> +
> +            ent->iov[0] = (struct iovec) {
> +                &(ent->req_header),

Unnecessary parentheses.

> +                sizeof(struct fuse_uring_req_header)
> +            };
> +            ent->iov[1] = (struct iovec) {
> +                ent->op_payload,
> +                ent->req_payload_sz
> +            };
> +
> +            ent->fuse_cqe_handler.cb = fuse_uring_cqe_handler;
> +        }
> +    }
> +
> +    return manager;
> +}
> +
> +static
> +void fuse_distribute_ring_queues(FuseExport *exp, FuseRingQueueManager *manager)
> +{
> +    int queue_index = 0;
> +
> +    for (int i = 0; i < manager->num_ring_queues; i++) {
> +        FuseRingQueue *rq = &manager->ring_queues[i];
> +
> +        rq->q = &exp->queues[queue_index];
> +        QLIST_INSERT_HEAD(&(rq->q->ring_queue_list), rq, next);
> +
> +        queue_index = (queue_index + 1) % manager->num_fuse_queues;
> +    }
> +}

Ok, no overloaded meaning of distributing queues, but this function
should probably be merged with the one above. It's part of setting up
the queues.

You don't need a separate queue_index counter, you can just directly use
exp->queues[i % manager->num_fuse_queues].

> +static
> +void fuse_schedule_ring_queue_registrations(FuseExport *exp,
> +                                            FuseRingQueueManager *manager)

Again the formatting. If you split the line before the function name, it
should be "static void" on the first line.

> +{
> +    for (int i = 0; i < manager->num_fuse_queues; i++) {
> +        FuseQueue *q = &exp->queues[i];
> +        FuseRingQueue *rq;
> +
> +        QLIST_FOREACH(rq, &q->ring_queue_list, next) {
> +            for (int j = 0; j < exp->ring_queue_depth; j++) {
> +                aio_bh_schedule_oneshot(q->ctx, fuse_uring_submit_register,
> +                                        &(rq->ent[j]));
> +            }
> +        }
> +    }
> +}

Why one BH per queue entry? This adds up quickly. All entries of the
same queue need to be processed in the same AioContext, so wouldn't it
make more sense to have a BH per (FUSE) queue and handle all of its
uring queues and their entries in a single BH?

> +static void fuse_uring_start(FuseExport *exp, struct fuse_init_out *out)
> +{
> +    /*
> +     * Since we didn't enable the FUSE_MAX_PAGES feature, the value of
> +     * fc->max_pages should be FUSE_DEFAULT_MAX_PAGES_PER_REQ, which is set by
> +     * the kernel by default. Also, max_write should not exceed
> +     * FUSE_DEFAULT_MAX_PAGES_PER_REQ * PAGE_SIZE.
> +     */
> +    size_t bufsize = out->max_write + FUSE_BUFFER_HEADER_SIZE;
> +
> +    if (!(out->flags & FUSE_MAX_PAGES)) {
> +        bufsize = FUSE_DEFAULT_MAX_PAGES_PER_REQ * qemu_real_host_page_size()
> +                         + FUSE_BUFFER_HEADER_SIZE;
> +    }
> +
> +    exp->ring_queue_manager = fuse_ring_queue_manager_create(
> +        exp->num_queues, exp->ring_queue_depth, bufsize);
> +
> +    if (!exp->ring_queue_manager) {
> +        error_report("Failed to create ring queue manager");
> +        return;
> +    }
> +
> +    /* Distribute ring queues across FUSE queues using round-robin */
> +    fuse_distribute_ring_queues(exp, exp->ring_queue_manager);
> +
> +    fuse_schedule_ring_queue_registrations(exp, exp->ring_queue_manager);
> +}
> +#endif
> +
>  static int fuse_export_create(BlockExport *blk_exp,
>                                BlockExportOptions *blk_exp_args,
>                                AioContext *const *multithread,
> @@ -270,6 +505,11 @@ static int fuse_export_create(BlockExport *blk_exp,
>  
>      assert(blk_exp_args->type == BLOCK_EXPORT_TYPE_FUSE);
>  
> +#ifdef CONFIG_LINUX_IO_URING
> +    exp->is_uring = args->io_uring;
> +    exp->ring_queue_depth = FUSE_DEFAULT_RING_QUEUE_DEPTH;
> +#endif
> +
>      if (multithread) {
>          /* Guaranteed by common export code */
>          assert(mt_count >= 1);
> @@ -283,6 +523,10 @@ static int fuse_export_create(BlockExport *blk_exp,
>                  .exp = exp,
>                  .ctx = multithread[i],
>                  .fuse_fd = -1,
> +#ifdef CONFIG_LINUX_IO_URING
> +                .ring_queue_list =
> +                    QLIST_HEAD_INITIALIZER(exp->queues[i].ring_queue_list),
> +#endif
>              };
>          }
>      } else {
> @@ -296,6 +540,10 @@ static int fuse_export_create(BlockExport *blk_exp,
>              .exp = exp,
>              .ctx = exp->common.ctx,
>              .fuse_fd = -1,
> +#ifdef CONFIG_LINUX_IO_URING
> +            .ring_queue_list =
> +                QLIST_HEAD_INITIALIZER(exp->queues[0].ring_queue_list),
> +#endif
>          };
>      }
>  
> @@ -685,17 +933,39 @@ static bool is_regular_file(const char *path, Error **errp)
>   */
>  static ssize_t coroutine_fn
>  fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
> -             uint32_t max_readahead, uint32_t flags)
> +             uint32_t max_readahead, const struct fuse_init_in *in)
>  {
> -    const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO;
> +    uint64_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO
> +                                     | FUSE_INIT_EXT;
> +    uint64_t outargflags = 0;
> +    uint64_t inargflags = in->flags;
> +
> +    ssize_t ret = 0;
> +
> +    if (inargflags & FUSE_INIT_EXT) {
> +        inargflags = inargflags | (uint64_t) in->flags2 << 32;
> +    }
> +
> +#ifdef CONFIG_LINUX_IO_URING
> +    if (exp->is_uring) {
> +        if (inargflags & FUSE_OVER_IO_URING) {
> +            supported_flags |= FUSE_OVER_IO_URING;
> +        } else {
> +            exp->is_uring = false;
> +            ret = -ENODEV;

Add a 'goto out' here...

> +        }
> +    }
> +#endif
> +
> +    outargflags = inargflags & supported_flags;
>  
>      *out = (struct fuse_init_out) {
>          .major = FUSE_KERNEL_VERSION,
>          .minor = FUSE_KERNEL_MINOR_VERSION,
>          .max_readahead = max_readahead,
>          .max_write = FUSE_MAX_WRITE_BYTES,
> -        .flags = flags & supported_flags,
> -        .flags2 = 0,
> +        .flags = outargflags,
> +        .flags2 = outargflags >> 32,
>  
>          /* libfuse maximum: 2^16 - 1 */
>          .max_background = UINT16_MAX,
> @@ -717,7 +987,7 @@ fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
>          .map_alignment = 0,
>      };
> -    return sizeof(*out);
> +    return ret < 0 ? ret : sizeof(*out);

...and make this:

    ret = sizeof(*out);
out:
    return ret;

>  }
>  
>  /**
> @@ -1506,6 +1776,14 @@ fuse_co_process_request(FuseQueue *q, void *spillover_buf)
>          fuse_write_buf_response(q->fuse_fd, req_id, out_hdr,
>                                  out_data_buffer, ret);
>          qemu_vfree(out_data_buffer);
> +#ifdef CONFIG_LINUX_IO_URING
> +    /* Handle FUSE-over-io_uring initialization */
> +    if (unlikely(opcode == FUSE_INIT && exp->is_uring)) {
> +        struct fuse_init_out *out =
> +            (struct fuse_init_out *)FUSE_OUT_OP_STRUCT(out_buf);
> +        fuse_uring_start(exp, out);
> +    }
> +#endif

A level of indentation was lost here.

>      } else {
>          fuse_write_response(q->fuse_fd, req_id, out_hdr,
>                              ret < 0 ? ret : 0,
> diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst
> index 35ab2d7807..c5076101e0 100644
> --- a/docs/tools/qemu-storage-daemon.rst
> +++ b/docs/tools/qemu-storage-daemon.rst
> @@ -78,7 +78,7 @@ Standard options:
>  .. option:: --export [type=]nbd,id=<id>,node-name=<node-name>[,name=<export-name>][,writable=on|off][,bitmap=<name>]
>    --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=unix,addr.path=<socket-path>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
>    --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=fd,addr.str=<fd>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
> -  --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto]
> +  --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto][,io-uring=on|off]
>    --export [type=]vduse-blk,id=<id>,node-name=<node-name>,name=<vduse-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>][,serial=<serial-number>]
>  
>    is a block export definition. ``node-name`` is the block node that should be
> @@ -111,10 +111,11 @@ Standard options:
>    that enabling this option as a non-root user requires enabling the
>    user_allow_other option in the global fuse.conf configuration file.  Setting
>    ``allow-other`` to auto (the default) will try enabling this option, and on
> -  error fall back to disabling it.
> -
> -  The ``vduse-blk`` export type takes a ``name`` (must be unique across the host)
> -  to create the VDUSE device.
> +  error fall back to disabling it. Once ``io-uring`` is enabled (off by default),
> +  the FUSE-over-io_uring-related settings will be initialized to bypass the
> +  traditional /dev/fuse communication mechanism and instead use io_uring to
> +  handle FUSE operations. The ``vduse-blk`` export type takes a ``name``
> +  (must be unique across the host) to create the VDUSE device.
>    ``num-queues`` sets the number of virtqueues (the default is 1).
>    ``queue-size`` sets the virtqueue descriptor table size (the default is 256).
>  
> diff --git a/qapi/block-export.json b/qapi/block-export.json
> index 9ae703ad01..37f2fc47e2 100644
> --- a/qapi/block-export.json
> +++ b/qapi/block-export.json
> @@ -184,12 +184,15 @@
>  #     mount the export with allow_other, and if that fails, try again
>  #     without.  (since 6.1; default: auto)
>  #
> +# @io-uring: Use FUSE-over-io-uring.  (since 10.2; default: false)
> +#
>  # Since: 6.0
>  ##
>  { 'struct': 'BlockExportOptionsFuse',
>    'data': { 'mountpoint': 'str',
>              '*growable': 'bool',
> -            '*allow-other': 'FuseExportAllowOther' },
> +            '*allow-other': 'FuseExportAllowOther',
> +            '*io-uring': 'bool' },
>    'if': 'CONFIG_FUSE' }
>  
>  ##
> diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c
> index eb72561358..0cd4cd2b58 100644
> --- a/storage-daemon/qemu-storage-daemon.c
> +++ b/storage-daemon/qemu-storage-daemon.c
> @@ -107,6 +107,7 @@ static void help(void)
>  #ifdef CONFIG_FUSE
>  "  --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>\n"
>  "           [,growable=on|off][,writable=on|off][,allow-other=on|off|auto]\n"
> +"           [,io-uring=on|off]"
>  "                         export the specified block node over FUSE\n"
>  "\n"
>  #endif /* CONFIG_FUSE */
> diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
> index d2433d1d99..68d3fe8e01 100644
> --- a/util/fdmon-io_uring.c
> +++ b/util/fdmon-io_uring.c
> @@ -452,10 +452,13 @@ static const FDMonOps fdmon_io_uring_ops = {
>  void fdmon_io_uring_setup(AioContext *ctx, Error **errp)
>  {
>      int ret;
> +    int flags;
>  
>      ctx->io_uring_fd_tag = NULL;
> +    flags = IORING_SETUP_SQE128;
>  
> -    ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
> +    ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES,
> +                            &ctx->fdmon_io_uring, flags);

The indentation is off here.

>      if (ret != 0) {
>          error_setg_errno(errp, -ret, "Failed to initialize io_uring");
>          return;

The change to fdmon-io_uring.c should be a separate patch. It's a
prerequisite for, but not directly part of, io_uring support in FUSE.

Kevin

Re: [PATCH 1/4] export/fuse: add opt to enable FUSE-over-io_uring

Posted by Brian Song 1 week, 3 days ago


On 9/16/25 3:08 PM, Kevin Wolf wrote:
> Am 30.08.2025 um 04:50 hat Brian Song geschrieben:
>> This patch adds a new export option for storage-export-daemon to enable
>> FUSE-over-io_uring via the switch io-uring=on|off (disabled by default).
>> It also implements the protocol handshake with the Linux kernel
>> during the FUSE-over-io_uring initialization phase.
>>
>> See: https://docs.kernel.org/filesystems/fuse-io-uring.html
>>
>> The kernel documentation describes in detail how FUSE-over-io_uring
>> works. This patch implements the Initial SQE stage shown in the diagram:
>> it initializes one queue per IOThread, each currently supporting a
>> single submission queue entry (SQE). When the FUSE driver sends the
>> first FUSE request (FUSE_INIT), storage-export-daemon calls
>> fuse_uring_start() to complete initialization, ultimately submitting
>> the SQE with the FUSE_IO_URING_CMD_REGISTER command to confirm
>> successful initialization with the kernel.
>>
>> We also added support for multiple IOThreads. The current Linux kernel
>> requires registering $(nproc) queues when setting up FUSE-over-io_uring.
>> To let users customize the number of FUSE Queues (i.e., IOThreads),
>> we first create nproc Ring Queues as required by the kernel, then
>> distribute them in a round-robin manner to the FUSE Queues for
>> registration. In addition, to support multiple in-flight requests,
>> we configure each Ring Queue with FUSE_DEFAULT_RING_QUEUE_DEPTH
>> entries/requests.
>>
>> Suggested-by: Kevin Wolf <kwolf@redhat.com>
>> Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
>> Signed-off-by: Brian Song <hibriansong@gmail.com>
>> ---
>>   block/export/fuse.c                  | 310 +++++++++++++++++++++++++--
>>   docs/tools/qemu-storage-daemon.rst   |  11 +-
>>   qapi/block-export.json               |   5 +-
>>   storage-daemon/qemu-storage-daemon.c |   1 +
>>   util/fdmon-io_uring.c                |   5 +-
>>   5 files changed, 309 insertions(+), 23 deletions(-)
>>
>> diff --git a/block/export/fuse.c b/block/export/fuse.c
>> index c0ad4696ce..19bf9e5f74 100644
>> --- a/block/export/fuse.c
>> +++ b/block/export/fuse.c
>> @@ -48,6 +48,9 @@
>>   #include <linux/fs.h>
>>   #endif
>>   
>> +/* room needed in buffer to accommodate header */
>> +#define FUSE_BUFFER_HEADER_SIZE 0x1000
>> +
>>   /* Prevent overly long bounce buffer allocations */
>>   #define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 1 * 1024 * 1024))
>>   /*
>> @@ -63,12 +66,59 @@
>>       (FUSE_MAX_WRITE_BYTES - FUSE_IN_PLACE_WRITE_BYTES)
>>   
>>   typedef struct FuseExport FuseExport;
>> +typedef struct FuseQueue FuseQueue;
>> +
>> +#ifdef CONFIG_LINUX_IO_URING
>> +#define FUSE_DEFAULT_RING_QUEUE_DEPTH 64
>> +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
> 
> Maybe it would be a little clearer if the next few types have URing in
> their name instead of just Ring.
> 
>> +typedef struct FuseRingQueue FuseRingQueue;
>> +typedef struct FuseRingEnt {
>> +    /* back pointer */
>> +    FuseRingQueue *rq;
>> +
>> +    /* commit id of a fuse request */
>> +    uint64_t req_commit_id;
>> +
>> +    /* fuse request header and payload */
>> +    struct fuse_uring_req_header req_header;
>> +    void *op_payload;
>> +    size_t req_payload_sz;
>> +
>> +    /* The vector passed to the kernel */
>> +    struct iovec iov[2];
>> +
>> +    CqeHandler fuse_cqe_handler;
>> +} FuseRingEnt;
>> +
>> +struct FuseRingQueue {
> 
> It would be good to have a comment here that explains the difference
> between FuseQueue and FuseRingQueue.
> 
> Is this a distinction that should remain in the long run or would we
> always have a 1:1 mapping between FuseQueue and FuseRingQueue once the
> pending kernel changes are merged that allow a number of uring queues
> different from the number of CPUs?
> 

Stefan mentioned the issue, and I added some comments here. One thing to 
note is that FuseRingQueueManager and the distribution between FuseQueue 
and FuseRingQueue are just temporary measures until the kernel allows 
user-defined queues. Therefore, I don't think it's a good idea to remove 
FuseRingQueueManager at this stage.

If you look back at the v2 patch, we put the ring entries inside the 
FuseQueue. The result was that we had to define nproc IOThreads 
(FuseQueue) in order to make it work. That's why here I separated the 
numbers of the two types of queues and RingQueue into independent 
abstractions: allocate nproc RingQueues and initialize the entries, then 
distribute them to FuseQueues in a round-robin manner. Once the kernel 
supports a user-defined number of queues, we can remove 
FuseRingQueueManager and the RR distribution.

Also, to keep the variable names consistent with those in the kernel and 
libfuse, I use Ring here instead of URing.

>> +    int rqid;
>> +
>> +    /* back pointer */
>> +    FuseQueue *q;
>> +    FuseRingEnt *ent;
>> +
>> +    /* List entry for ring_queues */
>> +    QLIST_ENTRY(FuseRingQueue) next;
>> +};
>> +
>> +/*
>> + * Round-robin distribution of ring queues across FUSE queues.
>> + * This structure manages the mapping between kernel ring queues and user
>> + * FUSE queues.
>> + */
>> +typedef struct FuseRingQueueManager {
>> +    FuseRingQueue *ring_queues;
>> +    int num_ring_queues;
>> +    int num_fuse_queues;
>> +} FuseRingQueueManager;
> 
> This isn't a manager, it's just the set of queues the export uses.
> 
> num_fuse_queues duplicates exp->num_queues, there is no reason for it to
> exist. All users also have access to the FuseExport itself.
> 
> The other two fields can just be merged directly into FuseExport,
> preferably renamed to uring_queues and num_uring_queues.
> 
>> +#endif
>>   
>>   /*
>>    * One FUSE "queue", representing one FUSE FD from which requests are fetched
>>    * and processed.  Each queue is tied to an AioContext.
>>    */
>> -typedef struct FuseQueue {
>> +struct FuseQueue {
>>       FuseExport *exp;
>>   
>>       AioContext *ctx;
>> @@ -109,15 +159,11 @@ typedef struct FuseQueue {
>>        * Free this buffer with qemu_vfree().
>>        */
>>       void *spillover_buf;
>> -} FuseQueue;
>>   
>> -/*
>> - * Verify that FuseQueue.request_buf plus the spill-over buffer together
>> - * are big enough to be accepted by the FUSE kernel driver.
>> - */
>> -QEMU_BUILD_BUG_ON(sizeof(((FuseQueue *)0)->request_buf) +
>> -                  FUSE_SPILLOVER_BUF_SIZE <
>> -                  FUSE_MIN_READ_BUFFER);
>> +#ifdef CONFIG_LINUX_IO_URING
>> +    QLIST_HEAD(, FuseRingQueue) ring_queue_list;
>> +#endif
>> +};
>>   
>>   struct FuseExport {
>>       BlockExport common;
>> @@ -133,7 +179,7 @@ struct FuseExport {
>>        */
>>       bool halted;
>>   
>> -    int num_queues;
>> +    size_t num_queues;
> 
> I'm not sure why this change is needed. If it is, can it be a separate
> patch before this one, with a commit message describing the reason?
> 

I feel there's no reason to use a signed int here, since the number of 
queues cannot be negative.

>>       FuseQueue *queues;
>>       /*
>>        * True if this export should follow the generic export's AioContext.
>> @@ -149,6 +195,12 @@ struct FuseExport {
>>       /* Whether allow_other was used as a mount option or not */
>>       bool allow_other;
>>   
>> +#ifdef CONFIG_LINUX_IO_URING
>> +    bool is_uring;
>> +    size_t ring_queue_depth;
>> +    FuseRingQueueManager *ring_queue_manager;
>> +#endif
>> +
>>       mode_t st_mode;
>>       uid_t st_uid;
>>       gid_t st_gid;
>> @@ -205,7 +257,7 @@ static void fuse_attach_handlers(FuseExport *exp)
>>           return;
>>       }
>>   
>> -    for (int i = 0; i < exp->num_queues; i++) {
>> +    for (size_t i = 0; i < exp->num_queues; i++) {
>>           aio_set_fd_handler(exp->queues[i].ctx, exp->queues[i].fuse_fd,
>>                              read_from_fuse_fd, NULL, NULL, NULL,
>>                              &exp->queues[i]);
>> @@ -257,6 +309,189 @@ static const BlockDevOps fuse_export_blk_dev_ops = {
>>       .drained_poll  = fuse_export_drained_poll,
>>   };
>>   
>> +#ifdef CONFIG_LINUX_IO_URING
>> +static void fuse_uring_sqe_set_req_data(struct fuse_uring_cmd_req *req,
>> +                    const unsigned int rqid,
>> +                    const unsigned int commit_id)
> 
> Indentation is off here. There are two accepted styles for indentation
> after breaking a long line in QEMU (see docs/devel/style.rst):
> 
> 1. Indent the next line by exactly four spaces:
> 
>      do_something(x, y,
>          z);
> 
> 2. Align the next line with the first character after the opening
>     parenthesis:
> 
>      do_something(x, y,
>                   z);
> 
> The second one is the preferred one. The first one is generally only
> used when the parenthesis is already too far right and we can't do much
> about it.
> 
>> +{
>> +    req->qid = rqid;
>> +    req->commit_id = commit_id;
>> +    req->flags = 0;
>> +}
>> +
>> +static void fuse_uring_sqe_prepare(struct io_uring_sqe *sqe, FuseQueue *q,
>> +               __u32 cmd_op)
> 
> Indentation.
> 
> Another option here is to keep everything before the function name on a
> separate line, like this:
> 
> static void
> fuse_uring_sqe_prepare(struct io_uring_sqe *sqe, FuseQueue *q, __u32 cmd_op)
> 
> This would allow the second line to stay under 80 characters.
> 
>> +{
>> +    sqe->opcode = IORING_OP_URING_CMD;
>> +
>> +    sqe->fd = q->fuse_fd;
>> +    sqe->rw_flags = 0;
>> +    sqe->ioprio = 0;
>> +    sqe->off = 0;
>> +
>> +    sqe->cmd_op = cmd_op;
>> +    sqe->__pad1 = 0;
>> +}
>> +
>> +static void fuse_uring_prep_sqe_register(struct io_uring_sqe *sqe, void *opaque)
>> +{
>> +    FuseRingEnt *ent = opaque;
>> +    struct fuse_uring_cmd_req *req = (void *)&sqe->cmd[0];
>> +
>> +    fuse_uring_sqe_prepare(sqe, ent->rq->q, FUSE_IO_URING_CMD_REGISTER);
>> +
>> +    sqe->addr = (uint64_t)(ent->iov);
>> +    sqe->len = 2;
>> +
>> +    fuse_uring_sqe_set_req_data(req, ent->rq->rqid, 0);
>> +}
>> +
>> +static void fuse_uring_submit_register(void *opaque)
>> +{
>> +    FuseRingEnt *ent = opaque;
>> +    FuseExport *exp = ent->rq->q->exp;
>> +
>> +
> 
> Extra empty line.
> 
>> +    aio_add_sqe(fuse_uring_prep_sqe_register, ent, &(ent->fuse_cqe_handler));
> 
> The parentheses around ent->fuse_cqe_handler are unnecessary.
> 
>> +}
>> +
>> +/**
>> + * Distribute ring queues across FUSE queues using round-robin algorithm.
> 
> Hm, if this function distributes (u)ring queues, then what is
> fuse_distribute_ring_queues() doing? Is the term overloaded with two
> meanings?
> 
>> + * This ensures even distribution of kernel ring queues across user-specified
>> + * FUSE queues.
>> + */
>> +static
>> +FuseRingQueueManager *fuse_ring_queue_manager_create(int num_fuse_queues,
>> +                                                    size_t ring_queue_depth,
>> +                                                    size_t bufsize)
> 
> The right style here would be something like:
> 
> static FuseRingQueueManager *
> fuse_ring_queue_manager_create(int num_fuse_queues,
>                                 size_t ring_queue_depth,
>                                 size_t bufsize)
> 
> Given that I said that there is no reason to call the set of all queues
> a manager, or to even have it separate from FuseExport, this probably
> becomes fuse_uring_setup_queues() or something.
> 
>> +{
>> +    int num_ring_queues = get_nprocs();
> 
> This could use a comment saying that this is a kernel requirement at the
> moment.
> 
>> +    FuseRingQueueManager *manager = g_new(FuseRingQueueManager, 1);
>> +
>> +    if (!manager) {
>> +        return NULL;
>> +    }
> 
> g_new() never returns NULL, it aborts on error instead, so no reason to
> have a NULL check here.
> 
>> +
>> +    manager->ring_queues = g_new(FuseRingQueue, num_ring_queues);
>> +    manager->num_ring_queues = num_ring_queues;
>> +    manager->num_fuse_queues = num_fuse_queues;
>> +
>> +    if (!manager->ring_queues) {
>> +        g_free(manager);
>> +        return NULL;
>> +    }
> 
> This check is unnecessary for the same reason.
> 
>> +
>> +    for (int i = 0; i < num_ring_queues; i++) {
>> +        FuseRingQueue *rq = &manager->ring_queues[i];
>> +        rq->rqid = i;
>> +        rq->ent = g_new(FuseRingEnt, ring_queue_depth);
>> +
>> +        if (!rq->ent) {
>> +            for (int j = 0; j < i; j++) {
>> +                g_free(manager->ring_queues[j].ent);
>> +            }
>> +            g_free(manager->ring_queues);
>> +            g_free(manager);
>> +            return NULL;
>> +        }
> 
> This one, too.
> 
>> +
>> +        for (size_t j = 0; j < ring_queue_depth; j++) {
>> +            FuseRingEnt *ent = &rq->ent[j];
>> +            ent->rq = rq;
>> +            ent->req_payload_sz = bufsize - FUSE_BUFFER_HEADER_SIZE;
>> +            ent->op_payload = g_malloc0(ent->req_payload_sz);
>> +
>> +            if (!ent->op_payload) {
>> +                for (size_t k = 0; k < j; k++) {
>> +                    g_free(rq->ent[k].op_payload);
>> +                }
>> +                g_free(rq->ent);
>> +                for (int k = 0; k < i; k++) {
>> +                    g_free(manager->ring_queues[k].ent);
>> +                }
>> +                g_free(manager->ring_queues);
>> +                g_free(manager);
>> +                return NULL;
>> +            }
> 
> And this one.
> 
> Removing all of them will make the function a lot more readable.
> 
>> +
>> +            ent->iov[0] = (struct iovec) {
>> +                &(ent->req_header),
> 
> Unnecessary parentheses.
> 
>> +                sizeof(struct fuse_uring_req_header)
>> +            };
>> +            ent->iov[1] = (struct iovec) {
>> +                ent->op_payload,
>> +                ent->req_payload_sz
>> +            };
>> +
>> +            ent->fuse_cqe_handler.cb = fuse_uring_cqe_handler;
>> +        }
>> +    }
>> +
>> +    return manager;
>> +}
>> +
>> +static
>> +void fuse_distribute_ring_queues(FuseExport *exp, FuseRingQueueManager *manager)
>> +{
>> +    int queue_index = 0;
>> +
>> +    for (int i = 0; i < manager->num_ring_queues; i++) {
>> +        FuseRingQueue *rq = &manager->ring_queues[i];
>> +
>> +        rq->q = &exp->queues[queue_index];
>> +        QLIST_INSERT_HEAD(&(rq->q->ring_queue_list), rq, next);
>> +
>> +        queue_index = (queue_index + 1) % manager->num_fuse_queues;
>> +    }
>> +}
> 
> Ok, no overloaded meaning of distributing queues, but this function
> should probably be merged with the one above. It's part of setting up
> the queues.
> 
> You don't need a separate queue_index counter, you can just directly use
> exp->queues[i % manager->num_fuse_queues].
> 

There are two steps:

1. Create uring queues and allocate buffers for each entry's payload.

2. Distribute these uring queues to FUSE queues using a round-robin 
algorithm.

Given that this is only a temporary measure to allow users to define 
their own IOThreads/FUSE queues, we might later replace the second part 
of the logic. I believe it's better to separate these two pieces of 
logic rather than combining them.


>> +static
>> +void fuse_schedule_ring_queue_registrations(FuseExport *exp,
>> +                                            FuseRingQueueManager *manager)
> 
> Again the formatting. If you split the line before the function name, it
> should be "static void" on the first line.
> 
>> +{
>> +    for (int i = 0; i < manager->num_fuse_queues; i++) {
>> +        FuseQueue *q = &exp->queues[i];
>> +        FuseRingQueue *rq;
>> +
>> +        QLIST_FOREACH(rq, &q->ring_queue_list, next) {
>> +            for (int j = 0; j < exp->ring_queue_depth; j++) {
>> +                aio_bh_schedule_oneshot(q->ctx, fuse_uring_submit_register,
>> +                                        &(rq->ent[j]));
>> +            }
>> +        }
>> +    }
>> +}
> 
> Why one BH per queue entry? This adds up quickly. All entries of the
> same queue need to be processed in the same AioContext, so wouldn't it
> make more sense to have a BH per (FUSE) queue and handle all of its
> uring queues and their entries in a single BH?
> 
>> +static void fuse_uring_start(FuseExport *exp, struct fuse_init_out *out)
>> +{
>> +    /*
>> +     * Since we didn't enable the FUSE_MAX_PAGES feature, the value of
>> +     * fc->max_pages should be FUSE_DEFAULT_MAX_PAGES_PER_REQ, which is set by
>> +     * the kernel by default. Also, max_write should not exceed
>> +     * FUSE_DEFAULT_MAX_PAGES_PER_REQ * PAGE_SIZE.
>> +     */
>> +    size_t bufsize = out->max_write + FUSE_BUFFER_HEADER_SIZE;
>> +
>> +    if (!(out->flags & FUSE_MAX_PAGES)) {
>> +        bufsize = FUSE_DEFAULT_MAX_PAGES_PER_REQ * qemu_real_host_page_size()
>> +                         + FUSE_BUFFER_HEADER_SIZE;
>> +    }
>> +
>> +    exp->ring_queue_manager = fuse_ring_queue_manager_create(
>> +        exp->num_queues, exp->ring_queue_depth, bufsize);
>> +
>> +    if (!exp->ring_queue_manager) {
>> +        error_report("Failed to create ring queue manager");
>> +        return;
>> +    }
>> +
>> +    /* Distribute ring queues across FUSE queues using round-robin */
>> +    fuse_distribute_ring_queues(exp, exp->ring_queue_manager);
>> +
>> +    fuse_schedule_ring_queue_registrations(exp, exp->ring_queue_manager);
>> +}
>> +#endif
>> +
>>   static int fuse_export_create(BlockExport *blk_exp,
>>                                 BlockExportOptions *blk_exp_args,
>>                                 AioContext *const *multithread,
>> @@ -270,6 +505,11 @@ static int fuse_export_create(BlockExport *blk_exp,
>>   
>>       assert(blk_exp_args->type == BLOCK_EXPORT_TYPE_FUSE);
>>   
>> +#ifdef CONFIG_LINUX_IO_URING
>> +    exp->is_uring = args->io_uring;
>> +    exp->ring_queue_depth = FUSE_DEFAULT_RING_QUEUE_DEPTH;
>> +#endif
>> +
>>       if (multithread) {
>>           /* Guaranteed by common export code */
>>           assert(mt_count >= 1);
>> @@ -283,6 +523,10 @@ static int fuse_export_create(BlockExport *blk_exp,
>>                   .exp = exp,
>>                   .ctx = multithread[i],
>>                   .fuse_fd = -1,
>> +#ifdef CONFIG_LINUX_IO_URING
>> +                .ring_queue_list =
>> +                    QLIST_HEAD_INITIALIZER(exp->queues[i].ring_queue_list),
>> +#endif
>>               };
>>           }
>>       } else {
>> @@ -296,6 +540,10 @@ static int fuse_export_create(BlockExport *blk_exp,
>>               .exp = exp,
>>               .ctx = exp->common.ctx,
>>               .fuse_fd = -1,
>> +#ifdef CONFIG_LINUX_IO_URING
>> +            .ring_queue_list =
>> +                QLIST_HEAD_INITIALIZER(exp->queues[0].ring_queue_list),
>> +#endif
>>           };
>>       }
>>   
>> @@ -685,17 +933,39 @@ static bool is_regular_file(const char *path, Error **errp)
>>    */
>>   static ssize_t coroutine_fn
>>   fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
>> -             uint32_t max_readahead, uint32_t flags)
>> +             uint32_t max_readahead, const struct fuse_init_in *in)
>>   {
>> -    const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO;
>> +    uint64_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO
>> +                                     | FUSE_INIT_EXT;
>> +    uint64_t outargflags = 0;
>> +    uint64_t inargflags = in->flags;
>> +
>> +    ssize_t ret = 0;
>> +
>> +    if (inargflags & FUSE_INIT_EXT) {
>> +        inargflags = inargflags | (uint64_t) in->flags2 << 32;
>> +    }
>> +
>> +#ifdef CONFIG_LINUX_IO_URING
>> +    if (exp->is_uring) {
>> +        if (inargflags & FUSE_OVER_IO_URING) {
>> +            supported_flags |= FUSE_OVER_IO_URING;
>> +        } else {
>> +            exp->is_uring = false;
>> +            ret = -ENODEV;
> 
> Add a 'goto out' here...
> 
>> +        }
>> +    }
>> +#endif
>> +
>> +    outargflags = inargflags & supported_flags;
>>   
>>       *out = (struct fuse_init_out) {
>>           .major = FUSE_KERNEL_VERSION,
>>           .minor = FUSE_KERNEL_MINOR_VERSION,
>>           .max_readahead = max_readahead,
>>           .max_write = FUSE_MAX_WRITE_BYTES,
>> -        .flags = flags & supported_flags,
>> -        .flags2 = 0,
>> +        .flags = outargflags,
>> +        .flags2 = outargflags >> 32,
>>   
>>           /* libfuse maximum: 2^16 - 1 */
>>           .max_background = UINT16_MAX,
>> @@ -717,7 +987,7 @@ fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
>>           .map_alignment = 0,
>>       };
>> -    return sizeof(*out);
>> +    return ret < 0 ? ret : sizeof(*out);
> 
> ...and make this:
> 
>      ret = sizeof(*out);
> out:
>      return ret;
> 
>>   }
>>   
>>   /**
>> @@ -1506,6 +1776,14 @@ fuse_co_process_request(FuseQueue *q, void *spillover_buf)
>>           fuse_write_buf_response(q->fuse_fd, req_id, out_hdr,
>>                                   out_data_buffer, ret);
>>           qemu_vfree(out_data_buffer);
>> +#ifdef CONFIG_LINUX_IO_URING
>> +    /* Handle FUSE-over-io_uring initialization */
>> +    if (unlikely(opcode == FUSE_INIT && exp->is_uring)) {
>> +        struct fuse_init_out *out =
>> +            (struct fuse_init_out *)FUSE_OUT_OP_STRUCT(out_buf);
>> +        fuse_uring_start(exp, out);
>> +    }
>> +#endif
> 
> A level of indentation was lost here.
> 
>>       } else {
>>           fuse_write_response(q->fuse_fd, req_id, out_hdr,
>>                               ret < 0 ? ret : 0,
>> diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst
>> index 35ab2d7807..c5076101e0 100644
>> --- a/docs/tools/qemu-storage-daemon.rst
>> +++ b/docs/tools/qemu-storage-daemon.rst
>> @@ -78,7 +78,7 @@ Standard options:
>>   .. option:: --export [type=]nbd,id=<id>,node-name=<node-name>[,name=<export-name>][,writable=on|off][,bitmap=<name>]
>>     --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=unix,addr.path=<socket-path>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
>>     --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=fd,addr.str=<fd>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
>> -  --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto]
>> +  --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto][,io-uring=on|off]
>>     --export [type=]vduse-blk,id=<id>,node-name=<node-name>,name=<vduse-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>][,serial=<serial-number>]
>>   
>>     is a block export definition. ``node-name`` is the block node that should be
>> @@ -111,10 +111,11 @@ Standard options:
>>     that enabling this option as a non-root user requires enabling the
>>     user_allow_other option in the global fuse.conf configuration file.  Setting
>>     ``allow-other`` to auto (the default) will try enabling this option, and on
>> -  error fall back to disabling it.
>> -
>> -  The ``vduse-blk`` export type takes a ``name`` (must be unique across the host)
>> -  to create the VDUSE device.
>> +  error fall back to disabling it. Once ``io-uring`` is enabled (off by default),
>> +  the FUSE-over-io_uring-related settings will be initialized to bypass the
>> +  traditional /dev/fuse communication mechanism and instead use io_uring to
>> +  handle FUSE operations. The ``vduse-blk`` export type takes a ``name``
>> +  (must be unique across the host) to create the VDUSE device.
>>     ``num-queues`` sets the number of virtqueues (the default is 1).
>>     ``queue-size`` sets the virtqueue descriptor table size (the default is 256).
>>   
>> diff --git a/qapi/block-export.json b/qapi/block-export.json
>> index 9ae703ad01..37f2fc47e2 100644
>> --- a/qapi/block-export.json
>> +++ b/qapi/block-export.json
>> @@ -184,12 +184,15 @@
>>   #     mount the export with allow_other, and if that fails, try again
>>   #     without.  (since 6.1; default: auto)
>>   #
>> +# @io-uring: Use FUSE-over-io-uring.  (since 10.2; default: false)
>> +#
>>   # Since: 6.0
>>   ##
>>   { 'struct': 'BlockExportOptionsFuse',
>>     'data': { 'mountpoint': 'str',
>>               '*growable': 'bool',
>> -            '*allow-other': 'FuseExportAllowOther' },
>> +            '*allow-other': 'FuseExportAllowOther',
>> +            '*io-uring': 'bool' },
>>     'if': 'CONFIG_FUSE' }
>>   
>>   ##
>> diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c
>> index eb72561358..0cd4cd2b58 100644
>> --- a/storage-daemon/qemu-storage-daemon.c
>> +++ b/storage-daemon/qemu-storage-daemon.c
>> @@ -107,6 +107,7 @@ static void help(void)
>>   #ifdef CONFIG_FUSE
>>   "  --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>\n"
>>   "           [,growable=on|off][,writable=on|off][,allow-other=on|off|auto]\n"
>> +"           [,io-uring=on|off]"
>>   "                         export the specified block node over FUSE\n"
>>   "\n"
>>   #endif /* CONFIG_FUSE */
>> diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
>> index d2433d1d99..68d3fe8e01 100644
>> --- a/util/fdmon-io_uring.c
>> +++ b/util/fdmon-io_uring.c
>> @@ -452,10 +452,13 @@ static const FDMonOps fdmon_io_uring_ops = {
>>   void fdmon_io_uring_setup(AioContext *ctx, Error **errp)
>>   {
>>       int ret;
>> +    int flags;
>>   
>>       ctx->io_uring_fd_tag = NULL;
>> +    flags = IORING_SETUP_SQE128;
>>   
>> -    ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
>> +    ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES,
>> +                            &ctx->fdmon_io_uring, flags);
> 
> The indentation is off here.
> 
>>       if (ret != 0) {
>>           error_setg_errno(errp, -ret, "Failed to initialize io_uring");
>>           return;
> 
> The change to fdmon-io_uring.c should be a separate patch. It's a
> prerequisite for, but not directly part of, io_uring support in FUSE.
> 
> Kevin
>

Re: [PATCH 1/4] export/fuse: add opt to enable FUSE-over-io_uring

Posted by Kevin Wolf 1 week, 2 days ago

Am 17.09.2025 um 21:47 hat Brian Song geschrieben:
> 
> 
> On 9/16/25 3:08 PM, Kevin Wolf wrote:
> > Am 30.08.2025 um 04:50 hat Brian Song geschrieben:
> > > This patch adds a new export option for storage-export-daemon to enable
> > > > FUSE-over-io_uring via the switch io-uring=on|off (disabled by default).
> > > It also implements the protocol handshake with the Linux kernel
> > > during the FUSE-over-io_uring initialization phase.
> > > 
> > > See: https://docs.kernel.org/filesystems/fuse-io-uring.html
> > > 
> > > The kernel documentation describes in detail how FUSE-over-io_uring
> > > > works. This patch implements the Initial SQE stage shown in the diagram:
> > > it initializes one queue per IOThread, each currently supporting a
> > > single submission queue entry (SQE). When the FUSE driver sends the
> > > first FUSE request (FUSE_INIT), storage-export-daemon calls
> > > fuse_uring_start() to complete initialization, ultimately submitting
> > > the SQE with the FUSE_IO_URING_CMD_REGISTER command to confirm
> > > successful initialization with the kernel.
> > > 
> > > We also added support for multiple IOThreads. The current Linux kernel
> > > > requires registering $(nproc) queues when setting up FUSE-over-io_uring.
> > > To let users customize the number of FUSE Queues (i.e., IOThreads),
> > > we first create nproc Ring Queues as required by the kernel, then
> > > distribute them in a round-robin manner to the FUSE Queues for
> > > registration. In addition, to support multiple in-flight requests,
> > > we configure each Ring Queue with FUSE_DEFAULT_RING_QUEUE_DEPTH
> > > entries/requests.
> > > 
> > > Suggested-by: Kevin Wolf <kwolf@redhat.com>
> > > Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > Signed-off-by: Brian Song <hibriansong@gmail.com>
> > > ---
> > >   block/export/fuse.c                  | 310 +++++++++++++++++++++++++--
> > >   docs/tools/qemu-storage-daemon.rst   |  11 +-
> > >   qapi/block-export.json               |   5 +-
> > >   storage-daemon/qemu-storage-daemon.c |   1 +
> > >   util/fdmon-io_uring.c                |   5 +-
> > >   5 files changed, 309 insertions(+), 23 deletions(-)
> > > 
> > > diff --git a/block/export/fuse.c b/block/export/fuse.c
> > > index c0ad4696ce..19bf9e5f74 100644
> > > --- a/block/export/fuse.c
> > > +++ b/block/export/fuse.c
> > > @@ -48,6 +48,9 @@
> > >   #include <linux/fs.h>
> > >   #endif
> > > +/* room needed in buffer to accommodate header */
> > > +#define FUSE_BUFFER_HEADER_SIZE 0x1000
> > > +
> > >   /* Prevent overly long bounce buffer allocations */
> > >   #define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 1 * 1024 * 1024))
> > >   /*
> > > @@ -63,12 +66,59 @@
> > >       (FUSE_MAX_WRITE_BYTES - FUSE_IN_PLACE_WRITE_BYTES)
> > >   typedef struct FuseExport FuseExport;
> > > +typedef struct FuseQueue FuseQueue;
> > > +
> > > +#ifdef CONFIG_LINUX_IO_URING
> > > +#define FUSE_DEFAULT_RING_QUEUE_DEPTH 64
> > > +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
> > 
> > > Maybe it would be a little clearer if the next few types had URing in
> > > their names instead of just Ring.
> > 
> > > +typedef struct FuseRingQueue FuseRingQueue;
> > > +typedef struct FuseRingEnt {
> > > +    /* back pointer */
> > > +    FuseRingQueue *rq;
> > > +
> > > +    /* commit id of a fuse request */
> > > +    uint64_t req_commit_id;
> > > +
> > > +    /* fuse request header and payload */
> > > +    struct fuse_uring_req_header req_header;
> > > +    void *op_payload;
> > > +    size_t req_payload_sz;
> > > +
> > > +    /* The vector passed to the kernel */
> > > +    struct iovec iov[2];
> > > +
> > > +    CqeHandler fuse_cqe_handler;
> > > +} FuseRingEnt;
> > > +
> > > +struct FuseRingQueue {
> > 
> > It would be good to have a comment here that explains the difference
> > between FuseQueue and FuseRingQueue.
> > 
> > Is this a distinction that should remain in the long run or would we
> > always have a 1:1 mapping between FuseQueue and FuseRingQueue once the
> > pending kernel changes are merged that allow a number of uring queues
> > different from the number of CPUs?
> 
> Stefan mentioned the issue, and I added some comments here. One thing to
> note is that FuseRingQueueManager and the distribution between FuseQueue and
> FuseRingQueue are just temporary measures until the kernel allows
> user-defined queues. Therefore, I don't think it's a good idea to remove
> FuseRingQueueManager at this stage.

I don't think that simplifying the code now will make it harder to make
these changes in the future, so I'd really prefer to keep e.g. all of
the queue setup in a single place even if we expect part of it to go
away in the long run.

> If you look back at the v2 patch, we put the ring entries inside the
> FuseQueue. The result was that we had to define nproc IOThreads (FuseQueue)
> in order to make it work. That's why here I separated the numbers of the two
> types of queues and RingQueue into independent abstractions: allocate nproc
> RingQueues and initialize the entries, then distribute them to FuseQueues in
> a round-robin manner. Once the kernel supports a user-defined number of
> queues, we can remove FuseRingQueueManager and the RR distribution.

Right, I'm not requesting that you change the mechanism per se. I'd just
like to see it more integrated with the rest. Additional functions and
structs can be helpful if they allow you to separate out self-contained
logic, but that's not the case here. Here it's just one additional
moving part that you have to understand when reading the code, which
makes it a little more complex and harder to read than necessary.

> Also, to keep the variable names consistent with those in the kernel and
> libfuse, I use Ring here instead of URing.

Yes, I can see that. The difference is that there, the types are
contained in a separate source file that handles only io_uring, so the
context is clear.

In QEMU's FUSE export, we're still mixing /dev/fuse code and io_uring
code in a single file, so it's a bit more confusing which name refers to
which.

But alternatively, we can also split the source file in QEMU. At almost
2000 lines of code, that might be a good idea anyway.

> > > +    int rqid;
> > > +
> > > +    /* back pointer */
> > > +    FuseQueue *q;
> > > +    FuseRingEnt *ent;
> > > +
> > > +    /* List entry for ring_queues */
> > > +    QLIST_ENTRY(FuseRingQueue) next;
> > > +};
> > > +
> > > +/*
> > > + * Round-robin distribution of ring queues across FUSE queues.
> > > + * This structure manages the mapping between kernel ring queues and user
> > > + * FUSE queues.
> > > + */
> > > +typedef struct FuseRingQueueManager {
> > > +    FuseRingQueue *ring_queues;
> > > +    int num_ring_queues;
> > > +    int num_fuse_queues;
> > > +} FuseRingQueueManager;
> > 
> > This isn't a manager, it's just the set of queues the export uses.
> > 
> > num_fuse_queues duplicates exp->num_queues, there is no reason for it to
> > exist. All users also have access to the FuseExport itself.
> > 
> > The other two fields can just be merged directly into FuseExport,
> > preferably renamed to uring_queues and num_uring_queues.
> > > > +#endif
> > >   /*
> > >    * One FUSE "queue", representing one FUSE FD from which requests are fetched
> > >    * and processed.  Each queue is tied to an AioContext.
> > >    */
> > > -typedef struct FuseQueue {
> > > +struct FuseQueue {
> > >       FuseExport *exp;
> > >       AioContext *ctx;
> > > @@ -109,15 +159,11 @@ typedef struct FuseQueue {
> > >        * Free this buffer with qemu_vfree().
> > >        */
> > >       void *spillover_buf;
> > > -} FuseQueue;
> > > -/*
> > > - * Verify that FuseQueue.request_buf plus the spill-over buffer together
> > > - * are big enough to be accepted by the FUSE kernel driver.
> > > - */
> > > -QEMU_BUILD_BUG_ON(sizeof(((FuseQueue *)0)->request_buf) +
> > > -                  FUSE_SPILLOVER_BUF_SIZE <
> > > -                  FUSE_MIN_READ_BUFFER);
> > > +#ifdef CONFIG_LINUX_IO_URING
> > > +    QLIST_HEAD(, FuseRingQueue) ring_queue_list;
> > > +#endif
> > > +};
> > >   struct FuseExport {
> > >       BlockExport common;
> > > @@ -133,7 +179,7 @@ struct FuseExport {
> > >        */
> > >       bool halted;
> > > -    int num_queues;
> > > +    size_t num_queues;
> > 
> > I'm not sure why this change is needed. If it is, can it be a separate
> > patch before this one, with a commit message describing the reason?
> 
> I feel there's no reason to use a signed int here, since the number of
> queues cannot be negative.

So it's unrelated to what the commit message promises, right? ("add opt
to enable FUSE-over-io_uring"). You can make it a separate cleanup patch
then.

> > >       FuseQueue *queues;
> > >       /*
> > >        * True if this export should follow the generic export's AioContext.
> > > @@ -149,6 +195,12 @@ struct FuseExport {
> > >       /* Whether allow_other was used as a mount option or not */
> > >       bool allow_other;
> > > +#ifdef CONFIG_LINUX_IO_URING
> > > +    bool is_uring;
> > > +    size_t ring_queue_depth;
> > > +    FuseRingQueueManager *ring_queue_manager;
> > > +#endif
> > > +
> > >       mode_t st_mode;
> > >       uid_t st_uid;
> > >       gid_t st_gid;
> > > @@ -205,7 +257,7 @@ static void fuse_attach_handlers(FuseExport *exp)
> > >           return;
> > >       }
> > > -    for (int i = 0; i < exp->num_queues; i++) {
> > > +    for (size_t i = 0; i < exp->num_queues; i++) {
> > >           aio_set_fd_handler(exp->queues[i].ctx, exp->queues[i].fuse_fd,
> > >                              read_from_fuse_fd, NULL, NULL, NULL,
> > >                              &exp->queues[i]);
> > > @@ -257,6 +309,189 @@ static const BlockDevOps fuse_export_blk_dev_ops = {
> > >       .drained_poll  = fuse_export_drained_poll,
> > >   };
> > > +#ifdef CONFIG_LINUX_IO_URING
> > > +static void fuse_uring_sqe_set_req_data(struct fuse_uring_cmd_req *req,
> > > +                    const unsigned int rqid,
> > > +                    const unsigned int commit_id)
> > 
> > Indentation is off here. There are two accepted styles for indentation
> > after breaking a long line in QEMU (see docs/devel/style.rst):
> > 
> > 1. Indent the next line by exactly four spaces:
> > 
> >      do_something(x, y,
> >          z);
> > 
> > 2. Align the next line with the first character after the opening
> >     parenthesis:
> > 
> >      do_something(x, y,
> >                   z);
> > 
> > The second one is the preferred one. The first one is generally only
> > used when the parenthesis is already too far right and we can't do much
> > about it.
> > 
> > > +{
> > > +    req->qid = rqid;
> > > +    req->commit_id = commit_id;
> > > +    req->flags = 0;
> > > +}
> > > +
> > > +static void fuse_uring_sqe_prepare(struct io_uring_sqe *sqe, FuseQueue *q,
> > > +               __u32 cmd_op)
> > 
> > Indentation.
> > 
> > Another option here is to keep everything before the function name on a
> > separate line, like this:
> > 
> > static void
> > fuse_uring_sqe_prepare(struct io_uring_sqe *sqe, FuseQueue *q, __u32 cmd_op)
> > 
> > This would allow the second line to stay under 80 characters.
> > 
> > > +{
> > > +    sqe->opcode = IORING_OP_URING_CMD;
> > > +
> > > +    sqe->fd = q->fuse_fd;
> > > +    sqe->rw_flags = 0;
> > > +    sqe->ioprio = 0;
> > > +    sqe->off = 0;
> > > +
> > > +    sqe->cmd_op = cmd_op;
> > > +    sqe->__pad1 = 0;
> > > +}
> > > +
> > > +static void fuse_uring_prep_sqe_register(struct io_uring_sqe *sqe, void *opaque)
> > > +{
> > > +    FuseRingEnt *ent = opaque;
> > > +    struct fuse_uring_cmd_req *req = (void *)&sqe->cmd[0];
> > > +
> > > +    fuse_uring_sqe_prepare(sqe, ent->rq->q, FUSE_IO_URING_CMD_REGISTER);
> > > +
> > > +    sqe->addr = (uint64_t)(ent->iov);
> > > +    sqe->len = 2;
> > > +
> > > +    fuse_uring_sqe_set_req_data(req, ent->rq->rqid, 0);
> > > +}
> > > +
> > > +static void fuse_uring_submit_register(void *opaque)
> > > +{
> > > +    FuseRingEnt *ent = opaque;
> > > +    FuseExport *exp = ent->rq->q->exp;
> > > +
> > > +
> > 
> > Extra empty line.
> > 
> > > +    aio_add_sqe(fuse_uring_prep_sqe_register, ent, &(ent->fuse_cqe_handler));
> > 
> > The parentheses around ent->fuse_cqe_handler are unnecessary.
> > 
> > > +}
> > > +
> > > +/**
> > > + * Distribute ring queues across FUSE queues using round-robin algorithm.
> > 
> > Hm, if this function distributes (u)ring queues, then what is
> > fuse_distribute_ring_queues() doing? Is the term overloaded with two
> > meanings?
> > 
> > > + * This ensures even distribution of kernel ring queues across user-specified
> > > + * FUSE queues.
> > > + */
> > > +static
> > > +FuseRingQueueManager *fuse_ring_queue_manager_create(int num_fuse_queues,
> > > +                                                    size_t ring_queue_depth,
> > > +                                                    size_t bufsize)
> > 
> > The right style here would be something like:
> > 
> > static FuseRingQueueManager *
> > fuse_ring_queue_manager_create(int num_fuse_queues,
> >                                 size_t ring_queue_depth,
> >                                 size_t bufsize)
> > 
> > Given that I said that there is no reason to call the set of all queues
> > a manager, or to even have it separate from FuseExport, this probably
> > becomes fuse_uring_setup_queues() or something.
> > 
> > > +{
> > > +    int num_ring_queues = get_nprocs();
> > 
> > This could use a comment saying that this is a kernel requirement at the
> > moment.
> > 
> > > +    FuseRingQueueManager *manager = g_new(FuseRingQueueManager, 1);
> > > +
> > > +    if (!manager) {
> > > +        return NULL;
> > > +    }
> > 
> > g_new() never returns NULL, it aborts on error instead, so no reason to
> > have a NULL check here.
> > 
> > > +
> > > +    manager->ring_queues = g_new(FuseRingQueue, num_ring_queues);
> > > +    manager->num_ring_queues = num_ring_queues;
> > > +    manager->num_fuse_queues = num_fuse_queues;
> > > +
> > > +    if (!manager->ring_queues) {
> > > +        g_free(manager);
> > > +        return NULL;
> > > +    }
> > 
> > This check is unnecessary for the same reason.
> > 
> > > +
> > > +    for (int i = 0; i < num_ring_queues; i++) {
> > > +        FuseRingQueue *rq = &manager->ring_queues[i];
> > > +        rq->rqid = i;
> > > +        rq->ent = g_new(FuseRingEnt, ring_queue_depth);
> > > +
> > > +        if (!rq->ent) {
> > > +            for (int j = 0; j < i; j++) {
> > > +                g_free(manager->ring_queues[j].ent);
> > > +            }
> > > +            g_free(manager->ring_queues);
> > > +            g_free(manager);
> > > +            return NULL;
> > > +        }
> > 
> > This one, too.
> > 
> > > +
> > > +        for (size_t j = 0; j < ring_queue_depth; j++) {
> > > +            FuseRingEnt *ent = &rq->ent[j];
> > > +            ent->rq = rq;
> > > +            ent->req_payload_sz = bufsize - FUSE_BUFFER_HEADER_SIZE;
> > > +            ent->op_payload = g_malloc0(ent->req_payload_sz);
> > > +
> > > +            if (!ent->op_payload) {
> > > +                for (size_t k = 0; k < j; k++) {
> > > +                    g_free(rq->ent[k].op_payload);
> > > +                }
> > > +                g_free(rq->ent);
> > > +                for (int k = 0; k < i; k++) {
> > > +                    g_free(manager->ring_queues[k].ent);
> > > +                }
> > > +                g_free(manager->ring_queues);
> > > +                g_free(manager);
> > > +                return NULL;
> > > +            }
> > 
> > And this one.
> > 
> > Removing all of them will make the function a lot more readable.
> > 
> > > +
> > > +            ent->iov[0] = (struct iovec) {
> > > +                &(ent->req_header),
> > 
> > Unnecessary parentheses.
> > 
> > > +                sizeof(struct fuse_uring_req_header)
> > > +            };
> > > +            ent->iov[1] = (struct iovec) {
> > > +                ent->op_payload,
> > > +                ent->req_payload_sz
> > > +            };
> > > +
> > > +            ent->fuse_cqe_handler.cb = fuse_uring_cqe_handler;
> > > +        }
> > > +    }
> > > +
> > > +    return manager;
> > > +}
> > > +
> > > +static
> > > +void fuse_distribute_ring_queues(FuseExport *exp, FuseRingQueueManager *manager)
> > > +{
> > > +    int queue_index = 0;
> > > +
> > > +    for (int i = 0; i < manager->num_ring_queues; i++) {
> > > +        FuseRingQueue *rq = &manager->ring_queues[i];
> > > +
> > > +        rq->q = &exp->queues[queue_index];
> > > +        QLIST_INSERT_HEAD(&(rq->q->ring_queue_list), rq, next);
> > > +
> > > +        queue_index = (queue_index + 1) % manager->num_fuse_queues;
> > > +    }
> > > +}
> > 
> > Ok, no overloaded meaning of distributing queues, but this function
> > should probably be merged with the one above. It's part of setting up
> > the queues.
> > 
> > You don't need a separate queue_index counter, you can just directly use
> > exp->queues[i % manager->num_fuse_queues].
> > 
> 
> There are two steps:
> 
> 1. Create uring queues and allocate buffers for each entry's payload.
> 
> 2. Distribute these uring queues to FUSE queues using a round-robin
> algorithm.
> 
> Given that this is only a temporary measure to allow users to define their
> own IOThreads/FUSE queues, we might later replace the second part of the
> logic. I believe it's better to separate these two pieces of logic rather
> than combining them.

But one of them doesn't make sense without the other currently. Looping
twice over all queues and doing half of their setup is harder to
understand than having a single loop and doing all of the setup.

You're right that we hope that the second half goes away eventually, but
we don't know if or when someone will actually do this. We shouldn't
structure our code so that it may make sense some time in the future if
someone extends it in the way we envision now, but so that it makes
sense and is easy to understand now.

Kevin

Re: [PATCH 1/4] export/fuse: add opt to enable FUSE-over-io_uring

Posted by Stefan Hajnoczi 3 weeks, 4 days ago

On Fri, Aug 29, 2025 at 10:50:22PM -0400, Brian Song wrote:
> This patch adds a new export option for storage-export-daemon to enable
> FUSE-over-io_uring via the switch io-uring=on|off (disabled by default).
> It also implements the protocol handshake with the Linux kernel
> during the FUSE-over-io_uring initialization phase.
> 
> See: https://docs.kernel.org/filesystems/fuse-io-uring.html
> 
> The kernel documentation describes in detail how FUSE-over-io_uring
> works. This patch implements the Initial SQE stage shown in the diagram:
> it initializes one queue per IOThread, each currently supporting a
> single submission queue entry (SQE). When the FUSE driver sends the
> first FUSE request (FUSE_INIT), storage-export-daemon calls
> fuse_uring_start() to complete initialization, ultimately submitting
> the SQE with the FUSE_IO_URING_CMD_REGISTER command to confirm
> successful initialization with the kernel.
> 
> We also added support for multiple IOThreads. The current Linux kernel
> requires registering $(nproc) queues when setting up FUSE-over-io_uring.
> To let users customize the number of FUSE Queues (i.e., IOThreads),
> we first create nproc Ring Queues as required by the kernel, then
> distribute them in a round-robin manner to the FUSE Queues for
> registration. In addition, to support multiple in-flight requests,
> we configure each Ring Queue with FUSE_DEFAULT_RING_QUEUE_DEPTH
> entries/requests.
> 
> Suggested-by: Kevin Wolf <kwolf@redhat.com>
> Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Brian Song <hibriansong@gmail.com>
> ---
>  block/export/fuse.c                  | 310 +++++++++++++++++++++++++--
>  docs/tools/qemu-storage-daemon.rst   |  11 +-
>  qapi/block-export.json               |   5 +-
>  storage-daemon/qemu-storage-daemon.c |   1 +
>  util/fdmon-io_uring.c                |   5 +-
>  5 files changed, 309 insertions(+), 23 deletions(-)
> 
> diff --git a/block/export/fuse.c b/block/export/fuse.c
> index c0ad4696ce..19bf9e5f74 100644
> --- a/block/export/fuse.c
> +++ b/block/export/fuse.c
> @@ -48,6 +48,9 @@
>  #include <linux/fs.h>
>  #endif
>  
> +/* room needed in buffer to accommodate header */
> +#define FUSE_BUFFER_HEADER_SIZE 0x1000
> +
>  /* Prevent overly long bounce buffer allocations */
>  #define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 1 * 1024 * 1024))
>  /*
> @@ -63,12 +66,59 @@
>      (FUSE_MAX_WRITE_BYTES - FUSE_IN_PLACE_WRITE_BYTES)
>  
>  typedef struct FuseExport FuseExport;
> +typedef struct FuseQueue FuseQueue;
> +
> +#ifdef CONFIG_LINUX_IO_URING
> +#define FUSE_DEFAULT_RING_QUEUE_DEPTH 64
> +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
> +
> +typedef struct FuseRingQueue FuseRingQueue;
> +typedef struct FuseRingEnt {
> +    /* back pointer */
> +    FuseRingQueue *rq;
> +
> +    /* commit id of a fuse request */
> +    uint64_t req_commit_id;
> +
> +    /* fuse request header and payload */
> +    struct fuse_uring_req_header req_header;
> +    void *op_payload;
> +    size_t req_payload_sz;
> +
> +    /* The vector passed to the kernel */
> +    struct iovec iov[2];
> +
> +    CqeHandler fuse_cqe_handler;
> +} FuseRingEnt;
> +
> +struct FuseRingQueue {
> +    int rqid;
> +
> +    /* back pointer */
> +    FuseQueue *q;
> +    FuseRingEnt *ent;
> +
> +    /* List entry for ring_queues */
> +    QLIST_ENTRY(FuseRingQueue) next;
> +};
> +
> +/*
> + * Round-robin distribution of ring queues across FUSE queues.
> + * This structure manages the mapping between kernel ring queues and user
> + * FUSE queues.
> + */
> +typedef struct FuseRingQueueManager {
> +    FuseRingQueue *ring_queues;
> +    int num_ring_queues;
> +    int num_fuse_queues;
> +} FuseRingQueueManager;
> +#endif
>  
>  /*
>   * One FUSE "queue", representing one FUSE FD from which requests are fetched
>   * and processed.  Each queue is tied to an AioContext.
>   */
> -typedef struct FuseQueue {
> +struct FuseQueue {
>      FuseExport *exp;
>  
>      AioContext *ctx;
> @@ -109,15 +159,11 @@ typedef struct FuseQueue {
>       * Free this buffer with qemu_vfree().
>       */
>      void *spillover_buf;
> -} FuseQueue;
>  
> -/*
> - * Verify that FuseQueue.request_buf plus the spill-over buffer together
> - * are big enough to be accepted by the FUSE kernel driver.
> - */
> -QEMU_BUILD_BUG_ON(sizeof(((FuseQueue *)0)->request_buf) +
> -                  FUSE_SPILLOVER_BUF_SIZE <
> -                  FUSE_MIN_READ_BUFFER);
> +#ifdef CONFIG_LINUX_IO_URING
> +    QLIST_HEAD(, FuseRingQueue) ring_queue_list;
> +#endif
> +};
>  
>  struct FuseExport {
>      BlockExport common;
> @@ -133,7 +179,7 @@ struct FuseExport {
>       */
>      bool halted;
>  
> -    int num_queues;
> +    size_t num_queues;
>      FuseQueue *queues;
>      /*
>       * True if this export should follow the generic export's AioContext.
> @@ -149,6 +195,12 @@ struct FuseExport {
>      /* Whether allow_other was used as a mount option or not */
>      bool allow_other;
>  
> +#ifdef CONFIG_LINUX_IO_URING
> +    bool is_uring;
> +    size_t ring_queue_depth;
> +    FuseRingQueueManager *ring_queue_manager;
> +#endif
> +
>      mode_t st_mode;
>      uid_t st_uid;
>      gid_t st_gid;
> @@ -205,7 +257,7 @@ static void fuse_attach_handlers(FuseExport *exp)
>          return;
>      }
>  
> -    for (int i = 0; i < exp->num_queues; i++) {
> +    for (size_t i = 0; i < exp->num_queues; i++) {
>          aio_set_fd_handler(exp->queues[i].ctx, exp->queues[i].fuse_fd,
>                             read_from_fuse_fd, NULL, NULL, NULL,
>                             &exp->queues[i]);
> @@ -257,6 +309,189 @@ static const BlockDevOps fuse_export_blk_dev_ops = {
>      .drained_poll  = fuse_export_drained_poll,
>  };
>  
> +#ifdef CONFIG_LINUX_IO_URING
> +static void fuse_uring_sqe_set_req_data(struct fuse_uring_cmd_req *req,
> +                    const unsigned int rqid,
> +                    const unsigned int commit_id)
> +{
> +    req->qid = rqid;
> +    req->commit_id = commit_id;
> +    req->flags = 0;
> +}
> +
> +static void fuse_uring_sqe_prepare(struct io_uring_sqe *sqe, FuseQueue *q,
> +               __u32 cmd_op)
> +{
> +    sqe->opcode = IORING_OP_URING_CMD;
> +
> +    sqe->fd = q->fuse_fd;
> +    sqe->rw_flags = 0;
> +    sqe->ioprio = 0;
> +    sqe->off = 0;
> +
> +    sqe->cmd_op = cmd_op;
> +    sqe->__pad1 = 0;
> +}
> +
> +static void fuse_uring_prep_sqe_register(struct io_uring_sqe *sqe, void *opaque)
> +{
> +    FuseRingEnt *ent = opaque;
> +    struct fuse_uring_cmd_req *req = (void *)&sqe->cmd[0];
> +
> +    fuse_uring_sqe_prepare(sqe, ent->rq->q, FUSE_IO_URING_CMD_REGISTER);
> +
> +    sqe->addr = (uint64_t)(ent->iov);
> +    sqe->len = 2;
> +
> +    fuse_uring_sqe_set_req_data(req, ent->rq->rqid, 0);
> +}
> +
> +static void fuse_uring_submit_register(void *opaque)
> +{
> +    FuseRingEnt *ent = opaque;
> +    FuseExport *exp = ent->rq->q->exp;
> +
> +
> +    aio_add_sqe(fuse_uring_prep_sqe_register, ent, &(ent->fuse_cqe_handler));
> +}
> +
> +/**
> + * Distribute ring queues across FUSE queues using round-robin algorithm.
> + * This ensures even distribution of kernel ring queues across user-specified
> + * FUSE queues.
> + */
> +static
> +FuseRingQueueManager *fuse_ring_queue_manager_create(int num_fuse_queues,
> +                                                    size_t ring_queue_depth,
> +                                                    size_t bufsize)
> +{
> +    int num_ring_queues = get_nprocs();
> +    FuseRingQueueManager *manager = g_new(FuseRingQueueManager, 1);
> +
> +    if (!manager) {
> +        return NULL;
> +    }
> +
> +    manager->ring_queues = g_new(FuseRingQueue, num_ring_queues);
> +    manager->num_ring_queues = num_ring_queues;
> +    manager->num_fuse_queues = num_fuse_queues;
> +
> +    if (!manager->ring_queues) {
> +        g_free(manager);
> +        return NULL;
> +    }
> +
> +    for (int i = 0; i < num_ring_queues; i++) {
> +        FuseRingQueue *rq = &manager->ring_queues[i];
> +        rq->rqid = i;
> +        rq->ent = g_new(FuseRingEnt, ring_queue_depth);
> +
> +        if (!rq->ent) {
> +            for (int j = 0; j < i; j++) {
> +                g_free(manager->ring_queues[j].ent);
> +            }
> +            g_free(manager->ring_queues);
> +            g_free(manager);
> +            return NULL;
> +        }
> +
> +        for (size_t j = 0; j < ring_queue_depth; j++) {
> +            FuseRingEnt *ent = &rq->ent[j];
> +            ent->rq = rq;
> +            ent->req_payload_sz = bufsize - FUSE_BUFFER_HEADER_SIZE;
> +            ent->op_payload = g_malloc0(ent->req_payload_sz);
> +
> +            if (!ent->op_payload) {
> +                for (size_t k = 0; k < j; k++) {
> +                    g_free(rq->ent[k].op_payload);
> +                }
> +                g_free(rq->ent);
> +                for (int k = 0; k < i; k++) {
> +                    g_free(manager->ring_queues[k].ent);
> +                }
> +                g_free(manager->ring_queues);
> +                g_free(manager);
> +                return NULL;
> +            }
> +
> +            ent->iov[0] = (struct iovec) {
> +                &(ent->req_header),
> +                sizeof(struct fuse_uring_req_header)
> +            };
> +            ent->iov[1] = (struct iovec) {
> +                ent->op_payload,
> +                ent->req_payload_sz
> +            };
> +
> +            ent->fuse_cqe_handler.cb = fuse_uring_cqe_handler;

I just noticed this commit won't compile because
fuse_uring_cqe_handler() is introduced in the next commit. There are
several options for resolving this. I suggest squashing the next commit
into this one.

The reason why every commit must compile is that git-bisect(1) is only
useful when the code compiles and passes tests at every commit. If there
are broken commits then bisection becomes impractical because you have
to troubleshoot intermediate commits that may be broken due to issues
unrelated to your bisection.

> +        }
> +    }
> +
> +    return manager;
> +}
> +
> +static
> +void fuse_distribute_ring_queues(FuseExport *exp, FuseRingQueueManager *manager)
> +{
> +    int queue_index = 0;
> +
> +    for (int i = 0; i < manager->num_ring_queues; i++) {
> +        FuseRingQueue *rq = &manager->ring_queues[i];
> +
> +        rq->q = &exp->queues[queue_index];
> +        QLIST_INSERT_HEAD(&(rq->q->ring_queue_list), rq, next);
> +
> +        queue_index = (queue_index + 1) % manager->num_fuse_queues;
> +    }
> +}
> +
> +static
> +void fuse_schedule_ring_queue_registrations(FuseExport *exp,
> +                                            FuseRingQueueManager *manager)
> +{
> +    for (int i = 0; i < manager->num_fuse_queues; i++) {
> +        FuseQueue *q = &exp->queues[i];
> +        FuseRingQueue *rq;
> +
> +        QLIST_FOREACH(rq, &q->ring_queue_list, next) {
> +            for (int j = 0; j < exp->ring_queue_depth; j++) {
> +                aio_bh_schedule_oneshot(q->ctx, fuse_uring_submit_register,
> +                                        &(rq->ent[j]));
> +            }
> +        }
> +    }
> +}
> +
> +static void fuse_uring_start(FuseExport *exp, struct fuse_init_out *out)
> +{
> +    /*
> +     * Since we didn't enable the FUSE_MAX_PAGES feature, the value of
> +     * fc->max_pages should be FUSE_DEFAULT_MAX_PAGES_PER_REQ, which is set by
> +     * the kernel by default. Also, max_write should not exceed
> +     * FUSE_DEFAULT_MAX_PAGES_PER_REQ * PAGE_SIZE.
> +     */
> +    size_t bufsize = out->max_write + FUSE_BUFFER_HEADER_SIZE;
> +
> +    if (!(out->flags & FUSE_MAX_PAGES)) {
> +        bufsize = FUSE_DEFAULT_MAX_PAGES_PER_REQ * qemu_real_host_page_size()
> +                         + FUSE_BUFFER_HEADER_SIZE;
> +    }
> +
> +    exp->ring_queue_manager = fuse_ring_queue_manager_create(
> +        exp->num_queues, exp->ring_queue_depth, bufsize);
> +
> +    if (!exp->ring_queue_manager) {
> +        error_report("Failed to create ring queue manager");
> +        return;
> +    }
> +
> +    /* Distribute ring queues across FUSE queues using round-robin */
> +    fuse_distribute_ring_queues(exp, exp->ring_queue_manager);
> +
> +    fuse_schedule_ring_queue_registrations(exp, exp->ring_queue_manager);
> +}
> +#endif
> +
>  static int fuse_export_create(BlockExport *blk_exp,
>                                BlockExportOptions *blk_exp_args,
>                                AioContext *const *multithread,
> @@ -270,6 +505,11 @@ static int fuse_export_create(BlockExport *blk_exp,
>  
>      assert(blk_exp_args->type == BLOCK_EXPORT_TYPE_FUSE);
>  
> +#ifdef CONFIG_LINUX_IO_URING
> +    exp->is_uring = args->io_uring;
> +    exp->ring_queue_depth = FUSE_DEFAULT_RING_QUEUE_DEPTH;
> +#endif
> +
>      if (multithread) {
>          /* Guaranteed by common export code */
>          assert(mt_count >= 1);
> @@ -283,6 +523,10 @@ static int fuse_export_create(BlockExport *blk_exp,
>                  .exp = exp,
>                  .ctx = multithread[i],
>                  .fuse_fd = -1,
> +#ifdef CONFIG_LINUX_IO_URING
> +                .ring_queue_list =
> +                    QLIST_HEAD_INITIALIZER(exp->queues[i].ring_queue_list),
> +#endif
>              };
>          }
>      } else {
> @@ -296,6 +540,10 @@ static int fuse_export_create(BlockExport *blk_exp,
>              .exp = exp,
>              .ctx = exp->common.ctx,
>              .fuse_fd = -1,
> +#ifdef CONFIG_LINUX_IO_URING
> +            .ring_queue_list =
> +                QLIST_HEAD_INITIALIZER(exp->queues[0].ring_queue_list),
> +#endif
>          };
>      }
>  
> @@ -685,17 +933,39 @@ static bool is_regular_file(const char *path, Error **errp)
>   */
>  static ssize_t coroutine_fn
>  fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
> -             uint32_t max_readahead, uint32_t flags)
> +             uint32_t max_readahead, const struct fuse_init_in *in)
>  {
> -    const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO;
> +    uint64_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO
> +                                     | FUSE_INIT_EXT;
> +    uint64_t outargflags = 0;
> +    uint64_t inargflags = in->flags;
> +
> +    ssize_t ret = 0;
> +
> +    if (inargflags & FUSE_INIT_EXT) {
> +        inargflags = inargflags | (uint64_t) in->flags2 << 32;
> +    }
> +
> +#ifdef CONFIG_LINUX_IO_URING
> +    if (exp->is_uring) {
> +        if (inargflags & FUSE_OVER_IO_URING) {
> +            supported_flags |= FUSE_OVER_IO_URING;
> +        } else {
> +            exp->is_uring = false;
> +            ret = -ENODEV;
> +        }
> +    }
> +#endif
> +
> +    outargflags = inargflags & supported_flags;
>  
>      *out = (struct fuse_init_out) {
>          .major = FUSE_KERNEL_VERSION,
>          .minor = FUSE_KERNEL_MINOR_VERSION,
>          .max_readahead = max_readahead,
>          .max_write = FUSE_MAX_WRITE_BYTES,
> -        .flags = flags & supported_flags,
> -        .flags2 = 0,
> +        .flags = outargflags,
> +        .flags2 = outargflags >> 32,
>  
>          /* libfuse maximum: 2^16 - 1 */
>          .max_background = UINT16_MAX,
> @@ -717,7 +987,7 @@ fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
>          .map_alignment = 0,
>      };
>  
> -    return sizeof(*out);
> +    return ret < 0 ? ret : sizeof(*out);
>  }
>  
>  /**
> @@ -1506,6 +1776,14 @@ fuse_co_process_request(FuseQueue *q, void *spillover_buf)
>          fuse_write_buf_response(q->fuse_fd, req_id, out_hdr,
>                                  out_data_buffer, ret);
>          qemu_vfree(out_data_buffer);
> +#ifdef CONFIG_LINUX_IO_URING
> +    /* Handle FUSE-over-io_uring initialization */
> +    if (unlikely(opcode == FUSE_INIT && exp->is_uring)) {
> +        struct fuse_init_out *out =
> +            (struct fuse_init_out *)FUSE_OUT_OP_STRUCT(out_buf);
> +        fuse_uring_start(exp, out);
> +    }
> +#endif
>      } else {
>          fuse_write_response(q->fuse_fd, req_id, out_hdr,
>                              ret < 0 ? ret : 0,
> diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst
> index 35ab2d7807..c5076101e0 100644
> --- a/docs/tools/qemu-storage-daemon.rst
> +++ b/docs/tools/qemu-storage-daemon.rst
> @@ -78,7 +78,7 @@ Standard options:
>  .. option:: --export [type=]nbd,id=<id>,node-name=<node-name>[,name=<export-name>][,writable=on|off][,bitmap=<name>]
>    --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=unix,addr.path=<socket-path>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
>    --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=fd,addr.str=<fd>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
> -  --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto]
> +  --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto][,io-uring=on|off]
>    --export [type=]vduse-blk,id=<id>,node-name=<node-name>,name=<vduse-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>][,serial=<serial-number>]
>  
>    is a block export definition. ``node-name`` is the block node that should be
> @@ -111,10 +111,11 @@ Standard options:
>    that enabling this option as a non-root user requires enabling the
>    user_allow_other option in the global fuse.conf configuration file.  Setting
>    ``allow-other`` to auto (the default) will try enabling this option, and on
> -  error fall back to disabling it.
> -
> -  The ``vduse-blk`` export type takes a ``name`` (must be unique across the host)
> -  to create the VDUSE device.
> +  error fall back to disabling it. Once ``io-uring`` is enabled (off by default),
> +  the FUSE-over-io_uring-related settings will be initialized to bypass the
> +  traditional /dev/fuse communication mechanism and instead use io_uring to
> +  handle FUSE operations. The ``vduse-blk`` export type takes a ``name``
> +  (must be unique across the host) to create the VDUSE device.
>    ``num-queues`` sets the number of virtqueues (the default is 1).
>    ``queue-size`` sets the virtqueue descriptor table size (the default is 256).
>  
> diff --git a/qapi/block-export.json b/qapi/block-export.json
> index 9ae703ad01..37f2fc47e2 100644
> --- a/qapi/block-export.json
> +++ b/qapi/block-export.json
> @@ -184,12 +184,15 @@
>  #     mount the export with allow_other, and if that fails, try again
>  #     without.  (since 6.1; default: auto)
>  #
> +# @io-uring: Use FUSE-over-io-uring.  (since 10.2; default: false)
> +#
>  # Since: 6.0
>  ##
>  { 'struct': 'BlockExportOptionsFuse',
>    'data': { 'mountpoint': 'str',
>              '*growable': 'bool',
> -            '*allow-other': 'FuseExportAllowOther' },
> +            '*allow-other': 'FuseExportAllowOther',
> +            '*io-uring': 'bool' },
>    'if': 'CONFIG_FUSE' }
>  
>  ##
> diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c
> index eb72561358..0cd4cd2b58 100644
> --- a/storage-daemon/qemu-storage-daemon.c
> +++ b/storage-daemon/qemu-storage-daemon.c
> @@ -107,6 +107,7 @@ static void help(void)
>  #ifdef CONFIG_FUSE
>  "  --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>\n"
>  "           [,growable=on|off][,writable=on|off][,allow-other=on|off|auto]\n"
> +"           [,io-uring=on|off]"
>  "                         export the specified block node over FUSE\n"
>  "\n"
>  #endif /* CONFIG_FUSE */
> diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
> index d2433d1d99..68d3fe8e01 100644
> --- a/util/fdmon-io_uring.c
> +++ b/util/fdmon-io_uring.c
> @@ -452,10 +452,13 @@ static const FDMonOps fdmon_io_uring_ops = {
>  void fdmon_io_uring_setup(AioContext *ctx, Error **errp)
>  {
>      int ret;
> +    int flags;
>  
>      ctx->io_uring_fd_tag = NULL;
> +    flags = IORING_SETUP_SQE128;
>  
> -    ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
> +    ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES,
> +                            &ctx->fdmon_io_uring, flags);
>      if (ret != 0) {
>          error_setg_errno(errp, -ret, "Failed to initialize io_uring");
>          return;
> -- 
> 2.45.2
>

Re: [PATCH 1/4] export/fuse: add opt to enable FUSE-over-io_uring

Posted by Stefan Hajnoczi 3 weeks, 4 days ago

On Fri, Aug 29, 2025 at 10:50:22PM -0400, Brian Song wrote:
> This patch adds a new export option for storage-export-daemon to enable
> FUSE-over-io_uring via the switch io-uring=on|off (disabled by default).
> It also implements the protocol handshake with the Linux kernel
> during the FUSE-over-io_uring initialization phase.
> 
> See: https://docs.kernel.org/filesystems/fuse-io-uring.html
> 
> The kernel documentation describes in detail how FUSE-over-io_uring
> works. This patch implements the Initial SQE stage shown in the diagram:
> it initializes one queue per IOThread, each currently supporting a
> single submission queue entry (SQE). When the FUSE driver sends the
> first FUSE request (FUSE_INIT), storage-export-daemon calls
> fuse_uring_start() to complete initialization, ultimately submitting
> the SQE with the FUSE_IO_URING_CMD_REGISTER command to confirm
> successful initialization with the kernel.
> 
> We also added support for multiple IOThreads. The current Linux kernel
> requires registering $(nproc) queues when setting up FUSE-over-io_uring.
> To let users customize the number of FUSE Queues (i.e., IOThreads),
> we first create nproc Ring Queues as required by the kernel, then
> distribute them in a round-robin manner to the FUSE Queues for
> registration. In addition, to support multiple in-flight requests,
> we configure each Ring Queue with FUSE_DEFAULT_RING_QUEUE_DEPTH
> entries/requests.

The previous paragraph says "each currently supporting a single
submission queue entry (SQE)" whereas this paragraph says "we configure
each Ring Queue with FUSE_DEFAULT_RING_QUEUE_DEPTH entries/requests".
Maybe this paragraph was squashed into the commit description in a later
step and the previous paragraph can be updated to reflect that multiple
SQEs are submitted?

> 
> Suggested-by: Kevin Wolf <kwolf@redhat.com>
> Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Brian Song <hibriansong@gmail.com>
> ---
>  block/export/fuse.c                  | 310 +++++++++++++++++++++++++--
>  docs/tools/qemu-storage-daemon.rst   |  11 +-
>  qapi/block-export.json               |   5 +-
>  storage-daemon/qemu-storage-daemon.c |   1 +
>  util/fdmon-io_uring.c                |   5 +-
>  5 files changed, 309 insertions(+), 23 deletions(-)
> 
> diff --git a/block/export/fuse.c b/block/export/fuse.c
> index c0ad4696ce..19bf9e5f74 100644
> --- a/block/export/fuse.c
> +++ b/block/export/fuse.c
> @@ -48,6 +48,9 @@
>  #include <linux/fs.h>
>  #endif
>  
> +/* room needed in buffer to accommodate header */
> +#define FUSE_BUFFER_HEADER_SIZE 0x1000

Is it possible to write this in a way that shows how the constant is
calculated? That way the constant would automatically adjust on systems
where the underlying assumptions have changed (e.g. page size, header
struct size). This approach is also self-documenting so it's possible to
understand where the magic number comes from.

For example:

  #define FUSE_BUFFER_HEADER_SIZE ROUND_UP(sizeof(struct fuse_uring_req_header), qemu_real_host_page_size())

(I'm guessing what the formula you used is, so this example may be
incorrect...)

> +
>  /* Prevent overly long bounce buffer allocations */
>  #define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 1 * 1024 * 1024))
>  /*
> @@ -63,12 +66,59 @@
>      (FUSE_MAX_WRITE_BYTES - FUSE_IN_PLACE_WRITE_BYTES)
>  
>  typedef struct FuseExport FuseExport;
> +typedef struct FuseQueue FuseQueue;
> +
> +#ifdef CONFIG_LINUX_IO_URING
> +#define FUSE_DEFAULT_RING_QUEUE_DEPTH 64
> +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
> +
> +typedef struct FuseRingQueue FuseRingQueue;
> +typedef struct FuseRingEnt {
> +    /* back pointer */
> +    FuseRingQueue *rq;
> +
> +    /* commit id of a fuse request */
> +    uint64_t req_commit_id;

This field is not used in this commit. Please introduce it in the commit
that uses it so it's easier to review and understand the purpose of this
field.

> +
> +    /* fuse request header and payload */
> +    struct fuse_uring_req_header req_header;
> +    void *op_payload;
> +    size_t req_payload_sz;

op_payload and req_payload_sz refer to the same buffer, and they are
submitted alongside req_header. It would be nice to name the fields
consistently:

  struct fuse_uring_req_header req_header;
  void *req_payload;
  size_t req_payload_sz;

req_payload and req_payload_sz could be eliminated since they are also
stored in iov[1].iov_base and .iov_len. If you feel that would be harder
to understand, then it's okay to keep the duplicate fields.

> +
> +    /* The vector passed to the kernel */
> +    struct iovec iov[2];
> +
> +    CqeHandler fuse_cqe_handler;
> +} FuseRingEnt;
> +
> +struct FuseRingQueue {

A comment would be nice here to explain that the kernel requires one
FuseRingQueue per host CPU and this concept is independent of /dev/fuse
(FuseQueue).

> +    int rqid;
> +
> +    /* back pointer */
> +    FuseQueue *q;
> +    FuseRingEnt *ent;
> +
> +    /* List entry for ring_queues */
> +    QLIST_ENTRY(FuseRingQueue) next;
> +};
> +
> +/*
> + * Round-robin distribution of ring queues across FUSE queues.
> + * This structure manages the mapping between kernel ring queues and user
> + * FUSE queues.
> + */
> +typedef struct FuseRingQueueManager {
> +    FuseRingQueue *ring_queues;
> +    int num_ring_queues;
> +    int num_fuse_queues;
> +} FuseRingQueueManager;
> +#endif

It's easy to forget which #ifdef we're inside after a few lines, so it
helps to indicate that in a comment:

#endif /* CONFIG_LINUX_IO_URING */

>  
>  /*
>   * One FUSE "queue", representing one FUSE FD from which requests are fetched
>   * and processed.  Each queue is tied to an AioContext.
>   */
> -typedef struct FuseQueue {
> +struct FuseQueue {
>      FuseExport *exp;
>  
>      AioContext *ctx;
> @@ -109,15 +159,11 @@ typedef struct FuseQueue {
>       * Free this buffer with qemu_vfree().
>       */
>      void *spillover_buf;
> -} FuseQueue;
>  
> -/*
> - * Verify that FuseQueue.request_buf plus the spill-over buffer together
> - * are big enough to be accepted by the FUSE kernel driver.
> - */
> -QEMU_BUILD_BUG_ON(sizeof(((FuseQueue *)0)->request_buf) +
> -                  FUSE_SPILLOVER_BUF_SIZE <
> -                  FUSE_MIN_READ_BUFFER);

Why was this removed? It is probably still necessary in the non-io_uring
case (which is compiled in even when CONFIG_LINUX_IO_URING is defined).

> +#ifdef CONFIG_LINUX_IO_URING
> +    QLIST_HEAD(, FuseRingQueue) ring_queue_list;
> +#endif
> +};
>  
>  struct FuseExport {
>      BlockExport common;
> @@ -133,7 +179,7 @@ struct FuseExport {
>       */
>      bool halted;
>  
> -    int num_queues;
> +    size_t num_queues;
>      FuseQueue *queues;
>      /*
>       * True if this export should follow the generic export's AioContext.
> @@ -149,6 +195,12 @@ struct FuseExport {
>      /* Whether allow_other was used as a mount option or not */
>      bool allow_other;
>  
> +#ifdef CONFIG_LINUX_IO_URING
> +    bool is_uring;
> +    size_t ring_queue_depth;
> +    FuseRingQueueManager *ring_queue_manager;
> +#endif
> +
>      mode_t st_mode;
>      uid_t st_uid;
>      gid_t st_gid;
> @@ -205,7 +257,7 @@ static void fuse_attach_handlers(FuseExport *exp)
>          return;
>      }
>  
> -    for (int i = 0; i < exp->num_queues; i++) {
> +    for (size_t i = 0; i < exp->num_queues; i++) {
>          aio_set_fd_handler(exp->queues[i].ctx, exp->queues[i].fuse_fd,
>                             read_from_fuse_fd, NULL, NULL, NULL,
>                             &exp->queues[i]);
> @@ -257,6 +309,189 @@ static const BlockDevOps fuse_export_blk_dev_ops = {
>      .drained_poll  = fuse_export_drained_poll,
>  };
>  
> +#ifdef CONFIG_LINUX_IO_URING
> +static void fuse_uring_sqe_set_req_data(struct fuse_uring_cmd_req *req,
> +                    const unsigned int rqid,
> +                    const unsigned int commit_id)
> +{
> +    req->qid = rqid;
> +    req->commit_id = commit_id;
> +    req->flags = 0;
> +}
> +
> +static void fuse_uring_sqe_prepare(struct io_uring_sqe *sqe, FuseQueue *q,
> +               __u32 cmd_op)
> +{
> +    sqe->opcode = IORING_OP_URING_CMD;
> +
> +    sqe->fd = q->fuse_fd;
> +    sqe->rw_flags = 0;
> +    sqe->ioprio = 0;
> +    sqe->off = 0;
> +
> +    sqe->cmd_op = cmd_op;
> +    sqe->__pad1 = 0;
> +}
> +
> +static void fuse_uring_prep_sqe_register(struct io_uring_sqe *sqe, void *opaque)
> +{
> +    FuseRingEnt *ent = opaque;
> +    struct fuse_uring_cmd_req *req = (void *)&sqe->cmd[0];
> +
> +    fuse_uring_sqe_prepare(sqe, ent->rq->q, FUSE_IO_URING_CMD_REGISTER);
> +
> +    sqe->addr = (uint64_t)(ent->iov);
> +    sqe->len = 2;
> +
> +    fuse_uring_sqe_set_req_data(req, ent->rq->rqid, 0);
> +}
> +
> +static void fuse_uring_submit_register(void *opaque)
> +{
> +    FuseRingEnt *ent = opaque;
> +    FuseExport *exp = ent->rq->q->exp;

This variable is unused in this commit? Does this commit compile for
you? Usually the compiler warns about unused variables.

> +
> +
> +    aio_add_sqe(fuse_uring_prep_sqe_register, ent, &(ent->fuse_cqe_handler));
> +}
> +
> +/**
> + * Distribute ring queues across FUSE queues using round-robin algorithm.
> + * This ensures even distribution of kernel ring queues across user-specified
> + * FUSE queues.
> + */
> +static
> +FuseRingQueueManager *fuse_ring_queue_manager_create(int num_fuse_queues,
> +                                                    size_t ring_queue_depth,
> +                                                    size_t bufsize)
> +{
> +    int num_ring_queues = get_nprocs();

The kernel code uses num_possible_cpus() in
fs/fuse/dev_uring.c:fuse_uring_create() so I think this should be
get_nprocs_conf() instead of get_nprocs().

> +    FuseRingQueueManager *manager = g_new(FuseRingQueueManager, 1);
> +
> +    if (!manager) {

g_new() never returns NULL, so you can remove this if statement. If
memory cannot be allocated then the process will abort.

> +        return NULL;
> +    }
> +
> +    manager->ring_queues = g_new(FuseRingQueue, num_ring_queues);
> +    manager->num_ring_queues = num_ring_queues;
> +    manager->num_fuse_queues = num_fuse_queues;
> +
> +    if (!manager->ring_queues) {

Same here.

> +        g_free(manager);
> +        return NULL;
> +    }
> +
> +    for (int i = 0; i < num_ring_queues; i++) {
> +        FuseRingQueue *rq = &manager->ring_queues[i];
> +        rq->rqid = i;
> +        rq->ent = g_new(FuseRingEnt, ring_queue_depth);
> +
> +        if (!rq->ent) {

Same here.

> +            for (int j = 0; j < i; j++) {
> +                g_free(manager->ring_queues[j].ent);
> +            }
> +            g_free(manager->ring_queues);
> +            g_free(manager);
> +            return NULL;
> +        }
> +
> +        for (size_t j = 0; j < ring_queue_depth; j++) {
> +            FuseRingEnt *ent = &rq->ent[j];
> +            ent->rq = rq;
> +            ent->req_payload_sz = bufsize - FUSE_BUFFER_HEADER_SIZE;
> +            ent->op_payload = g_malloc0(ent->req_payload_sz);
> +
> +            if (!ent->op_payload) {

Same here.

> +                for (size_t k = 0; k < j; k++) {
> +                    g_free(rq->ent[k].op_payload);
> +                }
> +                g_free(rq->ent);
> +                for (int k = 0; k < i; k++) {
> +                    g_free(manager->ring_queues[k].ent);
> +                }
> +                g_free(manager->ring_queues);
> +                g_free(manager);

Where are these structures freed in the normal lifecycle of a FUSE
export? I only see this error handling code, but nothing is freed when
the export is shut down.

> +                return NULL;
> +            }
> +
> +            ent->iov[0] = (struct iovec) {
> +                &(ent->req_header),
> +                sizeof(struct fuse_uring_req_header)
> +            };
> +            ent->iov[1] = (struct iovec) {
> +                ent->op_payload,
> +                ent->req_payload_sz
> +            };
> +
> +            ent->fuse_cqe_handler.cb = fuse_uring_cqe_handler;
> +        }
> +    }
> +
> +    return manager;
> +}
> +
> +static
> +void fuse_distribute_ring_queues(FuseExport *exp, FuseRingQueueManager *manager)
> +{
> +    int queue_index = 0;
> +
> +    for (int i = 0; i < manager->num_ring_queues; i++) {
> +        FuseRingQueue *rq = &manager->ring_queues[i];
> +
> +        rq->q = &exp->queues[queue_index];
> +        QLIST_INSERT_HEAD(&(rq->q->ring_queue_list), rq, next);
> +
> +        queue_index = (queue_index + 1) % manager->num_fuse_queues;
> +    }
> +}
> +
> +static
> +void fuse_schedule_ring_queue_registrations(FuseExport *exp,
> +                                            FuseRingQueueManager *manager)
> +{
> +    for (int i = 0; i < manager->num_fuse_queues; i++) {
> +        FuseQueue *q = &exp->queues[i];
> +        FuseRingQueue *rq;
> +
> +        QLIST_FOREACH(rq, &q->ring_queue_list, next) {
> +            for (int j = 0; j < exp->ring_queue_depth; j++) {
> +                aio_bh_schedule_oneshot(q->ctx, fuse_uring_submit_register,
> +                                        &(rq->ent[j]));
> +            }
> +        }
> +    }
> +}
> +
> +static void fuse_uring_start(FuseExport *exp, struct fuse_init_out *out)
> +{
> +    /*
> +     * Since we didn't enable the FUSE_MAX_PAGES feature, the value of
> +     * fc->max_pages should be FUSE_DEFAULT_MAX_PAGES_PER_REQ, which is set by
> +     * the kernel by default. Also, max_write should not exceed
> +     * FUSE_DEFAULT_MAX_PAGES_PER_REQ * PAGE_SIZE.
> +     */
> +    size_t bufsize = out->max_write + FUSE_BUFFER_HEADER_SIZE;
> +
> +    if (!(out->flags & FUSE_MAX_PAGES)) {
> +        bufsize = FUSE_DEFAULT_MAX_PAGES_PER_REQ * qemu_real_host_page_size()
> +                         + FUSE_BUFFER_HEADER_SIZE;
> +    }
> +
> +    exp->ring_queue_manager = fuse_ring_queue_manager_create(
> +        exp->num_queues, exp->ring_queue_depth, bufsize);
> +
> +    if (!exp->ring_queue_manager) {
> +        error_report("Failed to create ring queue manager");
> +        return;
> +    }
> +
> +    /* Distribute ring queues across FUSE queues using round-robin */
> +    fuse_distribute_ring_queues(exp, exp->ring_queue_manager);
> +
> +    fuse_schedule_ring_queue_registrations(exp, exp->ring_queue_manager);
> +}
> +#endif
> +
>  static int fuse_export_create(BlockExport *blk_exp,
>                                BlockExportOptions *blk_exp_args,
>                                AioContext *const *multithread,
> @@ -270,6 +505,11 @@ static int fuse_export_create(BlockExport *blk_exp,
>  
>      assert(blk_exp_args->type == BLOCK_EXPORT_TYPE_FUSE);
>  
> +#ifdef CONFIG_LINUX_IO_URING
> +    exp->is_uring = args->io_uring;
> +    exp->ring_queue_depth = FUSE_DEFAULT_RING_QUEUE_DEPTH;
> +#endif
> +
>      if (multithread) {
>          /* Guaranteed by common export code */
>          assert(mt_count >= 1);
> @@ -283,6 +523,10 @@ static int fuse_export_create(BlockExport *blk_exp,
>                  .exp = exp,
>                  .ctx = multithread[i],
>                  .fuse_fd = -1,
> +#ifdef CONFIG_LINUX_IO_URING
> +                .ring_queue_list =
> +                    QLIST_HEAD_INITIALIZER(exp->queues[i].ring_queue_list),
> +#endif
>              };
>          }
>      } else {
> @@ -296,6 +540,10 @@ static int fuse_export_create(BlockExport *blk_exp,
>              .exp = exp,
>              .ctx = exp->common.ctx,
>              .fuse_fd = -1,
> +#ifdef CONFIG_LINUX_IO_URING
> +            .ring_queue_list =
> +                QLIST_HEAD_INITIALIZER(exp->queues[0].ring_queue_list),
> +#endif
>          };
>      }
>  
> @@ -685,17 +933,39 @@ static bool is_regular_file(const char *path, Error **errp)
>   */
>  static ssize_t coroutine_fn
>  fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
> -             uint32_t max_readahead, uint32_t flags)
> +             uint32_t max_readahead, const struct fuse_init_in *in)
>  {
> -    const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO;
> +    uint64_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO
> +                                     | FUSE_INIT_EXT;
> +    uint64_t outargflags = 0;
> +    uint64_t inargflags = in->flags;
> +
> +    ssize_t ret = 0;
> +
> +    if (inargflags & FUSE_INIT_EXT) {
> +        inargflags = inargflags | (uint64_t) in->flags2 << 32;
> +    }
> +
> +#ifdef CONFIG_LINUX_IO_URING
> +    if (exp->is_uring) {
> +        if (inargflags & FUSE_OVER_IO_URING) {
> +            supported_flags |= FUSE_OVER_IO_URING;
> +        } else {
> +            exp->is_uring = false;
> +            ret = -ENODEV;
> +        }
> +    }
> +#endif
> +
> +    outargflags = inargflags & supported_flags;
>  
>      *out = (struct fuse_init_out) {
>          .major = FUSE_KERNEL_VERSION,
>          .minor = FUSE_KERNEL_MINOR_VERSION,
>          .max_readahead = max_readahead,
>          .max_write = FUSE_MAX_WRITE_BYTES,
> -        .flags = flags & supported_flags,
> -        .flags2 = 0,
> +        .flags = outargflags,
> +        .flags2 = outargflags >> 32,
>  
>          /* libfuse maximum: 2^16 - 1 */
>          .max_background = UINT16_MAX,
> @@ -717,7 +987,7 @@ fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
>          .map_alignment = 0,
>      };
>  
> -    return sizeof(*out);
> +    return ret < 0 ? ret : sizeof(*out);
>  }
>  
>  /**
> @@ -1506,6 +1776,14 @@ fuse_co_process_request(FuseQueue *q, void *spillover_buf)
>          fuse_write_buf_response(q->fuse_fd, req_id, out_hdr,
>                                  out_data_buffer, ret);
>          qemu_vfree(out_data_buffer);
> +#ifdef CONFIG_LINUX_IO_URING
> +    /* Handle FUSE-over-io_uring initialization */
> +    if (unlikely(opcode == FUSE_INIT && exp->is_uring)) {
> +        struct fuse_init_out *out =
> +            (struct fuse_init_out *)FUSE_OUT_OP_STRUCT(out_buf);
> +        fuse_uring_start(exp, out);

Is there any scenario where FUSE_INIT can be received multiple times?
Maybe if the FUSE file system is umounted and mounted again? I want to
check that this doesn't leak previously allocated ring state.

> +    }
> +#endif
>      } else {
>          fuse_write_response(q->fuse_fd, req_id, out_hdr,
>                              ret < 0 ? ret : 0,
> diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst
> index 35ab2d7807..c5076101e0 100644
> --- a/docs/tools/qemu-storage-daemon.rst
> +++ b/docs/tools/qemu-storage-daemon.rst
> @@ -78,7 +78,7 @@ Standard options:
>  .. option:: --export [type=]nbd,id=<id>,node-name=<node-name>[,name=<export-name>][,writable=on|off][,bitmap=<name>]
>    --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=unix,addr.path=<socket-path>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
>    --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=fd,addr.str=<fd>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
> -  --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto]
> +  --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto][,io-uring=on|off]
>    --export [type=]vduse-blk,id=<id>,node-name=<node-name>,name=<vduse-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>][,serial=<serial-number>]
>  
>    is a block export definition. ``node-name`` is the block node that should be
> @@ -111,10 +111,11 @@ Standard options:
>    that enabling this option as a non-root user requires enabling the
>    user_allow_other option in the global fuse.conf configuration file.  Setting
>    ``allow-other`` to auto (the default) will try enabling this option, and on
> -  error fall back to disabling it.
> -
> -  The ``vduse-blk`` export type takes a ``name`` (must be unique across the host)
> -  to create the VDUSE device.
> +  error fall back to disabling it. Once ``io-uring`` is enabled (off by default),
> +  the FUSE-over-io_uring-related settings will be initialized to bypass the
> +  traditional /dev/fuse communication mechanism and instead use io_uring to
> +  handle FUSE operations. The ``vduse-blk`` export type takes a ``name``
> +  (must be unique across the host) to create the VDUSE device.
>    ``num-queues`` sets the number of virtqueues (the default is 1).
>    ``queue-size`` sets the virtqueue descriptor table size (the default is 256).
>  
> diff --git a/qapi/block-export.json b/qapi/block-export.json
> index 9ae703ad01..37f2fc47e2 100644
> --- a/qapi/block-export.json
> +++ b/qapi/block-export.json
> @@ -184,12 +184,15 @@
>  #     mount the export with allow_other, and if that fails, try again
>  #     without.  (since 6.1; default: auto)
>  #
> +# @io-uring: Use FUSE-over-io-uring.  (since 10.2; default: false)
> +#
>  # Since: 6.0
>  ##
>  { 'struct': 'BlockExportOptionsFuse',
>    'data': { 'mountpoint': 'str',
>              '*growable': 'bool',
> -            '*allow-other': 'FuseExportAllowOther' },
> +            '*allow-other': 'FuseExportAllowOther',
> +            '*io-uring': 'bool' },
>    'if': 'CONFIG_FUSE' }
>  
>  ##
> diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c
> index eb72561358..0cd4cd2b58 100644
> --- a/storage-daemon/qemu-storage-daemon.c
> +++ b/storage-daemon/qemu-storage-daemon.c
> @@ -107,6 +107,7 @@ static void help(void)
>  #ifdef CONFIG_FUSE
>  "  --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>\n"
>  "           [,growable=on|off][,writable=on|off][,allow-other=on|off|auto]\n"
> +"           [,io-uring=on|off]"
>  "                         export the specified block node over FUSE\n"
>  "\n"
>  #endif /* CONFIG_FUSE */
> diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
> index d2433d1d99..68d3fe8e01 100644
> --- a/util/fdmon-io_uring.c
> +++ b/util/fdmon-io_uring.c
> @@ -452,10 +452,13 @@ static const FDMonOps fdmon_io_uring_ops = {
>  void fdmon_io_uring_setup(AioContext *ctx, Error **errp)
>  {
>      int ret;
> +    int flags;
>  
>      ctx->io_uring_fd_tag = NULL;
> +    flags = IORING_SETUP_SQE128;

Please add /* needed by FUSE-over-io_uring */ so it's clear who the user
is.

>  
> -    ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
> +    ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES,
> +                            &ctx->fdmon_io_uring, flags);
>      if (ret != 0) {
>          error_setg_errno(errp, -ret, "Failed to initialize io_uring");
>          return;
> -- 
> 2.45.2
>

Re: [PATCH 1/4] export/fuse: add opt to enable FUSE-over-io_uring

Posted by Brian Song 3 weeks, 3 days ago


On 9/3/25 6:53 AM, Stefan Hajnoczi wrote:
> On Fri, Aug 29, 2025 at 10:50:22PM -0400, Brian Song wrote:
>> This patch adds a new export option for storage-export-daemon to enable
>> FUSE-over-io_uring via the switch io-uring=on|off (disabled by default).
>> It also implements the protocol handshake with the Linux kernel
>> during the FUSE-over-io_uring initialization phase.
>>
>> See: https://docs.kernel.org/filesystems/fuse-io-uring.html
>>
>> The kernel documentation describes in detail how FUSE-over-io_uring
>> works. This patch implements the Initial SQE stage shown in the diagram:
>> it initializes one queue per IOThread, each currently supporting a
>> single submission queue entry (SQE). When the FUSE driver sends the
>> first FUSE request (FUSE_INIT), storage-export-daemon calls
>> fuse_uring_start() to complete initialization, ultimately submitting
>> the SQE with the FUSE_IO_URING_CMD_REGISTER command to confirm
>> successful initialization with the kernel.
>>
>> We also added support for multiple IOThreads. The current Linux kernel
>> requires registering $(nproc) queues when setting up FUSE-over-io_uring.
>> To let users customize the number of FUSE Queues (i.e., IOThreads),
>> we first create nproc Ring Queues as required by the kernel, then
>> distribute them in a round-robin manner to the FUSE Queues for
>> registration. In addition, to support multiple in-flight requests,
>> we configure each Ring Queue with FUSE_DEFAULT_RING_QUEUE_DEPTH
>> entries/requests.
> 
> The previous paragraph says "each currently supporting a single
> submission queue entry (SQE)" whereas this paragraph says "we configure
> each Ring Queue with FUSE_DEFAULT_RING_QUEUE_DEPTH entries/requests".
> Maybe this paragraph was squashed into the commit description in a later
> step and the previous paragraph can be updated to reflect that multiple
> SQEs are submitted?
> 
>>
>> Suggested-by: Kevin Wolf <kwolf@redhat.com>
>> Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
>> Signed-off-by: Brian Song <hibriansong@gmail.com>
>> ---
>>   block/export/fuse.c                  | 310 +++++++++++++++++++++++++--
>>   docs/tools/qemu-storage-daemon.rst   |  11 +-
>>   qapi/block-export.json               |   5 +-
>>   storage-daemon/qemu-storage-daemon.c |   1 +
>>   util/fdmon-io_uring.c                |   5 +-
>>   5 files changed, 309 insertions(+), 23 deletions(-)
>>
>> diff --git a/block/export/fuse.c b/block/export/fuse.c
>> index c0ad4696ce..19bf9e5f74 100644
>> --- a/block/export/fuse.c
>> +++ b/block/export/fuse.c
>> @@ -48,6 +48,9 @@
>>   #include <linux/fs.h>
>>   #endif
>>   
>> +/* room needed in buffer to accommodate header */
>> +#define FUSE_BUFFER_HEADER_SIZE 0x1000
> 
> Is it possible to write this in a way that shows how the constant is
> calculated? That way the constant would automatically adjust on systems
> where the underlying assumptions have changed (e.g. page size, header
> struct size). This approach is also self-documenting so it's possible to
> understand where the magic number comes from.
> 
> For example:
> 
>    #define FUSE_BUFFER_HEADER_SIZE DIV_ROUND_UP(sizeof(struct fuse_uring_req_header), qemu_real_host_page_size())
> 
> (I'm guessing what the formula you used is, so this example may be
> incorrect...)
> 

In libfuse, the way to calculate the bufsize (for req_payload) is the 
same as in this patch. For different requests, the request header sizes 
are not the same, but they should never exceed a certain value. So is 
that why libfuse has this kind of magic number?

>> +
>>   /* Prevent overly long bounce buffer allocations */
>>   #define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 1 * 1024 * 1024))
>>   /*
>> @@ -63,12 +66,59 @@
>>       (FUSE_MAX_WRITE_BYTES - FUSE_IN_PLACE_WRITE_BYTES)
>>   
>>   typedef struct FuseExport FuseExport;
>> +typedef struct FuseQueue FuseQueue;
>> +
>> +#ifdef CONFIG_LINUX_IO_URING
>> +#define FUSE_DEFAULT_RING_QUEUE_DEPTH 64
>> +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
>> +
>> +typedef struct FuseRingQueue FuseRingQueue;
>> +typedef struct FuseRingEnt {
>> +    /* back pointer */
>> +    FuseRingQueue *rq;
>> +
>> +    /* commit id of a fuse request */
>> +    uint64_t req_commit_id;
> 
> This field is not used in this commit. Please introduce it in the commit
> that uses it so it's easier to review and understand the purpose of this
> field.
> 
>> +
>> +    /* fuse request header and payload */
>> +    struct fuse_uring_req_header req_header;
>> +    void *op_payload;
>> +    size_t req_payload_sz;
> 
> op_payload and req_payload_sz refer to the same buffer, and they are
> submitted alongside req_header. It would be nice to name the fields
> consistently:
> 
>    struct fuse_uring_req_header req_header;
>    void *req_payload;
>    size_t req_payload_sz;
> 
> req_payload and req_payload_sz could be eliminated since they are also
> stored in iov[1].iov_base and .iov_len. If you feel that would be harder
> to understand, then it's okay to keep the duplicate fields.
> 

Makes sense. I followed the design in libfuse. Probably best to just 
leave them in the struct for readability

>> +
>> +    /* The vector passed to the kernel */
>> +    struct iovec iov[2];
>> +
>> +    CqeHandler fuse_cqe_handler;
>> +} FuseRingEnt;
>> +
>> +struct FuseRingQueue {
> 
> A comment would be nice here to explain that the kernel requires one
> FuseRingQueue per host CPU and this concept is independent of /dev/fuse
> (FuseQueue).
> 
>> +    int rqid;
>> +
>> +    /* back pointer */
>> +    FuseQueue *q;
>> +    FuseRingEnt *ent;
>> +
>> +    /* List entry for ring_queues */
>> +    QLIST_ENTRY(FuseRingQueue) next;
>> +};
>> +
>> +/*
>> + * Round-robin distribution of ring queues across FUSE queues.
>> + * This structure manages the mapping between kernel ring queues and user
>> + * FUSE queues.
>> + */
>> +typedef struct FuseRingQueueManager {
>> +    FuseRingQueue *ring_queues;
>> +    int num_ring_queues;
>> +    int num_fuse_queues;
>> +} FuseRingQueueManager;
>> +#endif
> 
> It's easy to forget which #ifdef we're inside after a few lines, so it
> helps to indicate that in a comment:
> 
> #endif /* CONFIG_LINUX_IO_URING */
> 
>>   
>>   /*
>>    * One FUSE "queue", representing one FUSE FD from which requests are fetched
>>    * and processed.  Each queue is tied to an AioContext.
>>    */
>> -typedef struct FuseQueue {
>> +struct FuseQueue {
>>       FuseExport *exp;
>>   
>>       AioContext *ctx;
>> @@ -109,15 +159,11 @@ typedef struct FuseQueue {
>>        * Free this buffer with qemu_vfree().
>>        */
>>       void *spillover_buf;
>> -} FuseQueue;
>>   
>> -/*
>> - * Verify that FuseQueue.request_buf plus the spill-over buffer together
>> - * are big enough to be accepted by the FUSE kernel driver.
>> - */
>> -QEMU_BUILD_BUG_ON(sizeof(((FuseQueue *)0)->request_buf) +
>> -                  FUSE_SPILLOVER_BUF_SIZE <
>> -                  FUSE_MIN_READ_BUFFER);
> 
> Why was this removed, it's probably still necessary in the non-io_uring
> case (which is compiled in even when CONFIG_LINUX_IO_URING is defined)?
> 

You can check Hanna’s patch. In fuse_co_process_request, Hanna 
introduced this check when using FUSE_OUT_OP_STRUCT to cast void *buf 
into the corresponding in/out header for the given operation.

But in the v2 patch, we merged the legacy process_request and the uring
version into one. As a result, the legacy path now passes the array into
the common function as a pointer, so when the macro does the buffer size
check, sizeof(out_buf) is just the size of a pointer, not of the buffer.

#define FUSE_OUT_OP_STRUCT(op_name, out_buf) \
     ({ \
         struct fuse_out_header *__out_hdr = \
             (struct fuse_out_header *)(out_buf); \
         struct fuse_##op_name##_out *__out = \
             (struct fuse_##op_name##_out *)(__out_hdr + 1); \
         \
         QEMU_BUILD_BUG_ON(sizeof(*__out_hdr) + sizeof(*__out) > \
                           sizeof(out_buf)); \
         \
         __out; \
     })


>> +#ifdef CONFIG_LINUX_IO_URING
>> +    QLIST_HEAD(, FuseRingQueue) ring_queue_list;
>> +#endif
>> +};
>>   
>>   struct FuseExport {
>>       BlockExport common;
>> @@ -133,7 +179,7 @@ struct FuseExport {
>>        */
>>       bool halted;
>>   
>> -    int num_queues;
>> +    size_t num_queues;
>>       FuseQueue *queues;
>>       /*
>>        * True if this export should follow the generic export's AioContext.
>> @@ -149,6 +195,12 @@ struct FuseExport {
>>       /* Whether allow_other was used as a mount option or not */
>>       bool allow_other;
>>   
>> +#ifdef CONFIG_LINUX_IO_URING
>> +    bool is_uring;
>> +    size_t ring_queue_depth;
>> +    FuseRingQueueManager *ring_queue_manager;
>> +#endif
>> +
>>       mode_t st_mode;
>>       uid_t st_uid;
>>       gid_t st_gid;
>> @@ -205,7 +257,7 @@ static void fuse_attach_handlers(FuseExport *exp)
>>           return;
>>       }
>>   
>> -    for (int i = 0; i < exp->num_queues; i++) {
>> +    for (size_t i = 0; i < exp->num_queues; i++) {
>>           aio_set_fd_handler(exp->queues[i].ctx, exp->queues[i].fuse_fd,
>>                              read_from_fuse_fd, NULL, NULL, NULL,
>>                              &exp->queues[i]);
>> @@ -257,6 +309,189 @@ static const BlockDevOps fuse_export_blk_dev_ops = {
>>       .drained_poll  = fuse_export_drained_poll,
>>   };
>>   
>> +#ifdef CONFIG_LINUX_IO_URING
>> +static void fuse_uring_sqe_set_req_data(struct fuse_uring_cmd_req *req,
>> +                    const unsigned int rqid,
>> +                    const unsigned int commit_id)
>> +{
>> +    req->qid = rqid;
>> +    req->commit_id = commit_id;
>> +    req->flags = 0;
>> +}
>> +
>> +static void fuse_uring_sqe_prepare(struct io_uring_sqe *sqe, FuseQueue *q,
>> +               __u32 cmd_op)
>> +{
>> +    sqe->opcode = IORING_OP_URING_CMD;
>> +
>> +    sqe->fd = q->fuse_fd;
>> +    sqe->rw_flags = 0;
>> +    sqe->ioprio = 0;
>> +    sqe->off = 0;
>> +
>> +    sqe->cmd_op = cmd_op;
>> +    sqe->__pad1 = 0;
>> +}
>> +
>> +static void fuse_uring_prep_sqe_register(struct io_uring_sqe *sqe, void *opaque)
>> +{
>> +    FuseRingEnt *ent = opaque;
>> +    struct fuse_uring_cmd_req *req = (void *)&sqe->cmd[0];
>> +
>> +    fuse_uring_sqe_prepare(sqe, ent->rq->q, FUSE_IO_URING_CMD_REGISTER);
>> +
>> +    sqe->addr = (uint64_t)(ent->iov);
>> +    sqe->len = 2;
>> +
>> +    fuse_uring_sqe_set_req_data(req, ent->rq->rqid, 0);
>> +}
>> +
>> +static void fuse_uring_submit_register(void *opaque)
>> +{
>> +    FuseRingEnt *ent = opaque;
>> +    FuseExport *exp = ent->rq->q->exp;
> 
> This variable is unused in this commit? Does this commit compile for
> you? Usually the compiler warns about unused variables.
> 

The first version was a large single patch. I split it with git, and
this variable is now used in a different patch.

>> +
>> +
>> +    aio_add_sqe(fuse_uring_prep_sqe_register, ent, &(ent->fuse_cqe_handler));
>> +}
>> +
>> +/**
>> + * Distribute ring queues across FUSE queues using round-robin algorithm.
>> + * This ensures even distribution of kernel ring queues across user-specified
>> + * FUSE queues.
>> + */
>> +static
>> +FuseRingQueueManager *fuse_ring_queue_manager_create(int num_fuse_queues,
>> +                                                    size_t ring_queue_depth,
>> +                                                    size_t bufsize)
>> +{
>> +    int num_ring_queues = get_nprocs();
> 
> The kernel code uses num_possible_cpus() in
> fs/fuse/dev_uring.c:fuse_uring_create() so I think this should be
> get_nprocs_conf() instead of get_nprocs().
> 
>> +    FuseRingQueueManager *manager = g_new(FuseRingQueueManager, 1);
>> +
>> +    if (!manager) {
> 
> g_new() never returns NULL, so you can remove this if statement. If
> memory cannot be allocated then the process will abort.
> 
>> +        return NULL;
>> +    }
>> +
>> +    manager->ring_queues = g_new(FuseRingQueue, num_ring_queues);
>> +    manager->num_ring_queues = num_ring_queues;
>> +    manager->num_fuse_queues = num_fuse_queues;
>> +
>> +    if (!manager->ring_queues) {
> 
> Same here.
> 
>> +        g_free(manager);
>> +        return NULL;
>> +    }
>> +
>> +    for (int i = 0; i < num_ring_queues; i++) {
>> +        FuseRingQueue *rq = &manager->ring_queues[i];
>> +        rq->rqid = i;
>> +        rq->ent = g_new(FuseRingEnt, ring_queue_depth);
>> +
>> +        if (!rq->ent) {
> 
> Same here.
> 
>> +            for (int j = 0; j < i; j++) {
>> +                g_free(manager->ring_queues[j].ent);
>> +            }
>> +            g_free(manager->ring_queues);
>> +            g_free(manager);
>> +            return NULL;
>> +        }
>> +
>> +        for (size_t j = 0; j < ring_queue_depth; j++) {
>> +            FuseRingEnt *ent = &rq->ent[j];
>> +            ent->rq = rq;
>> +            ent->req_payload_sz = bufsize - FUSE_BUFFER_HEADER_SIZE;
>> +            ent->op_payload = g_malloc0(ent->req_payload_sz);
>> +
>> +            if (!ent->op_payload) {
> 
> Same here.
> 
>> +                for (size_t k = 0; k < j; k++) {
>> +                    g_free(rq->ent[k].op_payload);
>> +                }
>> +                g_free(rq->ent);
>> +                for (int k = 0; k < i; k++) {
>> +                    g_free(manager->ring_queues[k].ent);
>> +                }
>> +                g_free(manager->ring_queues);
>> +                g_free(manager);
> 
> Where are these structures freed in the normal lifecycle of a FUSE
> export? I only see this error handling code, but nothing is freed when
> the export is shut down.


Same here. The first version was a large single patch. I split it with
git, and we do the cleanup in a different patch.

> 
>> +                return NULL;
>> +            }
>> +
>> +            ent->iov[0] = (struct iovec) {
>> +                &(ent->req_header),
>> +                sizeof(struct fuse_uring_req_header)
>> +            };
>> +            ent->iov[1] = (struct iovec) {
>> +                ent->op_payload,
>> +                ent->req_payload_sz
>> +            };
>> +
>> +            ent->fuse_cqe_handler.cb = fuse_uring_cqe_handler;
>> +        }
>> +    }
>> +
>> +    return manager;
>> +}
>> +
>> +static
>> +void fuse_distribute_ring_queues(FuseExport *exp, FuseRingQueueManager *manager)
>> +{
>> +    int queue_index = 0;
>> +
>> +    for (int i = 0; i < manager->num_ring_queues; i++) {
>> +        FuseRingQueue *rq = &manager->ring_queues[i];
>> +
>> +        rq->q = &exp->queues[queue_index];
>> +        QLIST_INSERT_HEAD(&(rq->q->ring_queue_list), rq, next);
>> +
>> +        queue_index = (queue_index + 1) % manager->num_fuse_queues;
>> +    }
>> +}
>> +
>> +static
>> +void fuse_schedule_ring_queue_registrations(FuseExport *exp,
>> +                                            FuseRingQueueManager *manager)
>> +{
>> +    for (int i = 0; i < manager->num_fuse_queues; i++) {
>> +        FuseQueue *q = &exp->queues[i];
>> +        FuseRingQueue *rq;
>> +
>> +        QLIST_FOREACH(rq, &q->ring_queue_list, next) {
>> +            for (int j = 0; j < exp->ring_queue_depth; j++) {
>> +                aio_bh_schedule_oneshot(q->ctx, fuse_uring_submit_register,
>> +                                        &(rq->ent[j]));
>> +            }
>> +        }
>> +    }
>> +}
>> +
>> +static void fuse_uring_start(FuseExport *exp, struct fuse_init_out *out)
>> +{
>> +    /*
>> +     * Since we didn't enable the FUSE_MAX_PAGES feature, the value of
>> +     * fc->max_pages should be FUSE_DEFAULT_MAX_PAGES_PER_REQ, which is set by
>> +     * the kernel by default. Also, max_write should not exceed
>> +     * FUSE_DEFAULT_MAX_PAGES_PER_REQ * PAGE_SIZE.
>> +     */
>> +    size_t bufsize = out->max_write + FUSE_BUFFER_HEADER_SIZE;
>> +
>> +    if (!(out->flags & FUSE_MAX_PAGES)) {
>> +        bufsize = FUSE_DEFAULT_MAX_PAGES_PER_REQ * qemu_real_host_page_size()
>> +                         + FUSE_BUFFER_HEADER_SIZE;
>> +    }
>> +
>> +    exp->ring_queue_manager = fuse_ring_queue_manager_create(
>> +        exp->num_queues, exp->ring_queue_depth, bufsize);
>> +
>> +    if (!exp->ring_queue_manager) {
>> +        error_report("Failed to create ring queue manager");
>> +        return;
>> +    }
>> +
>> +    /* Distribute ring queues across FUSE queues using round-robin */
>> +    fuse_distribute_ring_queues(exp, exp->ring_queue_manager);
>> +
>> +    fuse_schedule_ring_queue_registrations(exp, exp->ring_queue_manager);
>> +}
>> +#endif
>> +
>>   static int fuse_export_create(BlockExport *blk_exp,
>>                                 BlockExportOptions *blk_exp_args,
>>                                 AioContext *const *multithread,
>> @@ -270,6 +505,11 @@ static int fuse_export_create(BlockExport *blk_exp,
>>   
>>       assert(blk_exp_args->type == BLOCK_EXPORT_TYPE_FUSE);
>>   
>> +#ifdef CONFIG_LINUX_IO_URING
>> +    exp->is_uring = args->io_uring;
>> +    exp->ring_queue_depth = FUSE_DEFAULT_RING_QUEUE_DEPTH;
>> +#endif
>> +
>>       if (multithread) {
>>           /* Guaranteed by common export code */
>>           assert(mt_count >= 1);
>> @@ -283,6 +523,10 @@ static int fuse_export_create(BlockExport *blk_exp,
>>                   .exp = exp,
>>                   .ctx = multithread[i],
>>                   .fuse_fd = -1,
>> +#ifdef CONFIG_LINUX_IO_URING
>> +                .ring_queue_list =
>> +                    QLIST_HEAD_INITIALIZER(exp->queues[i].ring_queue_list),
>> +#endif
>>               };
>>           }
>>       } else {
>> @@ -296,6 +540,10 @@ static int fuse_export_create(BlockExport *blk_exp,
>>               .exp = exp,
>>               .ctx = exp->common.ctx,
>>               .fuse_fd = -1,
>> +#ifdef CONFIG_LINUX_IO_URING
>> +            .ring_queue_list =
>> +                QLIST_HEAD_INITIALIZER(exp->queues[0].ring_queue_list),
>> +#endif
>>           };
>>       }
>>   
>> @@ -685,17 +933,39 @@ static bool is_regular_file(const char *path, Error **errp)
>>    */
>>   static ssize_t coroutine_fn
>>   fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
>> -             uint32_t max_readahead, uint32_t flags)
>> +             uint32_t max_readahead, const struct fuse_init_in *in)
>>   {
>> -    const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO;
>> +    uint64_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO
>> +                                     | FUSE_INIT_EXT;
>> +    uint64_t outargflags = 0;
>> +    uint64_t inargflags = in->flags;
>> +
>> +    ssize_t ret = 0;
>> +
>> +    if (inargflags & FUSE_INIT_EXT) {
>> +        inargflags = inargflags | (uint64_t) in->flags2 << 32;
>> +    }
>> +
>> +#ifdef CONFIG_LINUX_IO_URING
>> +    if (exp->is_uring) {
>> +        if (inargflags & FUSE_OVER_IO_URING) {
>> +            supported_flags |= FUSE_OVER_IO_URING;
>> +        } else {
>> +            exp->is_uring = false;
>> +            ret = -ENODEV;
>> +        }
>> +    }
>> +#endif
>> +
>> +    outargflags = inargflags & supported_flags;
>>   
>>       *out = (struct fuse_init_out) {
>>           .major = FUSE_KERNEL_VERSION,
>>           .minor = FUSE_KERNEL_MINOR_VERSION,
>>           .max_readahead = max_readahead,
>>           .max_write = FUSE_MAX_WRITE_BYTES,
>> -        .flags = flags & supported_flags,
>> -        .flags2 = 0,
>> +        .flags = outargflags,
>> +        .flags2 = outargflags >> 32,
>>   
>>           /* libfuse maximum: 2^16 - 1 */
>>           .max_background = UINT16_MAX,
>> @@ -717,7 +987,7 @@ fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
>>           .map_alignment = 0,
>>       };
>>   
>> -    return sizeof(*out);
>> +    return ret < 0 ? ret : sizeof(*out);
>>   }
>>   
>>   /**
>> @@ -1506,6 +1776,14 @@ fuse_co_process_request(FuseQueue *q, void *spillover_buf)
>>           fuse_write_buf_response(q->fuse_fd, req_id, out_hdr,
>>                                   out_data_buffer, ret);
>>           qemu_vfree(out_data_buffer);
>> +#ifdef CONFIG_LINUX_IO_URING
>> +    /* Handle FUSE-over-io_uring initialization */
>> +    if (unlikely(opcode == FUSE_INIT && exp->is_uring)) {
>> +        struct fuse_init_out *out =
>> +            (struct fuse_init_out *)FUSE_OUT_OP_STRUCT(out_buf);
>> +        fuse_uring_start(exp, out);
> 
> Is there any scenario where FUSE_INIT can be received multiple times?
> Maybe if the FUSE file system is umounted and mounted again? I want to
> check that this doesn't leak previously allocated ring state.
> 

I don't think so. Even in a multi-threaded FUSE setup, the kernel only
sends a single FUSE_INIT to userspace. In the legacy mode, whichever
thread receives that request can handle it and initialize FUSE-over-io_uring.

Re: [PATCH 1/4] export/fuse: add opt to enable FUSE-over-io_uring

Posted by Stefan Hajnoczi 2 weeks, 5 days ago

On Wed, Sep 03, 2025 at 02:00:55PM -0400, Brian Song wrote:
> 
> 
> On 9/3/25 6:53 AM, Stefan Hajnoczi wrote:
> > On Fri, Aug 29, 2025 at 10:50:22PM -0400, Brian Song wrote:
> > > This patch adds a new export option for storage-export-daemon to enable
> > > FUSE-over-io_uring via the switch io-uring=on|off (disabled by default).
> > > It also implements the protocol handshake with the Linux kernel
> > > during the FUSE-over-io_uring initialization phase.
> > > 
> > > See: https://docs.kernel.org/filesystems/fuse-io-uring.html
> > > 
> > > The kernel documentation describes in detail how FUSE-over-io_uring
> > > works. This patch implements the Initial SQE stage shown in the diagram:
> > > it initializes one queue per IOThread, each currently supporting a
> > > single submission queue entry (SQE). When the FUSE driver sends the
> > > first FUSE request (FUSE_INIT), storage-export-daemon calls
> > > fuse_uring_start() to complete initialization, ultimately submitting
> > > the SQE with the FUSE_IO_URING_CMD_REGISTER command to confirm
> > > successful initialization with the kernel.
> > > 
> > > We also added support for multiple IOThreads. The current Linux kernel
> > > requires registering $(nproc) queues when setting up FUSE-over-io_uring.
> > > To let users customize the number of FUSE Queues (i.e., IOThreads),
> > > we first create nproc Ring Queues as required by the kernel, then
> > > distribute them in a round-robin manner to the FUSE Queues for
> > > registration. In addition, to support multiple in-flight requests,
> > > we configure each Ring Queue with FUSE_DEFAULT_RING_QUEUE_DEPTH
> > > entries/requests.
> > 
> > The previous paragraph says "each currently supporting a single
> > submission queue entry (SQE)" whereas this paragraph says "we configure
> > each Ring Queue with FUSE_DEFAULT_RING_QUEUE_DEPTH entries/requests".
> > Maybe this paragraph was squashed into the commit description in a later
> > step and the previous paragraph can be updated to reflect that multiple
> > SQEs are submitted?
> > 
> > > 
> > > Suggested-by: Kevin Wolf <kwolf@redhat.com>
> > > Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > Signed-off-by: Brian Song <hibriansong@gmail.com>
> > > ---
> > >   block/export/fuse.c                  | 310 +++++++++++++++++++++++++--
> > >   docs/tools/qemu-storage-daemon.rst   |  11 +-
> > >   qapi/block-export.json               |   5 +-
> > >   storage-daemon/qemu-storage-daemon.c |   1 +
> > >   util/fdmon-io_uring.c                |   5 +-
> > >   5 files changed, 309 insertions(+), 23 deletions(-)
> > > 
> > > diff --git a/block/export/fuse.c b/block/export/fuse.c
> > > index c0ad4696ce..19bf9e5f74 100644
> > > --- a/block/export/fuse.c
> > > +++ b/block/export/fuse.c
> > > @@ -48,6 +48,9 @@
> > >   #include <linux/fs.h>
> > >   #endif
> > > +/* room needed in buffer to accommodate header */
> > > +#define FUSE_BUFFER_HEADER_SIZE 0x1000
> > 
> > Is it possible to write this in a way that shows how the constant is
> > calculated? That way the constant would automatically adjust on systems
> > where the underlying assumptions have changed (e.g. page size, header
> > struct size). This approach is also self-documenting so it's possible to
> > understand where the magic number comes from.
> > 
> > For example:
> > 
> >    #define FUSE_BUFFER_HEADER_SIZE DIV_ROUND_UP(sizeof(struct fuse_uring_req_header), qemu_real_host_page_size())
> > 
> > (I'm guessing what the formula you used is, so this example may be
> > incorrect...)
> > 
> 
> In libfuse, the way to calculate the bufsize (for req_payload) is the same
> as in this patch. For different requests, the request header sizes are not
> the same, but they should never exceed a certain value. So is that why
> libfuse has this kind of magic number?

From <linux/fuse.h>:

  #define FUSE_URING_IN_OUT_HEADER_SZ 128
  #define FUSE_URING_OP_IN_OUT_SZ 128
  ...
  struct fuse_uring_req_header {
          /* struct fuse_in_header / struct fuse_out_header */
          char in_out[FUSE_URING_IN_OUT_HEADER_SZ];

          /* per op code header */
          char op_in[FUSE_URING_OP_IN_OUT_SZ];

          struct fuse_uring_ent_in_out ring_ent_in_out;
  };

The size of struct fuse_uring_req_header is 128 + 128 + (4 * 8) = 288
bytes. A single 4 KB page easily fits this. I guess that's why 0x1000
was chosen in libfuse.

> 
> > > +
> > >   /* Prevent overly long bounce buffer allocations */
> > >   #define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 1 * 1024 * 1024))
> > >   /*
> > > @@ -63,12 +66,59 @@
> > >       (FUSE_MAX_WRITE_BYTES - FUSE_IN_PLACE_WRITE_BYTES)
> > >   typedef struct FuseExport FuseExport;
> > > +typedef struct FuseQueue FuseQueue;
> > > +
> > > +#ifdef CONFIG_LINUX_IO_URING
> > > +#define FUSE_DEFAULT_RING_QUEUE_DEPTH 64
> > > +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
> > > +
> > > +typedef struct FuseRingQueue FuseRingQueue;
> > > +typedef struct FuseRingEnt {
> > > +    /* back pointer */
> > > +    FuseRingQueue *rq;
> > > +
> > > +    /* commit id of a fuse request */
> > > +    uint64_t req_commit_id;
> > 
> > This field is not used in this commit. Please introduce it in the commit
> > that uses it so it's easier to review and understand the purpose of this
> > field.
> > 
> > > +
> > > +    /* fuse request header and payload */
> > > +    struct fuse_uring_req_header req_header;
> > > +    void *op_payload;
> > > +    size_t req_payload_sz;
> > 
> > op_payload and req_payload_sz refer to the same buffer, and they are
> > submitted alongside req_header. It would be nice to name the fields
> > consistently:
> > 
> >    struct fuse_uring_req_header req_header;
> >    void *req_payload;
> >    size_t req_payload_sz;
> > 
> > req_payload and req_payload_sz could be eliminated since they are also
> > stored in iov[1].iov_base and .iov_len. If you feel that would be harder
> > to understand, then it's okay to keep the duplicate fields.
> > 
> 
> Makes sense. I followed the design in libfuse. Probably best to just leave
> them in the struct for readability
> 
> > > +
> > > +    /* The vector passed to the kernel */
> > > +    struct iovec iov[2];
> > > +
> > > +    CqeHandler fuse_cqe_handler;
> > > +} FuseRingEnt;
> > > +
> > > +struct FuseRingQueue {
> > 
> > A comment would be nice here to explain that the kernel requires one
> > FuseRingQueue per host CPU and this concept is independent of /dev/fuse
> > (FuseQueue).
> > 
> > > +    int rqid;
> > > +
> > > +    /* back pointer */
> > > +    FuseQueue *q;
> > > +    FuseRingEnt *ent;
> > > +
> > > +    /* List entry for ring_queues */
> > > +    QLIST_ENTRY(FuseRingQueue) next;
> > > +};
> > > +
> > > +/*
> > > + * Round-robin distribution of ring queues across FUSE queues.
> > > + * This structure manages the mapping between kernel ring queues and user
> > > + * FUSE queues.
> > > + */
> > > +typedef struct FuseRingQueueManager {
> > > +    FuseRingQueue *ring_queues;
> > > +    int num_ring_queues;
> > > +    int num_fuse_queues;
> > > +} FuseRingQueueManager;
> > > +#endif
> > 
> > It's easy to forget which #ifdef we're inside after a few lines, so it
> > helps to indicate that in a comment:
> > 
> > #endif /* CONFIG_LINUX_IO_URING */
> > 
> > >   /*
> > >    * One FUSE "queue", representing one FUSE FD from which requests are fetched
> > >    * and processed.  Each queue is tied to an AioContext.
> > >    */
> > > -typedef struct FuseQueue {
> > > +struct FuseQueue {
> > >       FuseExport *exp;
> > >       AioContext *ctx;
> > > @@ -109,15 +159,11 @@ typedef struct FuseQueue {
> > >        * Free this buffer with qemu_vfree().
> > >        */
> > >       void *spillover_buf;
> > > -} FuseQueue;
> > > -/*
> > > - * Verify that FuseQueue.request_buf plus the spill-over buffer together
> > > - * are big enough to be accepted by the FUSE kernel driver.
> > > - */
> > > -QEMU_BUILD_BUG_ON(sizeof(((FuseQueue *)0)->request_buf) +
> > > -                  FUSE_SPILLOVER_BUF_SIZE <
> > > -                  FUSE_MIN_READ_BUFFER);
> > 
> > Why was this removed, it's probably still necessary in the non-io_uring
> > case (which is compiled in even when CONFIG_LINUX_IO_URING is defined)?
> > 
> 
> You can check Hanna’s patch. In fuse_co_process_request, Hanna introduced
> this check when using FUSE_OUT_OP_STRUCT to cast void *buf into the
> corresponding in/out header for the given operation.
> 
> But in the v2 patch, we merged the legacy process_request and the uring
> version into one. This caused the legacy path to pass the array into the
> common function as a pointer. Now, when we do the buf header size check,
> what gets checked is just the pointer size.
> 
> #define FUSE_OUT_OP_STRUCT(op_name, out_buf) \
>     ({ \
>         struct fuse_out_header *__out_hdr = \
>             (struct fuse_out_header *)(out_buf); \
>         struct fuse_##op_name##_out *__out = \
>             (struct fuse_##op_name##_out *)(__out_hdr + 1); \
>         \
>         QEMU_BUILD_BUG_ON(sizeof(*__out_hdr) + sizeof(*__out) > \
>                           sizeof(out_buf)); \
>         \
>         __out; \
>     })

Your patch does not change how ->request_buf is used by the non-io_uring
code path. ->request_buf needs to fit at least FUSE_MIN_READ_BUFFER
bytes so I think this QEMU_BUILD_BUG_ON() should not be deleted.

> 
> 
> > > +#ifdef CONFIG_LINUX_IO_URING
> > > +    QLIST_HEAD(, FuseRingQueue) ring_queue_list;
> > > +#endif
> > > +};
> > >   struct FuseExport {
> > >       BlockExport common;
> > > @@ -133,7 +179,7 @@ struct FuseExport {
> > >        */
> > >       bool halted;
> > > -    int num_queues;
> > > +    size_t num_queues;
> > >       FuseQueue *queues;
> > >       /*
> > >        * True if this export should follow the generic export's AioContext.
> > > @@ -149,6 +195,12 @@ struct FuseExport {
> > >       /* Whether allow_other was used as a mount option or not */
> > >       bool allow_other;
> > > +#ifdef CONFIG_LINUX_IO_URING
> > > +    bool is_uring;
> > > +    size_t ring_queue_depth;
> > > +    FuseRingQueueManager *ring_queue_manager;
> > > +#endif
> > > +
> > >       mode_t st_mode;
> > >       uid_t st_uid;
> > >       gid_t st_gid;
> > > @@ -205,7 +257,7 @@ static void fuse_attach_handlers(FuseExport *exp)
> > >           return;
> > >       }
> > > -    for (int i = 0; i < exp->num_queues; i++) {
> > > +    for (size_t i = 0; i < exp->num_queues; i++) {
> > >           aio_set_fd_handler(exp->queues[i].ctx, exp->queues[i].fuse_fd,
> > >                              read_from_fuse_fd, NULL, NULL, NULL,
> > >                              &exp->queues[i]);
> > > @@ -257,6 +309,189 @@ static const BlockDevOps fuse_export_blk_dev_ops = {
> > >       .drained_poll  = fuse_export_drained_poll,
> > >   };
> > > +#ifdef CONFIG_LINUX_IO_URING
> > > +static void fuse_uring_sqe_set_req_data(struct fuse_uring_cmd_req *req,
> > > +                    const unsigned int rqid,
> > > +                    const unsigned int commit_id)
> > > +{
> > > +    req->qid = rqid;
> > > +    req->commit_id = commit_id;
> > > +    req->flags = 0;
> > > +}
> > > +
> > > +static void fuse_uring_sqe_prepare(struct io_uring_sqe *sqe, FuseQueue *q,
> > > +               __u32 cmd_op)
> > > +{
> > > +    sqe->opcode = IORING_OP_URING_CMD;
> > > +
> > > +    sqe->fd = q->fuse_fd;
> > > +    sqe->rw_flags = 0;
> > > +    sqe->ioprio = 0;
> > > +    sqe->off = 0;
> > > +
> > > +    sqe->cmd_op = cmd_op;
> > > +    sqe->__pad1 = 0;
> > > +}
> > > +
> > > +static void fuse_uring_prep_sqe_register(struct io_uring_sqe *sqe, void *opaque)
> > > +{
> > > +    FuseRingEnt *ent = opaque;
> > > +    struct fuse_uring_cmd_req *req = (void *)&sqe->cmd[0];
> > > +
> > > +    fuse_uring_sqe_prepare(sqe, ent->rq->q, FUSE_IO_URING_CMD_REGISTER);
> > > +
> > > +    sqe->addr = (uint64_t)(ent->iov);
> > > +    sqe->len = 2;
> > > +
> > > +    fuse_uring_sqe_set_req_data(req, ent->rq->rqid, 0);
> > > +}
> > > +
> > > +static void fuse_uring_submit_register(void *opaque)
> > > +{
> > > +    FuseRingEnt *ent = opaque;
> > > +    FuseExport *exp = ent->rq->q->exp;
> > 
> > This variable is unused in this commit? Does this commit compile for
> > you? Usually the compiler warns about unused variables.
> > 
> 
> The first version was a large single patch. I split it with git, and this
> variable is now used in a different patch
> 
> > > +
> > > +
> > > +    aio_add_sqe(fuse_uring_prep_sqe_register, ent, &(ent->fuse_cqe_handler));
> > > +}
> > > +
> > > +/**
> > > + * Distribute ring queues across FUSE queues using round-robin algorithm.
> > > + * This ensures even distribution of kernel ring queues across user-specified
> > > + * FUSE queues.
> > > + */
> > > +static
> > > +FuseRingQueueManager *fuse_ring_queue_manager_create(int num_fuse_queues,
> > > +                                                    size_t ring_queue_depth,
> > > +                                                    size_t bufsize)
> > > +{
> > > +    int num_ring_queues = get_nprocs();
> > 
> > The kernel code uses num_possible_cpus() in
> > fs/fuse/dev_uring.c:fuse_uring_create() so I think this should be
> > get_nprocs_conf() instead of get_nprocs().
> > 
> > > +    FuseRingQueueManager *manager = g_new(FuseRingQueueManager, 1);
> > > +
> > > +    if (!manager) {
> > 
> > g_new() never returns NULL, so you can remove this if statement. If
> > memory cannot be allocated then the process will abort.
> > 
> > > +        return NULL;
> > > +    }
> > > +
> > > +    manager->ring_queues = g_new(FuseRingQueue, num_ring_queues);
> > > +    manager->num_ring_queues = num_ring_queues;
> > > +    manager->num_fuse_queues = num_fuse_queues;
> > > +
> > > +    if (!manager->ring_queues) {
> > 
> > Same here.
> > 
> > > +        g_free(manager);
> > > +        return NULL;
> > > +    }
> > > +
> > > +    for (int i = 0; i < num_ring_queues; i++) {
> > > +        FuseRingQueue *rq = &manager->ring_queues[i];
> > > +        rq->rqid = i;
> > > +        rq->ent = g_new(FuseRingEnt, ring_queue_depth);
> > > +
> > > +        if (!rq->ent) {
> > 
> > Same here.
> > 
> > > +            for (int j = 0; j < i; j++) {
> > > +                g_free(manager->ring_queues[j].ent);
> > > +            }
> > > +            g_free(manager->ring_queues);
> > > +            g_free(manager);
> > > +            return NULL;
> > > +        }
> > > +
> > > +        for (size_t j = 0; j < ring_queue_depth; j++) {
> > > +            FuseRingEnt *ent = &rq->ent[j];
> > > +            ent->rq = rq;
> > > +            ent->req_payload_sz = bufsize - FUSE_BUFFER_HEADER_SIZE;
> > > +            ent->op_payload = g_malloc0(ent->req_payload_sz);
> > > +
> > > +            if (!ent->op_payload) {
> > 
> > Same here.
> > 
> > > +                for (size_t k = 0; k < j; k++) {
> > > +                    g_free(rq->ent[k].op_payload);
> > > +                }
> > > +                g_free(rq->ent);
> > > +                for (int k = 0; k < i; k++) {
> > > +                    g_free(manager->ring_queues[k].ent);
> > > +                }
> > > +                g_free(manager->ring_queues);
> > > +                g_free(manager);
> > 
> > Where are these structures freed in the normal lifecycle of a FUSE
> > export? I only see this error handling code, but nothing is freed when
> > the export is shut down.
> 
> 
> Same here. The first version was a large single patch. I split it with git,
> and we do cleanup in a different patch

It's easier for reviewers and safer for backports if each patch is
self-contained with the cleanup code included in the same patch where
the resource is created. If you make changes to the patch organization
in the next revision then it would be nice to include the cleanup in
this patch.

> 
> > 
> > > +                return NULL;
> > > +            }
> > > +
> > > +            ent->iov[0] = (struct iovec) {
> > > +                &(ent->req_header),
> > > +                sizeof(struct fuse_uring_req_header)
> > > +            };
> > > +            ent->iov[1] = (struct iovec) {
> > > +                ent->op_payload,
> > > +                ent->req_payload_sz
> > > +            };
> > > +
> > > +            ent->fuse_cqe_handler.cb = fuse_uring_cqe_handler;
> > > +        }
> > > +    }
> > > +
> > > +    return manager;
> > > +}
> > > +
> > > +static
> > > +void fuse_distribute_ring_queues(FuseExport *exp, FuseRingQueueManager *manager)
> > > +{
> > > +    int queue_index = 0;
> > > +
> > > +    for (int i = 0; i < manager->num_ring_queues; i++) {
> > > +        FuseRingQueue *rq = &manager->ring_queues[i];
> > > +
> > > +        rq->q = &exp->queues[queue_index];
> > > +        QLIST_INSERT_HEAD(&(rq->q->ring_queue_list), rq, next);
> > > +
> > > +        queue_index = (queue_index + 1) % manager->num_fuse_queues;
> > > +    }
> > > +}
> > > +
> > > +static
> > > +void fuse_schedule_ring_queue_registrations(FuseExport *exp,
> > > +                                            FuseRingQueueManager *manager)
> > > +{
> > > +    for (int i = 0; i < manager->num_fuse_queues; i++) {
> > > +        FuseQueue *q = &exp->queues[i];
> > > +        FuseRingQueue *rq;
> > > +
> > > +        QLIST_FOREACH(rq, &q->ring_queue_list, next) {
> > > +            for (int j = 0; j < exp->ring_queue_depth; j++) {
> > > +                aio_bh_schedule_oneshot(q->ctx, fuse_uring_submit_register,
> > > +                                        &(rq->ent[j]));
> > > +            }
> > > +        }
> > > +    }
> > > +}
> > > +
> > > +static void fuse_uring_start(FuseExport *exp, struct fuse_init_out *out)
> > > +{
> > > +    /*
> > > +     * Since we didn't enable the FUSE_MAX_PAGES feature, the value of
> > > +     * fc->max_pages should be FUSE_DEFAULT_MAX_PAGES_PER_REQ, which is set by
> > > +     * the kernel by default. Also, max_write should not exceed
> > > +     * FUSE_DEFAULT_MAX_PAGES_PER_REQ * PAGE_SIZE.
> > > +     */
> > > +    size_t bufsize = out->max_write + FUSE_BUFFER_HEADER_SIZE;
> > > +
> > > +    if (!(out->flags & FUSE_MAX_PAGES)) {
> > > +        bufsize = FUSE_DEFAULT_MAX_PAGES_PER_REQ * qemu_real_host_page_size()
> > > +                         + FUSE_BUFFER_HEADER_SIZE;
> > > +    }
> > > +
> > > +    exp->ring_queue_manager = fuse_ring_queue_manager_create(
> > > +        exp->num_queues, exp->ring_queue_depth, bufsize);
> > > +
> > > +    if (!exp->ring_queue_manager) {
> > > +        error_report("Failed to create ring queue manager");
> > > +        return;
> > > +    }
> > > +
> > > +    /* Distribute ring queues across FUSE queues using round-robin */
> > > +    fuse_distribute_ring_queues(exp, exp->ring_queue_manager);
> > > +
> > > +    fuse_schedule_ring_queue_registrations(exp, exp->ring_queue_manager);
> > > +}
> > > +#endif
> > > +
> > >   static int fuse_export_create(BlockExport *blk_exp,
> > >                                 BlockExportOptions *blk_exp_args,
> > >                                 AioContext *const *multithread,
> > > @@ -270,6 +505,11 @@ static int fuse_export_create(BlockExport *blk_exp,
> > >       assert(blk_exp_args->type == BLOCK_EXPORT_TYPE_FUSE);
> > > +#ifdef CONFIG_LINUX_IO_URING
> > > +    exp->is_uring = args->io_uring;
> > > +    exp->ring_queue_depth = FUSE_DEFAULT_RING_QUEUE_DEPTH;
> > > +#endif
> > > +
> > >       if (multithread) {
> > >           /* Guaranteed by common export code */
> > >           assert(mt_count >= 1);
> > > @@ -283,6 +523,10 @@ static int fuse_export_create(BlockExport *blk_exp,
> > >                   .exp = exp,
> > >                   .ctx = multithread[i],
> > >                   .fuse_fd = -1,
> > > +#ifdef CONFIG_LINUX_IO_URING
> > > +                .ring_queue_list =
> > > +                    QLIST_HEAD_INITIALIZER(exp->queues[i].ring_queue_list),
> > > +#endif
> > >               };
> > >           }
> > >       } else {
> > > @@ -296,6 +540,10 @@ static int fuse_export_create(BlockExport *blk_exp,
> > >               .exp = exp,
> > >               .ctx = exp->common.ctx,
> > >               .fuse_fd = -1,
> > > +#ifdef CONFIG_LINUX_IO_URING
> > > +            .ring_queue_list =
> > > +                QLIST_HEAD_INITIALIZER(exp->queues[0].ring_queue_list),
> > > +#endif
> > >           };
> > >       }
> > > @@ -685,17 +933,39 @@ static bool is_regular_file(const char *path, Error **errp)
> > >    */
> > >   static ssize_t coroutine_fn
> > >   fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
> > > -             uint32_t max_readahead, uint32_t flags)
> > > +             uint32_t max_readahead, const struct fuse_init_in *in)
> > >   {
> > > -    const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO;
> > > +    uint64_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO
> > > +                                     | FUSE_INIT_EXT;
> > > +    uint64_t outargflags = 0;
> > > +    uint64_t inargflags = in->flags;
> > > +
> > > +    ssize_t ret = 0;
> > > +
> > > +    if (inargflags & FUSE_INIT_EXT) {
> > > +        inargflags = inargflags | (uint64_t) in->flags2 << 32;
> > > +    }
> > > +
> > > +#ifdef CONFIG_LINUX_IO_URING
> > > +    if (exp->is_uring) {
> > > +        if (inargflags & FUSE_OVER_IO_URING) {
> > > +            supported_flags |= FUSE_OVER_IO_URING;
> > > +        } else {
> > > +            exp->is_uring = false;
> > > +            ret = -ENODEV;
> > > +        }
> > > +    }
> > > +#endif
> > > +
> > > +    outargflags = inargflags & supported_flags;
> > >       *out = (struct fuse_init_out) {
> > >           .major = FUSE_KERNEL_VERSION,
> > >           .minor = FUSE_KERNEL_MINOR_VERSION,
> > >           .max_readahead = max_readahead,
> > >           .max_write = FUSE_MAX_WRITE_BYTES,
> > > -        .flags = flags & supported_flags,
> > > -        .flags2 = 0,
> > > +        .flags = outargflags,
> > > +        .flags2 = outargflags >> 32,
> > >           /* libfuse maximum: 2^16 - 1 */
> > >           .max_background = UINT16_MAX,
> > > @@ -717,7 +987,7 @@ fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
> > >           .map_alignment = 0,
> > >       };
> > > -    return sizeof(*out);
> > > +    return ret < 0 ? ret : sizeof(*out);
> > >   }
> > >   /**
> > > @@ -1506,6 +1776,14 @@ fuse_co_process_request(FuseQueue *q, void *spillover_buf)
> > >           fuse_write_buf_response(q->fuse_fd, req_id, out_hdr,
> > >                                   out_data_buffer, ret);
> > >           qemu_vfree(out_data_buffer);
> > > +#ifdef CONFIG_LINUX_IO_URING
> > > +    /* Handle FUSE-over-io_uring initialization */
> > > +    if (unlikely(opcode == FUSE_INIT && exp->is_uring)) {
> > > +        struct fuse_init_out *out =
> > > +            (struct fuse_init_out *)FUSE_OUT_OP_STRUCT(out_buf);
> > > +        fuse_uring_start(exp, out);
> > 
> > Is there any scenario where FUSE_INIT can be received multiple times?
> > Maybe if the FUSE file system is umounted and mounted again? I want to
> > check that this doesn't leak previously allocated ring state.
> > 
> 
> I don't think so, even in a multi-threaded FUSE setup, the kernel only sends
> a single FUSE_INIT to userspace. In the legacy mode, whichever thread
> receives that request can handle it and initialize FUSE-over-io_uring

Okay. Please add an assertion to fuse_uring_start() to catch the case
where it is called twice.

Thanks,
Stefan

Re: [PATCH 1/4] export/fuse: add opt to enable FUSE-over-io_uring

Posted by Brian Song 2 weeks, 4 days ago


On 9/9/25 10:48 AM, Stefan Hajnoczi wrote:
> On Wed, Sep 03, 2025 at 02:00:55PM -0400, Brian Song wrote:
>>
>>
>> On 9/3/25 6:53 AM, Stefan Hajnoczi wrote:
>>> On Fri, Aug 29, 2025 at 10:50:22PM -0400, Brian Song wrote:
>>>> This patch adds a new export option for storage-export-daemon to enable
>>>> FUSE-over-io_uring via the switch io-uring=on|off (disableby default).
>>>> It also implements the protocol handshake with the Linux kernel
>>>> during the FUSE-over-io_uring initialization phase.
>>>>
>>>> See: https://docs.kernel.org/filesystems/fuse-io-uring.html
>>>>
>>>> The kernel documentation describes in detail how FUSE-over-io_uring
>>>> works. This patch implements the Initial SQE stage shown in thediagram:
>>>> it initializes one queue per IOThread, each currently supporting a
>>>> single submission queue entry (SQE). When the FUSE driver sends the
>>>> first FUSE request (FUSE_INIT), storage-export-daemon calls
>>>> fuse_uring_start() to complete initialization, ultimately submitting
>>>> the SQE with the FUSE_IO_URING_CMD_REGISTER command to confirm
>>>> successful initialization with the kernel.
>>>>
>>>> We also added support for multiple IOThreads. The current Linux kernel
>>>> requires registering $(nproc) queues when setting up FUSE-over-io_uring
>>>> To let users customize the number of FUSE Queues (i.e., IOThreads),
>>>> we first create nproc Ring Queues as required by the kernel, then
>>>> distribute them in a round-robin manner to the FUSE Queues for
>>>> registration. In addition, to support multiple in-flight requests,
>>>> we configure each Ring Queue with FUSE_DEFAULT_RING_QUEUE_DEPTH
>>>> entries/requests.
>>>
>>> The previous paragraph says "each currently supporting a single
>>> submission queue entry (SQE)" whereas this paragraph says "we configure
>>> each Ring Queue with FUSE_DEFAULT_RING_QUEUE_DEPTH entries/requests".
>>> Maybe this paragraph was squashed into the commit description in a later
>>> step and the previous paragraph can be updated to reflect that multiple
>>> SQEs are submitted?
>>>
>>>>
>>>> Suggested-by: Kevin Wolf <kwolf@redhat.com>
>>>> Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
>>>> Signed-off-by: Brian Song <hibriansong@gmail.com>
>>>> ---
>>>>    block/export/fuse.c                  | 310 +++++++++++++++++++++++++--
>>>>    docs/tools/qemu-storage-daemon.rst   |  11 +-
>>>>    qapi/block-export.json               |   5 +-
>>>>    storage-daemon/qemu-storage-daemon.c |   1 +
>>>>    util/fdmon-io_uring.c                |   5 +-
>>>>    5 files changed, 309 insertions(+), 23 deletions(-)
>>>>
>>>> diff --git a/block/export/fuse.c b/block/export/fuse.c
>>>> index c0ad4696ce..19bf9e5f74 100644
>>>> --- a/block/export/fuse.c
>>>> +++ b/block/export/fuse.c
>>>> @@ -48,6 +48,9 @@
>>>>    #include <linux/fs.h>
>>>>    #endif
>>>> +/* room needed in buffer to accommodate header */
>>>> +#define FUSE_BUFFER_HEADER_SIZE 0x1000
>>>
>>> Is it possible to write this in a way that shows how the constant is
>>> calculated? That way the constant would automatically adjust on systems
>>> where the underlying assumptions have changed (e.g. page size, header
>>> struct size). This approach is also self-documenting so it's possible to
>>> understand where the magic number comes from.
>>>
>>> For example:
>>>
>>>     #define FUSE_BUFFER_HEADER_SIZE DIV_ROUND_UP(sizeof(struct fuse_uring_req_header), qemu_real_host_page_size())
>>>
>>> (I'm guessing what the formula you used is, so this example may be
>>> incorrect...)
>>>
>>
>> In libfuse, the way to calculate the bufsize (for req_payload) is the same
>> as in this patch. For different requests, the request header sizes are not
>> the same, but they should never exceed a certain value. So is that why
>> libfuse has this kind of magic number?
> 
>  From <linux/fuse.h>:
> 
>    #define FUSE_URING_IN_OUT_HEADER_SZ 128
>    #define FUSE_URING_OP_IN_OUT_SZ 128
>    ...
>    struct fuse_uring_req_header {
>            /* struct fuse_in_header / struct fuse_out_header */
>            char in_out[FUSE_URING_IN_OUT_HEADER_SZ];
> 
>            /* per op code header */
>            char op_in[FUSE_URING_OP_IN_OUT_SZ];
> 
>            struct fuse_uring_ent_in_out ring_ent_in_out;
>    };
> 
> The size of struct fuse_uring_req_header is 128 + 128 + (4 * 8) = 288
> bytes. A single 4 KB page easily fits this. I guess that's why 0x1000
> was chosen in libfuse.
> 

Yes, there are two iovecs in the ring entry: one refers to the general request 
header (fuse_uring_req_header) and the other refers to the payload. The 
variable bufsize represents the space for these two objects and is used 
to calculate the payload size in case max_write changes.

Alright, let me document the buffer usage. It's been a while since I 
started this, so I don’t fully remember how the buffer works here.

>>
>>>> +
>>>>    /* Prevent overly long bounce buffer allocations */
>>>>    #define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 1 * 1024 * 1024))
>>>>    /*
>>>> @@ -63,12 +66,59 @@
>>>>        (FUSE_MAX_WRITE_BYTES - FUSE_IN_PLACE_WRITE_BYTES)
>>>>    typedef struct FuseExport FuseExport;
>>>> +typedef struct FuseQueue FuseQueue;
>>>> +
>>>> +#ifdef CONFIG_LINUX_IO_URING
>>>> +#define FUSE_DEFAULT_RING_QUEUE_DEPTH 64
>>>> +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
>>>> +
>>>> +typedef struct FuseRingQueue FuseRingQueue;
>>>> +typedef struct FuseRingEnt {
>>>> +    /* back pointer */
>>>> +    FuseRingQueue *rq;
>>>> +
>>>> +    /* commit id of a fuse request */
>>>> +    uint64_t req_commit_id;
>>>
>>> This field is not used in this commit. Please introduce it in the commit
>>> that uses it so it's easier to review and understand the purpose of this
>>> field.
>>>
>>>> +
>>>> +    /* fuse request header and payload */
>>>> +    struct fuse_uring_req_header req_header;
>>>> +    void *op_payload;
>>>> +    size_t req_payload_sz;
>>>
>>> op_payload and req_payload_sz refer to the same buffer, and they are
>>> submitted alongside req_header. It would be nice to name the fields
>>> consistently:
>>>
>>>     struct fuse_uring_req_header req_header;
>>>     void *req_payload;
>>>     size_t req_payload_sz;
>>>
>>> req_payload and req_payload_sz could be eliminated since they are also
>>> stored in iov[1].iov_base and .iov_len. If you feel that would be harder
>>> to understand, then it's okay to keep the duplicate fields.
>>>
>>
>> Makes sense. I followed the design in libfuse. Probably best to just leave
>> them in the struct for readability
>>
>>>> +
>>>> +    /* The vector passed to the kernel */
>>>> +    struct iovec iov[2];
>>>> +
>>>> +    CqeHandler fuse_cqe_handler;
>>>> +} FuseRingEnt;
>>>> +
>>>> +struct FuseRingQueue {
>>>
>>> A comment would be nice here to explain that the kernel requires one
>>> FuseRingQueue per host CPU and this concept is independent of /dev/fuse
>>> (FuseQueue).
>>>
>>>> +    int rqid;
>>>> +
>>>> +    /* back pointer */
>>>> +    FuseQueue *q;
>>>> +    FuseRingEnt *ent;
>>>> +
>>>> +    /* List entry for ring_queues */
>>>> +    QLIST_ENTRY(FuseRingQueue) next;
>>>> +};
>>>> +
>>>> +/*
>>>> + * Round-robin distribution of ring queues across FUSE queues.
>>>> + * This structure manages the mapping between kernel ring queues and user
>>>> + * FUSE queues.
>>>> + */
>>>> +typedef struct FuseRingQueueManager {
>>>> +    FuseRingQueue *ring_queues;
>>>> +    int num_ring_queues;
>>>> +    int num_fuse_queues;
>>>> +} FuseRingQueueManager;
>>>> +#endif
>>>
>>> It's easy to forget which #ifdef we're inside after a few lines, so it
>>> helps to indicate that in a comment:
>>>
>>> #endif /* CONFIG_LINUX_IO_URING */
>>>
>>>>    /*
>>>>     * One FUSE "queue", representing one FUSE FD from which requests are fetched
>>>>     * and processed.  Each queue is tied to an AioContext.
>>>>     */
>>>> -typedef struct FuseQueue {
>>>> +struct FuseQueue {
>>>>        FuseExport *exp;
>>>>        AioContext *ctx;
>>>> @@ -109,15 +159,11 @@ typedef struct FuseQueue {
>>>>         * Free this buffer with qemu_vfree().
>>>>         */
>>>>        void *spillover_buf;
>>>> -} FuseQueue;
>>>> -/*
>>>> - * Verify that FuseQueue.request_buf plus the spill-over buffer together
>>>> - * are big enough to be accepted by the FUSE kernel driver.
>>>> - */
>>>> -QEMU_BUILD_BUG_ON(sizeof(((FuseQueue *)0)->request_buf) +
>>>> -                  FUSE_SPILLOVER_BUF_SIZE <
>>>> -                  FUSE_MIN_READ_BUFFER);
>>>
>>> Why was this removed, it's probably still necessary in the non-io_uring
>>> case (which is compiled in even when CONFIG_LINUX_IO_URING is defined)?
>>>
>>
>> You can check Hanna’s patch. In fuse_co_process_request, Hanna introduced
>> this check when using FUSE_OUT_OP_STRUCT to cast void *buf into the
>> corresponding in/out header for the given operation.
>>
>> But in the v2 patch, we merged the legacy process_request and the uring
>> version into one. This caused the legacy path to pass the array into the
>> common function as a pointer. Now, when we do the buf header size check,
>> what gets checked is just the pointer size.
>>
>> #define FUSE_OUT_OP_STRUCT(op_name, out_buf) \
>>      ({ \
>>          struct fuse_out_header *__out_hdr = \
>>              (struct fuse_out_header *)(out_buf); \
>>          struct fuse_##op_name##_out *__out = \
>>              (struct fuse_##op_name##_out *)(__out_hdr + 1); \
>>          \
>>          QEMU_BUILD_BUG_ON(sizeof(*__out_hdr) + sizeof(*__out) > \
>>                            sizeof(out_buf)); \
>>          \
>>          __out; \
>>      })
> 
> Your patch does not change how ->request_buf is used by the non-io_uring
> code path. ->request_buf needs to fit at least FUSE_MIN_READ_BUFFER
> bytes so I think this QEMU_BUILD_BUG_ON() should not be deleted.
> 

Oh, I misread and thought you were mentioning the QEMU_BUILD_BUG_ON 
deleted in FUSE_IN/OUT_OP_STRUCT_LEGACY. Yes, I mistakenly deleted the 
static assertion for the read buffer and will put it back.

>>
>>
>>>> +#ifdef CONFIG_LINUX_IO_URING
>>>> +    QLIST_HEAD(, FuseRingQueue) ring_queue_list;
>>>> +#endif
>>>> +};
>>>>    struct FuseExport {
>>>>        BlockExport common;
>>>> @@ -133,7 +179,7 @@ struct FuseExport {
>>>>         */
>>>>        bool halted;
>>>> -    int num_queues;
>>>> +    size_t num_queues;
>>>>        FuseQueue *queues;
>>>>        /*
>>>>         * True if this export should follow the generic export's AioContext.
>>>> @@ -149,6 +195,12 @@ struct FuseExport {
>>>>        /* Whether allow_other was used as a mount option or not */
>>>>        bool allow_other;
>>>> +#ifdef CONFIG_LINUX_IO_URING
>>>> +    bool is_uring;
>>>> +    size_t ring_queue_depth;
>>>> +    FuseRingQueueManager *ring_queue_manager;
>>>> +#endif
>>>> +
>>>>        mode_t st_mode;
>>>>        uid_t st_uid;
>>>>        gid_t st_gid;
>>>> @@ -205,7 +257,7 @@ static void fuse_attach_handlers(FuseExport *exp)
>>>>            return;
>>>>        }
>>>> -    for (int i = 0; i < exp->num_queues; i++) {
>>>> +    for (size_t i = 0; i < exp->num_queues; i++) {
>>>>            aio_set_fd_handler(exp->queues[i].ctx, exp->queues[i].fuse_fd,
>>>>                               read_from_fuse_fd, NULL, NULL, NULL,
>>>>                               &exp->queues[i]);
>>>> @@ -257,6 +309,189 @@ static const BlockDevOps fuse_export_blk_dev_ops = {
>>>>        .drained_poll  = fuse_export_drained_poll,
>>>>    };
>>>> +#ifdef CONFIG_LINUX_IO_URING
>>>> +static void fuse_uring_sqe_set_req_data(struct fuse_uring_cmd_req *req,
>>>> +                    const unsigned int rqid,
>>>> +                    const unsigned int commit_id)
>>>> +{
>>>> +    req->qid = rqid;
>>>> +    req->commit_id = commit_id;
>>>> +    req->flags = 0;
>>>> +}
>>>> +
>>>> +static void fuse_uring_sqe_prepare(struct io_uring_sqe *sqe, FuseQueue *q,
>>>> +               __u32 cmd_op)
>>>> +{
>>>> +    sqe->opcode = IORING_OP_URING_CMD;
>>>> +
>>>> +    sqe->fd = q->fuse_fd;
>>>> +    sqe->rw_flags = 0;
>>>> +    sqe->ioprio = 0;
>>>> +    sqe->off = 0;
>>>> +
>>>> +    sqe->cmd_op = cmd_op;
>>>> +    sqe->__pad1 = 0;
>>>> +}
>>>> +
>>>> +static void fuse_uring_prep_sqe_register(struct io_uring_sqe *sqe, void *opaque)
>>>> +{
>>>> +    FuseRingEnt *ent = opaque;
>>>> +    struct fuse_uring_cmd_req *req = (void *)&sqe->cmd[0];
>>>> +
>>>> +    fuse_uring_sqe_prepare(sqe, ent->rq->q, FUSE_IO_URING_CMD_REGISTER);
>>>> +
>>>> +    sqe->addr = (uint64_t)(ent->iov);
>>>> +    sqe->len = 2;
>>>> +
>>>> +    fuse_uring_sqe_set_req_data(req, ent->rq->rqid, 0);
>>>> +}
>>>> +
>>>> +static void fuse_uring_submit_register(void *opaque)
>>>> +{
>>>> +    FuseRingEnt *ent = opaque;
>>>> +    FuseExport *exp = ent->rq->q->exp;
>>>
>>> This variable is unused in this commit? Does this commit compile for
>>> you? Usually the compiler warns about unused variables.
>>>
>>
>> The first version was a large single patch. I split it with git, and this
>> variable is now used in a different patch
>>
>>>> +
>>>> +
>>>> +    aio_add_sqe(fuse_uring_prep_sqe_register, ent, &(ent->fuse_cqe_handler));
>>>> +}
>>>> +
>>>> +/**
>>>> + * Distribute ring queues across FUSE queues using round-robin algorithm.
>>>> + * This ensures even distribution of kernel ring queues across user-specified
>>>> + * FUSE queues.
>>>> + */
>>>> +static
>>>> +FuseRingQueueManager *fuse_ring_queue_manager_create(int num_fuse_queues,
>>>> +                                                    size_t ring_queue_depth,
>>>> +                                                    size_t bufsize)
>>>> +{
>>>> +    int num_ring_queues = get_nprocs();
>>>
>>> The kernel code uses num_possible_cpus() in
>>> fs/fuse/dev_uring.c:fuse_uring_create() so I think this should be
>>> get_nprocs_conf() instead of get_nprocs().
>>>
>>>> +    FuseRingQueueManager *manager = g_new(FuseRingQueueManager, 1);
>>>> +
>>>> +    if (!manager) {
>>>
>>> g_new() never returns NULL, so you can remove this if statement. If
>>> memory cannot be allocated then the process will abort.
>>>
>>>> +        return NULL;
>>>> +    }
>>>> +
>>>> +    manager->ring_queues = g_new(FuseRingQueue, num_ring_queues);
>>>> +    manager->num_ring_queues = num_ring_queues;
>>>> +    manager->num_fuse_queues = num_fuse_queues;
>>>> +
>>>> +    if (!manager->ring_queues) {
>>>
>>> Same here.
>>>
>>>> +        g_free(manager);
>>>> +        return NULL;
>>>> +    }
>>>> +
>>>> +    for (int i = 0; i < num_ring_queues; i++) {
>>>> +        FuseRingQueue *rq = &manager->ring_queues[i];
>>>> +        rq->rqid = i;
>>>> +        rq->ent = g_new(FuseRingEnt, ring_queue_depth);
>>>> +
>>>> +        if (!rq->ent) {
>>>
>>> Same here.
>>>
>>>> +            for (int j = 0; j < i; j++) {
>>>> +                g_free(manager->ring_queues[j].ent);
>>>> +            }
>>>> +            g_free(manager->ring_queues);
>>>> +            g_free(manager);
>>>> +            return NULL;
>>>> +        }
>>>> +
>>>> +        for (size_t j = 0; j < ring_queue_depth; j++) {
>>>> +            FuseRingEnt *ent = &rq->ent[j];
>>>> +            ent->rq = rq;
>>>> +            ent->req_payload_sz = bufsize - FUSE_BUFFER_HEADER_SIZE;
>>>> +            ent->op_payload = g_malloc0(ent->req_payload_sz);
>>>> +
>>>> +            if (!ent->op_payload) {
>>>
>>> Same here.
>>>
>>>> +                for (size_t k = 0; k < j; k++) {
>>>> +                    g_free(rq->ent[k].op_payload);
>>>> +                }
>>>> +                g_free(rq->ent);
>>>> +                for (int k = 0; k < i; k++) {
>>>> +                    g_free(manager->ring_queues[k].ent);
>>>> +                }
>>>> +                g_free(manager->ring_queues);
>>>> +                g_free(manager);
>>>
>>> Where are these structures freed in the normal lifecycle of a FUSE
>>> export? I only see this error handling code, but nothing is freed when
>>> the export is shut down.
>>
>>
>> Same here. The first version was a large single patch. I split it with git,
>> and we do cleanup in a different patch
> 
> It's easier for reviewers and safer for backports if each patch is
> self-contained with the cleanup code included in the same patch where
> the resource is created. If you make changes to the patch organization
> in the next revision then it would be nice to include the cleanup in
> this patch.
> 
>>
>>>
>>>> +                return NULL;
>>>> +            }
>>>> +
>>>> +            ent->iov[0] = (struct iovec) {
>>>> +                &(ent->req_header),
>>>> +                sizeof(struct fuse_uring_req_header)
>>>> +            };
>>>> +            ent->iov[1] = (struct iovec) {
>>>> +                ent->op_payload,
>>>> +                ent->req_payload_sz
>>>> +            };
>>>> +
>>>> +            ent->fuse_cqe_handler.cb = fuse_uring_cqe_handler;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    return manager;
>>>> +}
>>>> +
>>>> +static
>>>> +void fuse_distribute_ring_queues(FuseExport *exp, FuseRingQueueManager *manager)
>>>> +{
>>>> +    int queue_index = 0;
>>>> +
>>>> +    for (int i = 0; i < manager->num_ring_queues; i++) {
>>>> +        FuseRingQueue *rq = &manager->ring_queues[i];
>>>> +
>>>> +        rq->q = &exp->queues[queue_index];
>>>> +        QLIST_INSERT_HEAD(&(rq->q->ring_queue_list), rq, next);
>>>> +
>>>> +        queue_index = (queue_index + 1) % manager->num_fuse_queues;
>>>> +    }
>>>> +}
>>>> +
>>>> +static
>>>> +void fuse_schedule_ring_queue_registrations(FuseExport *exp,
>>>> +                                            FuseRingQueueManager *manager)
>>>> +{
>>>> +    for (int i = 0; i < manager->num_fuse_queues; i++) {
>>>> +        FuseQueue *q = &exp->queues[i];
>>>> +        FuseRingQueue *rq;
>>>> +
>>>> +        QLIST_FOREACH(rq, &q->ring_queue_list, next) {
>>>> +            for (int j = 0; j < exp->ring_queue_depth; j++) {
>>>> +                aio_bh_schedule_oneshot(q->ctx, fuse_uring_submit_register,
>>>> +                                        &(rq->ent[j]));
>>>> +            }
>>>> +        }
>>>> +    }
>>>> +}
>>>> +
>>>> +static void fuse_uring_start(FuseExport *exp, struct fuse_init_out *out)
>>>> +{
>>>> +    /*
>>>> +     * Since we didn't enable the FUSE_MAX_PAGES feature, the value of
>>>> +     * fc->max_pages should be FUSE_DEFAULT_MAX_PAGES_PER_REQ, which is set by
>>>> +     * the kernel by default. Also, max_write should not exceed
>>>> +     * FUSE_DEFAULT_MAX_PAGES_PER_REQ * PAGE_SIZE.
>>>> +     */
>>>> +    size_t bufsize = out->max_write + FUSE_BUFFER_HEADER_SIZE;
>>>> +
>>>> +    if (!(out->flags & FUSE_MAX_PAGES)) {
>>>> +        bufsize = FUSE_DEFAULT_MAX_PAGES_PER_REQ * qemu_real_host_page_size()
>>>> +                         + FUSE_BUFFER_HEADER_SIZE;
>>>> +    }
>>>> +
>>>> +    exp->ring_queue_manager = fuse_ring_queue_manager_create(
>>>> +        exp->num_queues, exp->ring_queue_depth, bufsize);
>>>> +
>>>> +    if (!exp->ring_queue_manager) {
>>>> +        error_report("Failed to create ring queue manager");
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    /* Distribute ring queues across FUSE queues using round-robin */
>>>> +    fuse_distribute_ring_queues(exp, exp->ring_queue_manager);
>>>> +
>>>> +    fuse_schedule_ring_queue_registrations(exp, exp->ring_queue_manager);
>>>> +}
>>>> +#endif
>>>> +
>>>>    static int fuse_export_create(BlockExport *blk_exp,
>>>>                                  BlockExportOptions *blk_exp_args,
>>>>                                  AioContext *const *multithread,
>>>> @@ -270,6 +505,11 @@ static int fuse_export_create(BlockExport *blk_exp,
>>>>        assert(blk_exp_args->type == BLOCK_EXPORT_TYPE_FUSE);
>>>> +#ifdef CONFIG_LINUX_IO_URING
>>>> +    exp->is_uring = args->io_uring;
>>>> +    exp->ring_queue_depth = FUSE_DEFAULT_RING_QUEUE_DEPTH;
>>>> +#endif
>>>> +
>>>>        if (multithread) {
>>>>            /* Guaranteed by common export code */
>>>>            assert(mt_count >= 1);
>>>> @@ -283,6 +523,10 @@ static int fuse_export_create(BlockExport *blk_exp,
>>>>                    .exp = exp,
>>>>                    .ctx = multithread[i],
>>>>                    .fuse_fd = -1,
>>>> +#ifdef CONFIG_LINUX_IO_URING
>>>> +                .ring_queue_list =
>>>> +                    QLIST_HEAD_INITIALIZER(exp->queues[i].ring_queue_list),
>>>> +#endif
>>>>                };
>>>>            }
>>>>        } else {
>>>> @@ -296,6 +540,10 @@ static int fuse_export_create(BlockExport *blk_exp,
>>>>                .exp = exp,
>>>>                .ctx = exp->common.ctx,
>>>>                .fuse_fd = -1,
>>>> +#ifdef CONFIG_LINUX_IO_URING
>>>> +            .ring_queue_list =
>>>> +                QLIST_HEAD_INITIALIZER(exp->queues[0].ring_queue_list),
>>>> +#endif
>>>>            };
>>>>        }
>>>> @@ -685,17 +933,39 @@ static bool is_regular_file(const char *path, Error **errp)
>>>>     */
>>>>    static ssize_t coroutine_fn
>>>>    fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
>>>> -             uint32_t max_readahead, uint32_t flags)
>>>> +             uint32_t max_readahead, const struct fuse_init_in *in)
>>>>    {
>>>> -    const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO;
>>>> +    uint64_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO
>>>> +                                     | FUSE_INIT_EXT;
>>>> +    uint64_t outargflags = 0;
>>>> +    uint64_t inargflags = in->flags;
>>>> +
>>>> +    ssize_t ret = 0;
>>>> +
>>>> +    if (inargflags & FUSE_INIT_EXT) {
>>>> +        inargflags = inargflags | (uint64_t) in->flags2 << 32;
>>>> +    }
>>>> +
>>>> +#ifdef CONFIG_LINUX_IO_URING
>>>> +    if (exp->is_uring) {
>>>> +        if (inargflags & FUSE_OVER_IO_URING) {
>>>> +            supported_flags |= FUSE_OVER_IO_URING;
>>>> +        } else {
>>>> +            exp->is_uring = false;
>>>> +            ret = -ENODEV;
>>>> +        }
>>>> +    }
>>>> +#endif
>>>> +
>>>> +    outargflags = inargflags & supported_flags;
>>>>        *out = (struct fuse_init_out) {
>>>>            .major = FUSE_KERNEL_VERSION,
>>>>            .minor = FUSE_KERNEL_MINOR_VERSION,
>>>>            .max_readahead = max_readahead,
>>>>            .max_write = FUSE_MAX_WRITE_BYTES,
>>>> -        .flags = flags & supported_flags,
>>>> -        .flags2 = 0,
>>>> +        .flags = outargflags,
>>>> +        .flags2 = outargflags >> 32,
>>>>            /* libfuse maximum: 2^16 - 1 */
>>>>            .max_background = UINT16_MAX,
>>>> @@ -717,7 +987,7 @@ fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
>>>>            .map_alignment = 0,
>>>>        };
>>>> -    return sizeof(*out);
>>>> +    return ret < 0 ? ret : sizeof(*out);
>>>>    }
>>>>    /**
>>>> @@ -1506,6 +1776,14 @@ fuse_co_process_request(FuseQueue *q, void *spillover_buf)
>>>>            fuse_write_buf_response(q->fuse_fd, req_id, out_hdr,
>>>>                                    out_data_buffer, ret);
>>>>            qemu_vfree(out_data_buffer);
>>>> +#ifdef CONFIG_LINUX_IO_URING
>>>> +    /* Handle FUSE-over-io_uring initialization */
>>>> +    if (unlikely(opcode == FUSE_INIT && exp->is_uring)) {
>>>> +        struct fuse_init_out *out =
>>>> +            (struct fuse_init_out *)FUSE_OUT_OP_STRUCT(out_buf);
>>>> +        fuse_uring_start(exp, out);
>>>
>>> Is there any scenario where FUSE_INIT can be received multiple times?
>>> Maybe if the FUSE file system is umounted and mounted again? I want to
>>> check that this doesn't leak previously allocated ring state.
>>>
>>
>> I don't think so. Even in a multi-threaded FUSE setup, the kernel only sends
>> a single FUSE_INIT to userspace. In the legacy mode, whichever thread
>> receives that request can handle it and initialize FUSE-over-io_uring.
> 
> Okay. Please add an assertion to fuse_uring_start() to catch the case
> where it is called twice.
> 
> Thanks,
> Stefan

Re: [PATCH 1/4] export/fuse: add opt to enable FUSE-over-io_uring

Posted by Bernd Schubert 2 weeks, 4 days ago


On 9/9/25 19:46, Brian Song wrote:
> 
> 
> On 9/9/25 10:48 AM, Stefan Hajnoczi wrote:
>> On Wed, Sep 03, 2025 at 02:00:55PM -0400, Brian Song wrote:
>>>
>>>
>>> On 9/3/25 6:53 AM, Stefan Hajnoczi wrote:
>>>> On Fri, Aug 29, 2025 at 10:50:22PM -0400, Brian Song wrote:
>>>>> This patch adds a new export option for storage-export-daemon to enable
>>>>> FUSE-over-io_uring via the switch io-uring=on|off (disabled by default).
>>>>> It also implements the protocol handshake with the Linux kernel
>>>>> during the FUSE-over-io_uring initialization phase.
>>>>>
>>>>> See: https://docs.kernel.org/filesystems/fuse-io-uring.html
>>>>>
>>>>> The kernel documentation describes in detail how FUSE-over-io_uring
>>>>> works. This patch implements the Initial SQE stage shown in the diagram:
>>>>> it initializes one queue per IOThread, each currently supporting a
>>>>> single submission queue entry (SQE). When the FUSE driver sends the
>>>>> first FUSE request (FUSE_INIT), storage-export-daemon calls
>>>>> fuse_uring_start() to complete initialization, ultimately submitting
>>>>> the SQE with the FUSE_IO_URING_CMD_REGISTER command to confirm
>>>>> successful initialization with the kernel.
>>>>>
>>>>> We also added support for multiple IOThreads. The current Linux kernel
>>>>> requires registering $(nproc) queues when setting up FUSE-over-io_uring.
>>>>> To let users customize the number of FUSE Queues (i.e., IOThreads),
>>>>> we first create nproc Ring Queues as required by the kernel, then
>>>>> distribute them in a round-robin manner to the FUSE Queues for
>>>>> registration. In addition, to support multiple in-flight requests,
>>>>> we configure each Ring Queue with FUSE_DEFAULT_RING_QUEUE_DEPTH
>>>>> entries/requests.
>>>>
>>>> The previous paragraph says "each currently supporting a single
>>>> submission queue entry (SQE)" whereas this paragraph says "we configure
>>>> each Ring Queue with FUSE_DEFAULT_RING_QUEUE_DEPTH entries/requests".
>>>> Maybe this paragraph was squashed into the commit description in a later
>>>> step and the previous paragraph can be updated to reflect that multiple
>>>> SQEs are submitted?
>>>>
>>>>>
>>>>> Suggested-by: Kevin Wolf <kwolf@redhat.com>
>>>>> Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
>>>>> Signed-off-by: Brian Song <hibriansong@gmail.com>
>>>>> ---
>>>>>    block/export/fuse.c                  | 310 +++++++++++++++++++++++++--
>>>>>    docs/tools/qemu-storage-daemon.rst   |  11 +-
>>>>>    qapi/block-export.json               |   5 +-
>>>>>    storage-daemon/qemu-storage-daemon.c |   1 +
>>>>>    util/fdmon-io_uring.c                |   5 +-
>>>>>    5 files changed, 309 insertions(+), 23 deletions(-)
>>>>>
>>>>> diff --git a/block/export/fuse.c b/block/export/fuse.c
>>>>> index c0ad4696ce..19bf9e5f74 100644
>>>>> --- a/block/export/fuse.c
>>>>> +++ b/block/export/fuse.c
>>>>> @@ -48,6 +48,9 @@
>>>>>    #include <linux/fs.h>
>>>>>    #endif
>>>>> +/* room needed in buffer to accommodate header */
>>>>> +#define FUSE_BUFFER_HEADER_SIZE 0x1000
>>>>
>>>> Is it possible to write this in a way that shows how the constant is
>>>> calculated? That way the constant would automatically adjust on systems
>>>> where the underlying assumptions have changed (e.g. page size, header
>>>> struct size). This approach is also self-documenting so it's possible to
>>>> understand where the magic number comes from.
>>>>
>>>> For example:
>>>>
>>>>     #define FUSE_BUFFER_HEADER_SIZE DIV_ROUND_UP(sizeof(struct fuse_uring_req_header), qemu_real_host_page_size())
>>>>
>>>> (I'm guessing what the formula you used is, so this example may be
>>>> incorrect...)
>>>>
>>>
>>> In libfuse, the way to calculate the bufsize (for req_payload) is the same
>>> as in this patch. For different requests, the request header sizes are not
>>> the same, but they should never exceed a certain value. So is that why
>>> libfuse has this kind of magic number?
>>
>>  From <linux/fuse.h>:
>>
>>    #define FUSE_URING_IN_OUT_HEADER_SZ 128
>>    #define FUSE_URING_OP_IN_OUT_SZ 128
>>    ...
>>    struct fuse_uring_req_header {
>>            /* struct fuse_in_header / struct fuse_out_header */
>>            char in_out[FUSE_URING_IN_OUT_HEADER_SZ];
>>
>>            /* per op code header */
>>            char op_in[FUSE_URING_OP_IN_OUT_SZ];
>>
>>            struct fuse_uring_ent_in_out ring_ent_in_out;
>>    };
>>
>> The size of struct fuse_uring_req_header is 128 + 128 + (4 * 8) = 288
>> bytes. A single 4 KB page easily fits this. I guess that's why 0x1000
>> was chosen in libfuse.
>>
> 
> Yes, the two iovecs in the ring entry: one refers to the general request 
> header (fuse_uring_req_header) and the other refers to the payload. The 
> variable bufsize represents the space for these two objects and is used 
> to calculate the payload size in case max_write changes.
> 
> Alright, let me document the buffer usage. It's been a while since I 
> started this, so I don’t fully remember how the buffer works here.

With the current kernel code we could make this a 288-byte allocation for the
header. That just does not work with page pinning, which we are using at DDN
(kernel patches not upstreamed yet).

Maybe I should make the way the header is allocated depend on whether page
pinning is used; there is a bit of overhead with 4K headers, although 4K
doesn't sound too bad, even with many queues.


Thanks,
Bernd

[PATCH 1/4] export/fuse: add opt to enable FUSE-over-io_uring
[PATCH 2/4] export/fuse: process FUSE-over-io_uring requests
[PATCH 3/4] export/fuse: Safe termination for FUSE-uring
[PATCH 4/4] iotests: add tests for FUSE-over-io_uring