export/fuse: Use coroutines and multi-threading

[PATCH 15/15] fuse: Increase MAX_WRITE_SIZE with a second buffer

Posted by Hanna Czenczek 1 week ago

We probably want to support larger write sizes than just 4k; 64k seems
nice.  However, we cannot read partial requests from the FUSE FD, we
always have to read requests in full; so our read buffer must be large
enough to accommodate potential 64k writes if we want to support that.

Always allocating FuseRequest objects with 64k buffers in them seems
wasteful, though.  But we can get around the issue by splitting the
buffer into two and using readv(): One part will hold all normal (up to
4k) write requests and all other requests, and a second part (the
"spill-over buffer") will be used only for larger write requests.  Each
FuseQueue has its own spill-over buffer, and only if we find it used
when reading a request will we move its ownership into the FuseRequest
object and allocate a new spill-over buffer for the queue.

This way, we get to support "large" write sizes without having to
allocate big buffers when they aren't used.

Also, this even reduces the size of the FuseRequest objects because the
read buffer has to have at least FUSE_MIN_READ_BUFFER (8192) bytes; but
the requests we support are not quite so large (except for >4k writes),
so until now, we basically had to have useless padding in there.

With the spill-over buffer added, the FUSE_MIN_READ_BUFFER requirement
is easily met and we can decrease the size of the buffer portion that is
right inside of FuseRequest.

As for benchmarks, the benefit of this patch can be shown easily by
writing a 4G image (with qemu-img convert) to a FUSE export:
- Before this patch: Takes 25.6 s (14.4 s with -t none)
- After this patch: Takes 4.5 s (5.5 s with -t none)

Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
---
 block/export/fuse.c | 137 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 118 insertions(+), 19 deletions(-)

diff --git a/block/export/fuse.c b/block/export/fuse.c
index 0edd994392..a24c5538b3 100644
--- a/block/export/fuse.c
+++ b/block/export/fuse.c
@@ -50,8 +50,17 @@
 
 /* Prevent overly long bounce buffer allocations */
 #define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 1 * 1024 * 1024))
-/* Small enough to fit in the request buffer */
-#define FUSE_MAX_WRITE_BYTES (4 * 1024)
+/*
+ * FUSE_MAX_WRITE_BYTES determines the maximum number of bytes we support in a
+ * write request; FUSE_IN_PLACE_WRITE_BYTES and FUSE_SPILLOVER_BUF_SIZE
+ * determine the split between the size of the in-place buffer in FuseRequest
+ * and the spill-over buffer in FuseQueue.  See FuseQueue.spillover_buf for a
+ * detailed explanation.
+ */
+#define FUSE_IN_PLACE_WRITE_BYTES (4 * 1024)
+#define FUSE_MAX_WRITE_BYTES (64 * 1024)
+#define FUSE_SPILLOVER_BUF_SIZE \
+    (FUSE_MAX_WRITE_BYTES - FUSE_IN_PLACE_WRITE_BYTES)
 
 typedef struct FuseExport FuseExport;
 
@@ -67,15 +76,49 @@ typedef struct FuseQueue {
 
     /*
      * The request buffer must be able to hold a full write, and/or at least
-     * FUSE_MIN_READ_BUFFER (from linux/fuse.h) bytes
+     * FUSE_MIN_READ_BUFFER (from linux/fuse.h) bytes.
+     * This however is just the first part of the buffer; every read is given
+     * a vector of this buffer (which should be enough for all normal requests,
+     * which we check via the static assertion in FUSE_IN_OP_STRUCT()) and the
+     * spill-over buffer below.
+     * Therefore, the size of this buffer plus FUSE_SPILLOVER_BUF_SIZE must be
+     * FUSE_MIN_READ_BUFFER or more (checked via static assertion below).
+     */
+    char request_buf[sizeof(struct fuse_in_header) +
+                     sizeof(struct fuse_write_in) +
+                     FUSE_IN_PLACE_WRITE_BYTES];
+
+    /*
+     * When retrieving a FUSE request, the destination buffer must always be
+     * sufficiently large for the whole request, i.e. with max_write=64k, we
+     * must provide a buffer that fits the WRITE header and 64 kB of space for
+     * data.
+     * We do want to support 64k write requests without requiring them to be
+     * split up, but at the same time, do not want to do such a large allocation
+     * for every single request.
+     * Therefore, the FuseRequest object provides an in-line buffer that is
+     * enough for write requests up to 4k (and all other requests), and for
+     * every request that is bigger, we provide a spill-over buffer here (for
+     * the remaining 64k - 4k = 60k).
+     * When poll_fuse_fd() reads a FUSE request, it passes these buffers as an
+     * I/O vector, and then checks the return value (number of bytes read) to
+     * find out whether the spill-over buffer was used.  If so, it will move the
+     * buffer to the request, and will allocate a new spill-over buffer for the
+     * next request.
+     *
+     * Free this buffer with qemu_vfree().
      */
-    char request_buf[MAX_CONST(
-        sizeof(struct fuse_in_header) + sizeof(struct fuse_write_in) +
-             FUSE_MAX_WRITE_BYTES,
-        FUSE_MIN_READ_BUFFER
-    )];
+    void *spillover_buf;
 } FuseQueue;
 
+/*
+ * Verify that FuseQueue.request_buf plus the spill-over buffer together
+ * are big enough to be accepted by the FUSE kernel driver.
+ */
+QEMU_BUILD_BUG_ON(sizeof(((FuseQueue *)0)->request_buf) +
+                  FUSE_SPILLOVER_BUF_SIZE <
+                  FUSE_MIN_READ_BUFFER);
+
 struct FuseExport {
     BlockExport common;
 
@@ -132,7 +175,8 @@ static bool is_regular_file(const char *path, Error **errp);
 
 static bool poll_fuse_fd(void *opaque);
 static void read_fuse_fd(void *opaque);
-static void coroutine_fn fuse_co_process_request(FuseQueue *q);
+static void coroutine_fn
+fuse_co_process_request(FuseQueue *q, void *spillover_buf);
 
 static void fuse_inc_in_flight(FuseExport *exp)
 {
@@ -494,6 +538,8 @@ static void coroutine_fn co_read_from_fuse_fd(void *opaque)
     FuseExport *exp = q->exp;
     ssize_t ret;
     const struct fuse_in_header *in_hdr;
+    struct iovec iov[2];
+    void *spillover_buf = NULL;
 
     fuse_inc_in_flight(exp);
 
@@ -501,7 +547,20 @@ static void coroutine_fn co_read_from_fuse_fd(void *opaque)
         goto no_request;
     }
 
-    ret = RETRY_ON_EINTR(read(fuse_fd, q->request_buf, sizeof(q->request_buf)));
+    /*
+     * If handling the last request consumed the spill-over buffer, allocate a
+     * new one.  Align it to the block device's alignment, which admittedly is
+     * only useful if FUSE_IN_PLACE_WRITE_BYTES is aligned, too.
+     */
+    if (unlikely(!q->spillover_buf)) {
+        q->spillover_buf = blk_blockalign(exp->common.blk,
+                                          FUSE_SPILLOVER_BUF_SIZE);
+    }
+    /* Construct the I/O vector to hold the FUSE request */
+    iov[0] = (struct iovec) { q->request_buf, sizeof(q->request_buf) };
+    iov[1] = (struct iovec) { q->spillover_buf, FUSE_SPILLOVER_BUF_SIZE };
+
+    ret = RETRY_ON_EINTR(readv(fuse_fd, iov, ARRAY_SIZE(iov)));
     if (ret < 0 && errno == EAGAIN) {
         /* No request available */
         goto no_request;
@@ -530,9 +589,15 @@ static void coroutine_fn co_read_from_fuse_fd(void *opaque)
         goto no_request;
     }
 
+    if (unlikely(ret > sizeof(q->request_buf))) {
+        /* Spillover buffer used, take ownership */
+        spillover_buf = q->spillover_buf;
+        q->spillover_buf = NULL;
+    }
+
     /* Must set this before yielding */
     co_param->got_request = 1;
-    fuse_co_process_request(q);
+    fuse_co_process_request(q, spillover_buf);
     fuse_dec_in_flight(exp);
     return;
 
@@ -607,6 +672,9 @@ static void fuse_export_delete(BlockExport *blk_exp)
         if (i > 0 && q->fuse_fd >= 0) {
             close(q->fuse_fd);
         }
+        if (q->spillover_buf) {
+            qemu_vfree(q->spillover_buf);
+        }
     }
     g_free(exp->queues);
 
@@ -915,17 +983,25 @@ fuse_co_read(FuseExport *exp, void **bufptr, uint64_t offset, uint32_t size)
 }
 
 /**
- * Handle client writes to the exported image.  @buf has the data to be written
- * and will be copied to a bounce buffer before yielding for the first time.
+ * Handle client writes to the exported image.  @in_place_buf has the first
+ * FUSE_IN_PLACE_WRITE_BYTES bytes of the data to be written, @spillover_buf
+ * contains the rest (if any; NULL otherwise).
+ * Data in @in_place_buf is assumed to be overwritten after yielding, so will
+ * be copied to a bounce buffer beforehand.  @spillover_buf in contrast is
+ * assumed to be exclusively owned and will be used as-is.
  * Return the number of bytes written to *out on success, and -errno on error.
  */
 static ssize_t coroutine_fn
 fuse_co_write(FuseExport *exp, struct fuse_write_out *out,
-              uint64_t offset, uint32_t size, const void *buf)
+              uint64_t offset, uint32_t size,
+              const void *in_place_buf, const void *spillover_buf)
 {
+    size_t in_place_size;
     void *copied;
     int64_t blk_len;
     int ret;
+    struct iovec iov[2];
+    QEMUIOVector qiov;
 
     /* Limited by max_write, should not happen */
     if (size > BDRV_REQUEST_MAX_BYTES) {
@@ -937,8 +1013,9 @@ fuse_co_write(FuseExport *exp, struct fuse_write_out *out,
     }
 
     /* Must copy to bounce buffer before potentially yielding */
-    copied = blk_blockalign(exp->common.blk, size);
-    memcpy(copied, buf, size);
+    in_place_size = MIN(size, FUSE_IN_PLACE_WRITE_BYTES);
+    copied = blk_blockalign(exp->common.blk, in_place_size);
+    memcpy(copied, in_place_buf, in_place_size);
 
     /**
      * Clients will expect short writes at EOF, so we have to limit
@@ -962,7 +1039,21 @@ fuse_co_write(FuseExport *exp, struct fuse_write_out *out,
         }
     }
 
-    ret = blk_co_pwrite(exp->common.blk, offset, size, copied, 0);
+    iov[0] = (struct iovec) {
+        .iov_base = copied,
+        .iov_len = in_place_size,
+    };
+    if (size > FUSE_IN_PLACE_WRITE_BYTES) {
+        assert(size - FUSE_IN_PLACE_WRITE_BYTES <= FUSE_SPILLOVER_BUF_SIZE);
+        iov[1] = (struct iovec) {
+            .iov_base = (void *)spillover_buf,
+            .iov_len = size - FUSE_IN_PLACE_WRITE_BYTES,
+        };
+        qemu_iovec_init_external(&qiov, iov, 2);
+    } else {
+        qemu_iovec_init_external(&qiov, iov, 1);
+    }
+    ret = blk_co_pwritev(exp->common.blk, offset, size, &qiov, 0);
     if (ret < 0) {
         goto fail_free_buffer;
     }
@@ -1321,8 +1412,14 @@ static int fuse_write_buf_response(int fd, uint32_t req_id,
  * Note that yielding in any request-processing function can overwrite the
  * contents of q->request_buf.  Anything that takes a buffer needs to take
  * care that the content is copied before yielding.
+ *
+ * @spillover_buf can contain the tail of a write request too large to fit into
+ * q->request_buf.  This function takes ownership of it (i.e. will free it),
+ * which assumes that its contents will not be overwritten by concurrent
+ * requests (as opposed to q->request_buf).
  */
-static void coroutine_fn fuse_co_process_request(FuseQueue *q)
+static void coroutine_fn
+fuse_co_process_request(FuseQueue *q, void *spillover_buf)
 {
     FuseExport *exp = q->exp;
     uint32_t opcode;
@@ -1418,7 +1515,7 @@ static void coroutine_fn fuse_co_process_request(FuseQueue *q)
          * yielding.
          */
         ret = fuse_co_write(exp, FUSE_OUT_OP_STRUCT(write, out_buf),
-                            in->offset, in->size, in + 1);
+                            in->offset, in->size, in + 1, spillover_buf);
         break;
     }
 
@@ -1460,6 +1557,8 @@ static void coroutine_fn fuse_co_process_request(FuseQueue *q)
                             ret < 0 ? ret : 0,
                             ret < 0 ? 0 : ret);
     }
+
+    qemu_vfree(spillover_buf);
 }
 
 const BlockExportDriver blk_exp_fuse = {
-- 
2.48.1

Re: [PATCH 15/15] fuse: Increase MAX_WRITE_SIZE with a second buffer

Posted by Eric Blake 16 hours ago

On Tue, Mar 25, 2025 at 05:06:55PM +0100, Hanna Czenczek wrote:
> We probably want to support larger write sizes than just 4k; 64k seems
> nice.  However, we cannot read partial requests from the FUSE FD, we
> always have to read requests in full; so our read buffer must be large
> enough to accommodate potential 64k writes if we want to support that.
> 
> Always allocating FuseRequest objects with 64k buffers in them seems
> wasteful, though.  But we can get around the issue by splitting the
> buffer into two and using readv(): One part will hold all normal (up to
> 4k) write requests and all other requests, and a second part (the
> "spill-over buffer") will be used only for larger write requests.  Each
> FuseQueue has its own spill-over buffer, and only if we find it used
> when reading a request will we move its ownership into the FuseRequest
> object and allocate a new spill-over buffer for the queue.
> 
> This way, we get to support "large" write sizes without having to
> allocate big buffers when they aren't used.
> 
> Also, this even reduces the size of the FuseRequest objects because the
> read buffer has to have at least FUSE_MIN_READ_BUFFER (8192) bytes; but
> the requests we support are not quite so large (except for >4k writes),
> so until now, we basically had to have useless padding in there.
> 
> With the spill-over buffer added, the FUSE_MIN_READ_BUFFER requirement
> is easily met and we can decrease the size of the buffer portion that is
> right inside of FuseRequest.
> 
> As for benchmarks, the benefit of this patch can be shown easily by
> writing a 4G image (with qemu-img convert) to a FUSE export:
> - Before this patch: Takes 25.6 s (14.4 s with -t none)
> - After this patch: Takes 4.5 s (5.5 s with -t none)

Hmm - 64k is still small for some tasks; with qcow2, you can have
clusters up to 2M, and writing a cluster at a time seems to be a
reasonable desire.  Or in other storage areas, nbdcopy (from libnbd)
defaults to 256k and allows up to 32M, or we can even look at lvm that
defaults to extent size of 4M (although with lvm thin volumes,
behavior is more like qcow2 with subclusters in that you don't have to
allocate the entire extent at once).

Maybe it makes sense to have this as a hierarchichal spillover (first
level spills over to 64k, second level spills over to 2M or whatever
ACTUAL maximum you are willing to support)?

Or you can feel free to ignore my comments on this patch - allowing
64k instead of 4k is ALREADY a nice change.

> @@ -501,7 +547,20 @@ static void coroutine_fn co_read_from_fuse_fd(void *opaque)
>          goto no_request;
>      }
>  
> -    ret = RETRY_ON_EINTR(read(fuse_fd, q->request_buf, sizeof(q->request_buf)));
> +    /*
> +     * If handling the last request consumed the spill-over buffer, allocate a
> +     * new one.  Align it to the block device's alignment, which admittedly is
> +     * only useful if FUSE_IN_PLACE_WRITE_BYTES is aligned, too.

What are typical block device minimum alignments?  If 4k is typical,
then it should be easy to make FUSE_IN_PLACE_WRITE_BYTES be 4k.  If
64k is typical, we're already doomed at being aligned.  Is being
unaligned going to cause the block layer to add bounce buffers on us,
if the block layer has a larger alignment than what we use here?

> @@ -915,17 +983,25 @@ fuse_co_read(FuseExport *exp, void **bufptr, uint64_t offset, uint32_t size)
>  }
>  
>  /**
> - * Handle client writes to the exported image.  @buf has the data to be written
> - * and will be copied to a bounce buffer before yielding for the first time.
> + * Handle client writes to the exported image.  @in_place_buf has the first
> + * FUSE_IN_PLACE_WRITE_BYTES bytes of the data to be written, @spillover_buf
> + * contains the rest (if any; NULL otherwise).
> + * Data in @in_place_buf is assumed to be overwritten after yielding, so will
> + * be copied to a bounce buffer beforehand.  @spillover_buf in contrast is
> + * assumed to be exclusively owned and will be used as-is.

Makes sense, although it leads to some interesting bookkeeping.  (I'm
wondering if nbd would benefit from this split-level buffering, since
it supports reads and writes up to 32M)

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.
Virtualization:  qemu.org | libguestfs.org

Re: [PATCH 15/15] fuse: Increase MAX_WRITE_SIZE with a second buffer

Posted by Stefan Hajnoczi 5 days, 21 hours ago

On Tue, Mar 25, 2025 at 05:06:55PM +0100, Hanna Czenczek wrote:
> We probably want to support larger write sizes than just 4k; 64k seems
> nice.  However, we cannot read partial requests from the FUSE FD, we
> always have to read requests in full; so our read buffer must be large
> enough to accommodate potential 64k writes if we want to support that.
> 
> Always allocating FuseRequest objects with 64k buffers in them seems
> wasteful, though.  But we can get around the issue by splitting the
> buffer into two and using readv(): One part will hold all normal (up to
> 4k) write requests and all other requests, and a second part (the
> "spill-over buffer") will be used only for larger write requests.  Each
> FuseQueue has its own spill-over buffer, and only if we find it used
> when reading a request will we move its ownership into the FuseRequest
> object and allocate a new spill-over buffer for the queue.
> 
> This way, we get to support "large" write sizes without having to
> allocate big buffers when they aren't used.
> 
> Also, this even reduces the size of the FuseRequest objects because the
> read buffer has to have at least FUSE_MIN_READ_BUFFER (8192) bytes; but
> the requests we support are not quite so large (except for >4k writes),
> so until now, we basically had to have useless padding in there.
> 
> With the spill-over buffer added, the FUSE_MIN_READ_BUFFER requirement
> is easily met and we can decrease the size of the buffer portion that is
> right inside of FuseRequest.
> 
> As for benchmarks, the benefit of this patch can be shown easily by
> writing a 4G image (with qemu-img convert) to a FUSE export:
> - Before this patch: Takes 25.6 s (14.4 s with -t none)
> - After this patch: Takes 4.5 s (5.5 s with -t none)
> 
> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> ---
>  block/export/fuse.c | 137 ++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 118 insertions(+), 19 deletions(-)

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>