[RFC 7/8] blkio: implement BDRV_REQ_REGISTERED_BUF optimization

Stefan Hajnoczi posted 8 patches 3 years, 10 months ago
Maintainers: Stefan Hajnoczi <stefanha@redhat.com>, Kevin Wolf <kwolf@redhat.com>, Hanna Reitz <hreitz@redhat.com>, Fam Zheng <fam@euphon.net>, John Snow <jsnow@redhat.com>, Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru>, "Philippe Mathieu-Daudé" <f4bug@amsat.org>, "Michael S. Tsirkin" <mst@redhat.com>, Eduardo Habkost <eduardo@habkost.net>, Marcel Apfelbaum <marcel.apfelbaum@gmail.com>, Yanan Wang <wangyanan55@huawei.com>, Markus Armbruster <armbru@redhat.com>, Eric Blake <eblake@redhat.com>, Paolo Bonzini <pbonzini@redhat.com>, Thomas Huth <thuth@redhat.com>, Laurent Vivier <lvivier@redhat.com>
There is a newer version of this series
[RFC 7/8] blkio: implement BDRV_REQ_REGISTERED_BUF optimization
Posted by Stefan Hajnoczi 3 years, 10 months ago
Avoid bounce buffers when QEMUIOVector elements are within previously
registered bdrv_register_buf() buffers.

The idea is that emulated storage controllers will register guest RAM
using bdrv_register_buf() and set the BDRV_REQ_REGISTERED_BUF on I/O
requests. Therefore no blkio_add_mem_region() calls are necessary in the
performance-critical I/O code path.

This optimization doesn't apply if the I/O buffer is internally
allocated by QEMU (e.g. qcow2 metadata). There we still take the slow
path because BDRV_REQ_REGISTERED_BUF is not set.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/blkio.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 104 insertions(+), 4 deletions(-)

diff --git a/block/blkio.c b/block/blkio.c
index dd2308b967..78f4ca5f49 100644
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -1,7 +1,9 @@
 #include "qemu/osdep.h"
 #include <blkio.h>
 #include "block/block_int.h"
+#include "exec/memory.h"
 #include "qapi/error.h"
+#include "qemu/error-report.h"
 #include "qapi/qmp/qdict.h"
 #include "qemu/module.h"
 
@@ -26,6 +28,9 @@ typedef struct {
     /* Can we skip adding/deleting blkio_mem_regions? */
     bool needs_mem_regions;
 
+    /* Are file descriptors necessary for blkio_mem_regions? */
+    bool needs_mem_region_fd;
+
     /*
      * blkio_completion_fd_poll() stashes the next completion for
      * blkio_completion_fd_poll_ready().
@@ -170,6 +175,8 @@ static BlockAIOCB *blkio_aio_preadv(BlockDriverState *bs, int64_t offset,
         BlockCompletionFunc *cb, void *opaque)
 {
     BDRVBlkioState *s = bs->opaque;
+    bool needs_mem_regions =
+        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
     struct iovec *iov = qiov->iov;
     int iovcnt = qiov->niov;
     BlkioAIOCB *acb;
@@ -179,7 +186,7 @@ static BlockAIOCB *blkio_aio_preadv(BlockDriverState *bs, int64_t offset,
 
     acb = blkio_aiocb_get(bs, cb, opaque);
 
-    if (s->needs_mem_regions) {
+    if (needs_mem_regions) {
         if (blkio_aiocb_init_mem_region_locked(acb, bytes) < 0) {
             qemu_aio_unref(&acb->common);
             return NULL;
@@ -194,7 +201,7 @@ static BlockAIOCB *blkio_aio_preadv(BlockDriverState *bs, int64_t offset,
 
     ret = blkioq_readv(s->blkioq, offset, iov, iovcnt, acb, 0);
     if (ret < 0) {
-        if (s->needs_mem_regions) {
+        if (needs_mem_regions) {
             blkio_free_mem_region(s->blkio, &acb->mem_region);
             qemu_iovec_destroy(&acb->qiov);
         }
@@ -215,6 +222,8 @@ static BlockAIOCB *blkio_aio_pwritev(BlockDriverState *bs, int64_t offset,
 {
     uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
     BDRVBlkioState *s = bs->opaque;
+    bool needs_mem_regions =
+        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
     struct iovec *iov = qiov->iov;
     int iovcnt = qiov->niov;
     BlkioAIOCB *acb;
@@ -224,7 +233,7 @@ static BlockAIOCB *blkio_aio_pwritev(BlockDriverState *bs, int64_t offset,
 
     acb = blkio_aiocb_get(bs, cb, opaque);
 
-    if (s->needs_mem_regions) {
+    if (needs_mem_regions) {
         if (blkio_aiocb_init_mem_region_locked(acb, bytes) < 0) {
             qemu_aio_unref(&acb->common);
             return NULL;
@@ -238,7 +247,7 @@ static BlockAIOCB *blkio_aio_pwritev(BlockDriverState *bs, int64_t offset,
 
     ret = blkioq_writev(s->blkioq, offset, iov, iovcnt, acb, blkio_flags);
     if (ret < 0) {
-        if (s->needs_mem_regions) {
+        if (needs_mem_regions) {
             blkio_free_mem_region(s->blkio, &acb->mem_region);
         }
         qemu_aio_unref(&acb->common);
@@ -286,6 +295,83 @@ static void blkio_io_unplug(BlockDriverState *bs)
     }
 }
 
+static void blkio_register_buf(BlockDriverState *bs, void *host, size_t size)
+{
+    BDRVBlkioState *s = bs->opaque;
+    char *errmsg;
+    int ret;
+    struct blkio_mem_region region = (struct blkio_mem_region){
+        .addr = host,
+        .len = size,
+        .fd = -1,
+    };
+
+    if (((uintptr_t)host | size) % s->mem_region_alignment) {
+        error_report_once("%s: skipping unaligned buf %p with size %zu",
+                          __func__, host, size);
+        return; /* skip unaligned */
+    }
+
+    /* Attempt to find the fd for a MemoryRegion */
+    if (s->needs_mem_region_fd) {
+        int fd = -1;
+        ram_addr_t offset;
+        MemoryRegion *mr;
+
+        /*
+         * bdrv_register_buf() is called with the BQL held so mr lives at least
+         * until this function returns.
+         */
+        mr = memory_region_from_host(host, &offset);
+        if (mr) {
+            fd = memory_region_get_fd(mr);
+        }
+        if (fd == -1) {
+            error_report_once("%s: skipping fd-less buf %p with size %zu",
+                              __func__, host, size);
+            return; /* skip if there is no fd */
+        }
+
+        region.fd = fd;
+        region.fd_offset = offset;
+    }
+
+    WITH_QEMU_LOCK_GUARD(&s->lock) {
+        ret = blkio_add_mem_region(s->blkio, &region, &errmsg);
+    }
+
+    if (ret < 0) {
+        error_report_once("Failed to add blkio mem region %p with size %zu: %s",
+                          host, size, errmsg);
+        free(errmsg);
+    }
+}
+
+static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
+{
+    BDRVBlkioState *s = bs->opaque;
+    char *errmsg;
+    int ret;
+    struct blkio_mem_region region = (struct blkio_mem_region){
+        .addr = host,
+        .len = size,
+    };
+
+    if (((uintptr_t)host | size) % s->mem_region_alignment) {
+        return; /* skip unaligned */
+    }
+
+    WITH_QEMU_LOCK_GUARD(&s->lock) {
+        ret = blkio_del_mem_region(s->blkio, &region, &errmsg);
+    }
+
+    if (ret < 0) {
+        error_report_once("Failed to delete blkio mem region %p with size %zu: %s",
+                          host, size, errmsg);
+        free(errmsg);
+    }
+}
+
 static void blkio_parse_filename_io_uring(const char *filename, QDict *options,
                                           Error **errp)
 {
@@ -356,6 +442,18 @@ static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
         return ret;
     }
 
+    ret = blkio_get_bool(s->blkio,
+                         "needs-mem-region-fd",
+                         &s->needs_mem_region_fd,
+                         &errmsg);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret,
+                         "failed to get needs-mem-region-fd: %s", errmsg);
+        free(errmsg);
+        blkio_destroy(&s->blkio);
+        return ret;
+    }
+
     ret = blkio_get_uint64(s->blkio,
                            "mem-region-alignment",
                            &s->mem_region_alignment,
@@ -436,6 +534,8 @@ static BlockDriver bdrv_io_uring = {
     .bdrv_aio_pwritev           = blkio_aio_pwritev,
     .bdrv_aio_flush             = blkio_aio_flush,
     .bdrv_io_unplug             = blkio_io_unplug,
+    .bdrv_register_buf          = blkio_register_buf,
+    .bdrv_unregister_buf        = blkio_unregister_buf,
 
     /*
      * TODO
-- 
2.35.1
Re: [RFC 7/8] blkio: implement BDRV_REQ_REGISTERED_BUF optimization
Posted by Stefano Garzarella 3 years, 10 months ago
On Wed, Mar 23, 2022 at 11:17:26AM +0000, Stefan Hajnoczi wrote:
>Avoid bounce buffers when QEMUIOVector elements are within previously
>registered bdrv_register_buf() buffers.
>
>The idea is that emulated storage controllers will register guest RAM
>using bdrv_register_buf() and set the BDRV_REQ_REGISTERED_BUF on I/O
>requests. Therefore no blkio_add_mem_region() calls are necessary in the
>performance-critical I/O code path.
>
>This optimization doesn't apply if the I/O buffer is internally
>allocated by QEMU (e.g. qcow2 metadata). There we still take the slow
>path because BDRV_REQ_REGISTERED_BUF is not set.
>
>Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
>---
> block/blkio.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 104 insertions(+), 4 deletions(-)
>
>diff --git a/block/blkio.c b/block/blkio.c
>index dd2308b967..78f4ca5f49 100644
>--- a/block/blkio.c
>+++ b/block/blkio.c
>@@ -1,7 +1,9 @@
> #include "qemu/osdep.h"
> #include <blkio.h>
> #include "block/block_int.h"
>+#include "exec/memory.h"
> #include "qapi/error.h"
>+#include "qemu/error-report.h"
> #include "qapi/qmp/qdict.h"
> #include "qemu/module.h"
>
>@@ -26,6 +28,9 @@ typedef struct {
>     /* Can we skip adding/deleting blkio_mem_regions? */
>     bool needs_mem_regions;
>
>+    /* Are file descriptors necessary for blkio_mem_regions? */
>+    bool needs_mem_region_fd;
>+
>     /*
>      * blkio_completion_fd_poll() stashes the next completion for
>      * blkio_completion_fd_poll_ready().
>@@ -170,6 +175,8 @@ static BlockAIOCB *blkio_aio_preadv(BlockDriverState *bs, int64_t offset,
>         BlockCompletionFunc *cb, void *opaque)
> {
>     BDRVBlkioState *s = bs->opaque;
>+    bool needs_mem_regions =
>+        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
>     struct iovec *iov = qiov->iov;
>     int iovcnt = qiov->niov;
>     BlkioAIOCB *acb;
>@@ -179,7 +186,7 @@ static BlockAIOCB *blkio_aio_preadv(BlockDriverState *bs, int64_t offset,
>
>     acb = blkio_aiocb_get(bs, cb, opaque);
>
>-    if (s->needs_mem_regions) {
>+    if (needs_mem_regions) {
>         if (blkio_aiocb_init_mem_region_locked(acb, bytes) < 0) {
>             qemu_aio_unref(&acb->common);
>             return NULL;
>@@ -194,7 +201,7 @@ static BlockAIOCB *blkio_aio_preadv(BlockDriverState *bs, int64_t offset,
>
>     ret = blkioq_readv(s->blkioq, offset, iov, iovcnt, acb, 0);
>     if (ret < 0) {
>-        if (s->needs_mem_regions) {
>+        if (needs_mem_regions) {
>             blkio_free_mem_region(s->blkio, &acb->mem_region);
>             qemu_iovec_destroy(&acb->qiov);
>         }
>@@ -215,6 +222,8 @@ static BlockAIOCB *blkio_aio_pwritev(BlockDriverState *bs, int64_t offset,
> {
>     uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
>     BDRVBlkioState *s = bs->opaque;
>+    bool needs_mem_regions =
>+        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
>     struct iovec *iov = qiov->iov;
>     int iovcnt = qiov->niov;
>     BlkioAIOCB *acb;
>@@ -224,7 +233,7 @@ static BlockAIOCB *blkio_aio_pwritev(BlockDriverState *bs, int64_t offset,
>
>     acb = blkio_aiocb_get(bs, cb, opaque);
>
>-    if (s->needs_mem_regions) {
>+    if (needs_mem_regions) {
>         if (blkio_aiocb_init_mem_region_locked(acb, bytes) < 0) {
>             qemu_aio_unref(&acb->common);
>             return NULL;
>@@ -238,7 +247,7 @@ static BlockAIOCB *blkio_aio_pwritev(BlockDriverState *bs, int64_t offset,
>
>     ret = blkioq_writev(s->blkioq, offset, iov, iovcnt, acb, blkio_flags);
>     if (ret < 0) {
>-        if (s->needs_mem_regions) {
>+        if (needs_mem_regions) {
>             blkio_free_mem_region(s->blkio, &acb->mem_region);
>         }
>         qemu_aio_unref(&acb->common);
>@@ -286,6 +295,83 @@ static void blkio_io_unplug(BlockDriverState *bs)
>     }
> }
>
>+static void blkio_register_buf(BlockDriverState *bs, void *host, size_t size)
>+{
>+    BDRVBlkioState *s = bs->opaque;
>+    char *errmsg;
>+    int ret;
>+    struct blkio_mem_region region = (struct blkio_mem_region){
>+        .addr = host,
>+        .len = size,
>+        .fd = -1,
>+    };
>+
>+    if (((uintptr_t)host | size) % s->mem_region_alignment) {
>+        error_report_once("%s: skipping unaligned buf %p with size %zu",
>+                          __func__, host, size);
>+        return; /* skip unaligned */
>+    }
>+
>+    /* Attempt to find the fd for a MemoryRegion */
>+    if (s->needs_mem_region_fd) {
>+        int fd = -1;
>+        ram_addr_t offset;
>+        MemoryRegion *mr;
>+
>+        /*
>+         * bdrv_register_buf() is called with the BQL held so mr lives at least
>+         * until this function returns.
>+         */
>+        mr = memory_region_from_host(host, &offset);
>+        if (mr) {
>+            fd = memory_region_get_fd(mr);
>+        }
>+        if (fd == -1) {
>+            error_report_once("%s: skipping fd-less buf %p with size %zu",
>+                              __func__, host, size);
>+            return; /* skip if there is no fd */
>+        }
>+
>+        region.fd = fd;
>+        region.fd_offset = offset;
>+    }
>+
>+    WITH_QEMU_LOCK_GUARD(&s->lock) {
>+        ret = blkio_add_mem_region(s->blkio, &region, &errmsg);
>+    }
>+
>+    if (ret < 0) {
>+        error_report_once("Failed to add blkio mem region %p with size %zu: %s",
>+                          host, size, errmsg);
>+        free(errmsg);
>+    }
>+}
>+
>+static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
>+{
>+    BDRVBlkioState *s = bs->opaque;
>+    char *errmsg;
>+    int ret;
>+    struct blkio_mem_region region = (struct blkio_mem_region){
>+        .addr = host,
>+        .len = size,
>+    };
>+
>+    if (((uintptr_t)host | size) % s->mem_region_alignment) {
>+        return; /* skip unaligned */
>+    }
>+
>+    WITH_QEMU_LOCK_GUARD(&s->lock) {
>+        ret = blkio_del_mem_region(s->blkio, &region, &errmsg);
>+    }
>+
>+    if (ret < 0) {
>+        error_report_once("Failed to delete blkio mem region %p with size %zu: %s",
>+                          host, size, errmsg);
>+        free(errmsg);
>+    }
>+}
>+
> static void blkio_parse_filename_io_uring(const char *filename, QDict *options,
>                                           Error **errp)
> {
>@@ -356,6 +442,18 @@ static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
>         return ret;
>     }
>
>+    ret = blkio_get_bool(s->blkio,
>+                         "needs-mem-region-fd",
>+                         &s->needs_mem_region_fd,
>+                         &errmsg);
>+    if (ret < 0) {
>+        error_setg_errno(errp, -ret,
>+                         "failed to get needs-mem-region-fd: %s", errmsg);
>+        free(errmsg);
>+        blkio_destroy(&s->blkio);
>+        return ret;
>+    }
>+
>     ret = blkio_get_uint64(s->blkio,
>                            "mem-region-alignment",
>                            &s->mem_region_alignment,

I already mentioned on IRC while testing the series, but I'm writing it 
here so we don't forget ;-)

To prevent bdrv_driver_pwritev() from removing the 
BDRV_REQ_REGISTERED_BUF flag from requests, we should add this: 

@@ -474,7 +474,7 @@ static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
          return ret;
      }

-    bs->supported_write_flags = BDRV_REQ_FUA;
+    bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;

      qemu_mutex_init(&s->lock);
      s->blkioq = blkio_get_queue(s->blkio, 0);


>@@ -436,6 +534,8 @@ static BlockDriver bdrv_io_uring = {
>     .bdrv_aio_pwritev           = blkio_aio_pwritev,
>     .bdrv_aio_flush             = blkio_aio_flush,
>     .bdrv_io_unplug             = blkio_io_unplug,
>+    .bdrv_register_buf          = blkio_register_buf,
>+    .bdrv_unregister_buf        = blkio_unregister_buf,
>
>     /*
>      * TODO
>-- 
>2.35.1
>
Re: [RFC 7/8] blkio: implement BDRV_REQ_REGISTERED_BUF optimization
Posted by Stefan Hajnoczi 3 years, 10 months ago
On Tue, Mar 29, 2022 at 05:24:30PM +0200, Stefano Garzarella wrote:
> On Wed, Mar 23, 2022 at 11:17:26AM +0000, Stefan Hajnoczi wrote:
> > Avoid bounce buffers when QEMUIOVector elements are within previously
> > registered bdrv_register_buf() buffers.
> > 
> > The idea is that emulated storage controllers will register guest RAM
> > using bdrv_register_buf() and set the BDRV_REQ_REGISTERED_BUF on I/O
> > requests. Therefore no blkio_add_mem_region() calls are necessary in the
> > performance-critical I/O code path.
> > 
> > This optimization doesn't apply if the I/O buffer is internally
> > allocated by QEMU (e.g. qcow2 metadata). There we still take the slow
> > path because BDRV_REQ_REGISTERED_BUF is not set.
> > 
> > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > ---
> > block/blkio.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++--
> > 1 file changed, 104 insertions(+), 4 deletions(-)
> > 
> > diff --git a/block/blkio.c b/block/blkio.c
> > index dd2308b967..78f4ca5f49 100644
> > --- a/block/blkio.c
> > +++ b/block/blkio.c
> > @@ -1,7 +1,9 @@
> > #include "qemu/osdep.h"
> > #include <blkio.h>
> > #include "block/block_int.h"
> > +#include "exec/memory.h"
> > #include "qapi/error.h"
> > +#include "qemu/error-report.h"
> > #include "qapi/qmp/qdict.h"
> > #include "qemu/module.h"
> > 
> > @@ -26,6 +28,9 @@ typedef struct {
> >     /* Can we skip adding/deleting blkio_mem_regions? */
> >     bool needs_mem_regions;
> > 
> > +    /* Are file descriptors necessary for blkio_mem_regions? */
> > +    bool needs_mem_region_fd;
> > +
> >     /*
> >      * blkio_completion_fd_poll() stashes the next completion for
> >      * blkio_completion_fd_poll_ready().
> > @@ -170,6 +175,8 @@ static BlockAIOCB *blkio_aio_preadv(BlockDriverState *bs, int64_t offset,
> >         BlockCompletionFunc *cb, void *opaque)
> > {
> >     BDRVBlkioState *s = bs->opaque;
> > +    bool needs_mem_regions =
> > +        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
> >     struct iovec *iov = qiov->iov;
> >     int iovcnt = qiov->niov;
> >     BlkioAIOCB *acb;
> > @@ -179,7 +186,7 @@ static BlockAIOCB *blkio_aio_preadv(BlockDriverState *bs, int64_t offset,
> > 
> >     acb = blkio_aiocb_get(bs, cb, opaque);
> > 
> > -    if (s->needs_mem_regions) {
> > +    if (needs_mem_regions) {
> >         if (blkio_aiocb_init_mem_region_locked(acb, bytes) < 0) {
> >             qemu_aio_unref(&acb->common);
> >             return NULL;
> > @@ -194,7 +201,7 @@ static BlockAIOCB *blkio_aio_preadv(BlockDriverState *bs, int64_t offset,
> > 
> >     ret = blkioq_readv(s->blkioq, offset, iov, iovcnt, acb, 0);
> >     if (ret < 0) {
> > -        if (s->needs_mem_regions) {
> > +        if (needs_mem_regions) {
> >             blkio_free_mem_region(s->blkio, &acb->mem_region);
> >             qemu_iovec_destroy(&acb->qiov);
> >         }
> > @@ -215,6 +222,8 @@ static BlockAIOCB *blkio_aio_pwritev(BlockDriverState *bs, int64_t offset,
> > {
> >     uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
> >     BDRVBlkioState *s = bs->opaque;
> > +    bool needs_mem_regions =
> > +        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
> >     struct iovec *iov = qiov->iov;
> >     int iovcnt = qiov->niov;
> >     BlkioAIOCB *acb;
> > @@ -224,7 +233,7 @@ static BlockAIOCB *blkio_aio_pwritev(BlockDriverState *bs, int64_t offset,
> > 
> >     acb = blkio_aiocb_get(bs, cb, opaque);
> > 
> > -    if (s->needs_mem_regions) {
> > +    if (needs_mem_regions) {
> >         if (blkio_aiocb_init_mem_region_locked(acb, bytes) < 0) {
> >             qemu_aio_unref(&acb->common);
> >             return NULL;
> > @@ -238,7 +247,7 @@ static BlockAIOCB *blkio_aio_pwritev(BlockDriverState *bs, int64_t offset,
> > 
> >     ret = blkioq_writev(s->blkioq, offset, iov, iovcnt, acb, blkio_flags);
> >     if (ret < 0) {
> > -        if (s->needs_mem_regions) {
> > +        if (needs_mem_regions) {
> >             blkio_free_mem_region(s->blkio, &acb->mem_region);
> >         }
> >         qemu_aio_unref(&acb->common);
> > @@ -286,6 +295,83 @@ static void blkio_io_unplug(BlockDriverState *bs)
> >     }
> > }
> > 
> > +static void blkio_register_buf(BlockDriverState *bs, void *host, size_t size)
> > +{
> > +    BDRVBlkioState *s = bs->opaque;
> > +    char *errmsg;
> > +    int ret;
> > +    struct blkio_mem_region region = (struct blkio_mem_region){
> > +        .addr = host,
> > +        .len = size,
> > +        .fd = -1,
> > +    };
> > +
> > +    if (((uintptr_t)host | size) % s->mem_region_alignment) {
> > +        error_report_once("%s: skipping unaligned buf %p with size %zu",
> > +                          __func__, host, size);
> > +        return; /* skip unaligned */
> > +    }
> > +
> > +    /* Attempt to find the fd for a MemoryRegion */
> > +    if (s->needs_mem_region_fd) {
> > +        int fd = -1;
> > +        ram_addr_t offset;
> > +        MemoryRegion *mr;
> > +
> > +        /*
> > +         * bdrv_register_buf() is called with the BQL held so mr lives at least
> > +         * until this function returns.
> > +         */
> > +        mr = memory_region_from_host(host, &offset);
> > +        if (mr) {
> > +            fd = memory_region_get_fd(mr);
> > +        }
> > +        if (fd == -1) {
> > +            error_report_once("%s: skipping fd-less buf %p with size %zu",
> > +                              __func__, host, size);
> > +            return; /* skip if there is no fd */
> > +        }
> > +
> > +        region.fd = fd;
> > +        region.fd_offset = offset;
> > +    }
> > +
> > +    WITH_QEMU_LOCK_GUARD(&s->lock) {
> > +        ret = blkio_add_mem_region(s->blkio, &region, &errmsg);
> > +    }
> > +
> > +    if (ret < 0) {
> > +        error_report_once("Failed to add blkio mem region %p with size %zu: %s",
> > +                          host, size, errmsg);
> > +        free(errmsg);
> > +    }
> > +}
> > +
> > +static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
> > +{
> > +    BDRVBlkioState *s = bs->opaque;
> > +    char *errmsg;
> > +    int ret;
> > +    struct blkio_mem_region region = (struct blkio_mem_region){
> > +        .addr = host,
> > +        .len = size,
> > +    };
> > +
> > +    if (((uintptr_t)host | size) % s->mem_region_alignment) {
> > +        return; /* skip unaligned */
> > +    }
> > +
> > +    WITH_QEMU_LOCK_GUARD(&s->lock) {
> > +        ret = blkio_del_mem_region(s->blkio, &region, &errmsg);
> > +    }
> > +
> > +    if (ret < 0) {
> > +        error_report_once("Failed to delete blkio mem region %p with size %zu: %s",
> > +                          host, size, errmsg);
> > +        free(errmsg);
> > +    }
> > +}
> > +
> > static void blkio_parse_filename_io_uring(const char *filename, QDict *options,
> >                                           Error **errp)
> > {
> > @@ -356,6 +442,18 @@ static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
> >         return ret;
> >     }
> > 
> > +    ret = blkio_get_bool(s->blkio,
> > +                         "needs-mem-region-fd",
> > +                         &s->needs_mem_region_fd,
> > +                         &errmsg);
> > +    if (ret < 0) {
> > +        error_setg_errno(errp, -ret,
> > +                         "failed to get needs-mem-region-fd: %s", errmsg);
> > +        free(errmsg);
> > +        blkio_destroy(&s->blkio);
> > +        return ret;
> > +    }
> > +
> >     ret = blkio_get_uint64(s->blkio,
> >                            "mem-region-alignment",
> >                            &s->mem_region_alignment,
> 
> I already mentioned on IRC while testing the series, but I'm writing it here
> so we don't forget ;-)
> 
> To prevent bdrv_driver_pwritev() from removing the BDRV_REQ_REGISTERED_BUF
> flag from requests, we should add this:
> 
> @@ -474,7 +474,7 @@ static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
>          return ret;
>      }
> 
> -    bs->supported_write_flags = BDRV_REQ_FUA;
> +    bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
> 
>      qemu_mutex_init(&s->lock);
>      s->blkioq = blkio_get_queue(s->blkio, 0);

Thanks, will fix.

Stefan