[RFC v3 7/8] blkio: implement BDRV_REQ_REGISTERED_BUF optimization

Stefan Hajnoczi posted 8 patches 3 years, 7 months ago
Maintainers: Stefan Hajnoczi <stefanha@redhat.com>, Kevin Wolf <kwolf@redhat.com>, Hanna Reitz <hreitz@redhat.com>, Fam Zheng <fam@euphon.net>, John Snow <jsnow@redhat.com>, Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>, "Philippe Mathieu-Daudé" <f4bug@amsat.org>, "Michael S. Tsirkin" <mst@redhat.com>, Eduardo Habkost <eduardo@habkost.net>, Marcel Apfelbaum <marcel.apfelbaum@gmail.com>, Yanan Wang <wangyanan55@huawei.com>, Eric Blake <eblake@redhat.com>, Markus Armbruster <armbru@redhat.com>, Paolo Bonzini <pbonzini@redhat.com>, Thomas Huth <thuth@redhat.com>, Laurent Vivier <lvivier@redhat.com>
There is a newer version of this series
[RFC v3 7/8] blkio: implement BDRV_REQ_REGISTERED_BUF optimization
Posted by Stefan Hajnoczi 3 years, 7 months ago
Avoid bounce buffers when QEMUIOVector elements are within previously
registered bdrv_register_buf() buffers.

The idea is that emulated storage controllers will register guest RAM
using bdrv_register_buf() and set the BDRV_REQ_REGISTERED_BUF on I/O
requests. Therefore no blkio_map_mem_region() calls are necessary in the
performance-critical I/O code path.

This optimization doesn't apply if the I/O buffer is internally
allocated by QEMU (e.g. qcow2 metadata). There we still take the slow
path because BDRV_REQ_REGISTERED_BUF is not set.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/blkio.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 101 insertions(+), 3 deletions(-)

diff --git a/block/blkio.c b/block/blkio.c
index 7fbdbd7fae..37d593a20c 100644
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -1,7 +1,9 @@
 #include "qemu/osdep.h"
 #include <blkio.h>
 #include "block/block_int.h"
+#include "exec/memory.h"
 #include "qapi/error.h"
+#include "qemu/error-report.h"
 #include "qapi/qmp/qdict.h"
 #include "qemu/module.h"
 
@@ -28,6 +30,9 @@ typedef struct {
 
     /* Can we skip adding/deleting blkio_mem_regions? */
     bool needs_mem_regions;
+
+    /* Are file descriptors necessary for blkio_mem_regions? */
+    bool needs_mem_region_fd;
 } BDRVBlkioState;
 
 static void blkio_aiocb_complete(BlkioAIOCB *acb, int ret)
@@ -198,6 +203,8 @@ static BlockAIOCB *blkio_aio_preadv(BlockDriverState *bs, int64_t offset,
         BlockCompletionFunc *cb, void *opaque)
 {
     BDRVBlkioState *s = bs->opaque;
+    bool needs_mem_regions =
+        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
     struct iovec *iov = qiov->iov;
     int iovcnt = qiov->niov;
     BlkioAIOCB *acb;
@@ -206,7 +213,7 @@ static BlockAIOCB *blkio_aio_preadv(BlockDriverState *bs, int64_t offset,
 
     acb = blkio_aiocb_get(bs, cb, opaque);
 
-    if (s->needs_mem_regions) {
+    if (needs_mem_regions) {
         if (blkio_aiocb_init_mem_region_locked(acb, bytes) < 0) {
             qemu_aio_unref(&acb->common);
             return NULL;
@@ -230,6 +237,8 @@ static BlockAIOCB *blkio_aio_pwritev(BlockDriverState *bs, int64_t offset,
 {
     uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
     BDRVBlkioState *s = bs->opaque;
+    bool needs_mem_regions =
+        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
     struct iovec *iov = qiov->iov;
     int iovcnt = qiov->niov;
     BlkioAIOCB *acb;
@@ -238,7 +247,7 @@ static BlockAIOCB *blkio_aio_pwritev(BlockDriverState *bs, int64_t offset,
 
     acb = blkio_aiocb_get(bs, cb, opaque);
 
-    if (s->needs_mem_regions) {
+    if (needs_mem_regions) {
         if (blkio_aiocb_init_mem_region_locked(acb, bytes) < 0) {
             qemu_aio_unref(&acb->common);
             return NULL;
@@ -324,6 +333,80 @@ static void blkio_io_unplug(BlockDriverState *bs)
     }
 }
 
+static void blkio_register_buf(BlockDriverState *bs, void *host, size_t size)
+{
+    BDRVBlkioState *s = bs->opaque;
+    int ret;
+    struct blkio_mem_region region = (struct blkio_mem_region){
+        .addr = host,
+        .len = size,
+        .fd = -1,
+    };
+
+    if (((uintptr_t)host | size) % s->mem_region_alignment) {
+        error_report_once("%s: skipping unaligned buf %p with size %zu",
+                          __func__, host, size);
+        return; /* skip unaligned */
+    }
+
+    /* Attempt to find the fd for a MemoryRegion */
+    if (s->needs_mem_region_fd) {
+        int fd = -1;
+        ram_addr_t offset;
+        MemoryRegion *mr;
+
+        /*
+         * bdrv_register_buf() is called with the BQL held so mr lives at least
+         * until this function returns.
+         */
+        mr = memory_region_from_host(host, &offset);
+        if (mr) {
+            fd = memory_region_get_fd(mr);
+        }
+        if (fd == -1) {
+            error_report_once("%s: skipping fd-less buf %p with size %zu",
+                              __func__, host, size);
+            return; /* skip if there is no fd */
+        }
+
+        region.fd = fd;
+        region.fd_offset = offset;
+    }
+
+    WITH_QEMU_LOCK_GUARD(&s->lock) {
+        ret = blkio_map_mem_region(s->blkio, &region);
+    }
+
+    if (ret < 0) {
+        error_report_once("Failed to add blkio mem region %p with size %zu: %s",
+                          host, size, blkio_get_error_msg());
+    }
+}
+
+static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
+{
+    BDRVBlkioState *s = bs->opaque;
+    int ret;
+    struct blkio_mem_region region = (struct blkio_mem_region){
+        .addr = host,
+        .len = size,
+        .fd = -1,
+    };
+
+    if (((uintptr_t)host | size) % s->mem_region_alignment) {
+        return; /* skip unaligned */
+    }
+
+    WITH_QEMU_LOCK_GUARD(&s->lock) {
+        ret = blkio_unmap_mem_region(s->blkio, &region);
+    }
+
+    if (ret < 0) {
+        error_report_once("Failed to delete blkio mem region %p with size %zu: %s",
+                          host, size, blkio_get_error_msg());
+    }
+}
+
 static void blkio_parse_filename_io_uring(const char *filename, QDict *options,
                                           Error **errp)
 {
@@ -440,6 +523,17 @@ static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
         return ret;
     }
 
+    ret = blkio_get_bool(s->blkio,
+                         "needs-mem-region-fd",
+                         &s->needs_mem_region_fd);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret,
+                         "failed to get needs-mem-region-fd: %s",
+                         blkio_get_error_msg());
+        blkio_destroy(&s->blkio);
+        return ret;
+    }
+
     ret = blkio_get_uint64(s->blkio,
                            "mem-region-alignment",
                            &s->mem_region_alignment);
@@ -459,7 +553,7 @@ static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
         return ret;
     }
 
-    bs->supported_write_flags = BDRV_REQ_FUA;
+    bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
     bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP |
                                BDRV_REQ_NO_FALLBACK;
 
@@ -627,6 +721,8 @@ static BlockDriver bdrv_io_uring = {
     .bdrv_co_pwrite_zeroes      = blkio_co_pwrite_zeroes,
     .bdrv_io_unplug             = blkio_io_unplug,
     .bdrv_refresh_limits        = blkio_refresh_limits,
+    .bdrv_register_buf          = blkio_register_buf,
+    .bdrv_unregister_buf        = blkio_unregister_buf,
 };
 
 static BlockDriver bdrv_virtio_blk_vhost_vdpa = {
@@ -648,6 +744,8 @@ static BlockDriver bdrv_virtio_blk_vhost_vdpa = {
     .bdrv_co_pwrite_zeroes      = blkio_co_pwrite_zeroes,
     .bdrv_io_unplug             = blkio_io_unplug,
     .bdrv_refresh_limits        = blkio_refresh_limits,
+    .bdrv_register_buf          = blkio_register_buf,
+    .bdrv_unregister_buf        = blkio_unregister_buf,
 };
 
 static void bdrv_blkio_init(void)
-- 
2.36.1
Re: [RFC v3 7/8] blkio: implement BDRV_REQ_REGISTERED_BUF optimization
Posted by Hanna Reitz 3 years, 7 months ago
On 08.07.22 06:17, Stefan Hajnoczi wrote:
> Avoid bounce buffers when QEMUIOVector elements are within previously
> registered bdrv_register_buf() buffers.
>
> The idea is that emulated storage controllers will register guest RAM
> using bdrv_register_buf() and set the BDRV_REQ_REGISTERED_BUF on I/O
> requests. Therefore no blkio_map_mem_region() calls are necessary in the
> performance-critical I/O code path.
>
> This optimization doesn't apply if the I/O buffer is internally
> allocated by QEMU (e.g. qcow2 metadata). There we still take the slow
> path because BDRV_REQ_REGISTERED_BUF is not set.

Which keeps the question relevant of how slow the slow path is, i.e. 
whether it wouldn’t make sense to keep some of the mem regions allocated 
there in a cache instead of allocating/freeing them on every I/O request.

> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>   block/blkio.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++--
>   1 file changed, 101 insertions(+), 3 deletions(-)
>
> diff --git a/block/blkio.c b/block/blkio.c
> index 7fbdbd7fae..37d593a20c 100644
> --- a/block/blkio.c
> +++ b/block/blkio.c

[...]

> @@ -198,6 +203,8 @@ static BlockAIOCB *blkio_aio_preadv(BlockDriverState *bs, int64_t offset,
>           BlockCompletionFunc *cb, void *opaque)
>   {
>       BDRVBlkioState *s = bs->opaque;
> +    bool needs_mem_regions =
> +        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);

Is that condition sufficient?  bdrv_register_buf() has no way of 
returning an error, so it’s possible that buffers are silently not 
registered.  (And there are conditions in blkio_register_buf() where the 
buffer will not be registered, e.g. because it isn’t aligned.)

The caller knows nothing of this and will still pass 
BDRV_REQ_REGISTERED_BUF, and then we’ll assume the region is mapped but 
it won’t be.

>       struct iovec *iov = qiov->iov;
>       int iovcnt = qiov->niov;
>       BlkioAIOCB *acb;

[...]

> @@ -324,6 +333,80 @@ static void blkio_io_unplug(BlockDriverState *bs)
>       }
>   }
>   
> +static void blkio_register_buf(BlockDriverState *bs, void *host, size_t size)
> +{
> +    BDRVBlkioState *s = bs->opaque;
> +    int ret;
> +    struct blkio_mem_region region = (struct blkio_mem_region){
> +        .addr = host,
> +        .len = size,
> +        .fd = -1,
> +    };
> +
> +    if (((uintptr_t)host | size) % s->mem_region_alignment) {
> +        error_report_once("%s: skipping unaligned buf %p with size %zu",
> +                          __func__, host, size);
> +        return; /* skip unaligned */
> +    }

How big is mem-region-alignment generally?  Is it like 4k or is it going 
to be a real issue?

(Also, we could probably register a truncated region.  I know, that’ll 
break the BDRV_REQ_REGISTERED_BUF idea because the caller won’t know 
we’ve truncated it, but that’s no different than just not registering 
the buffer at all.)

> +
> +    /* Attempt to find the fd for a MemoryRegion */
> +    if (s->needs_mem_region_fd) {
> +        int fd = -1;
> +        ram_addr_t offset;
> +        MemoryRegion *mr;
> +
> +        /*
> +         * bdrv_register_buf() is called with the BQL held so mr lives at least
> +         * until this function returns.
> +         */
> +        mr = memory_region_from_host(host, &offset);
> +        if (mr) {
> +            fd = memory_region_get_fd(mr);
> +        }

I don’t think it’s specified that buffers registered with 
bdrv_register_buf() must be within a single memory region, is it? So can 
we somehow verify that the memory region covers the whole buffer?

> +        if (fd == -1) {
> +            error_report_once("%s: skipping fd-less buf %p with size %zu",
> +                              __func__, host, size);
> +            return; /* skip if there is no fd */
> +        }
> +
> +        region.fd = fd;
> +        region.fd_offset = offset;
> +    }
> +
> +    WITH_QEMU_LOCK_GUARD(&s->lock) {
> +        ret = blkio_map_mem_region(s->blkio, &region);
> +    }
> +
> +    if (ret < 0) {
> +        error_report_once("Failed to add blkio mem region %p with size %zu: %s",
> +                          host, size, blkio_get_error_msg());
> +    }
> +}
> +
> +static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
> +{
> +    BDRVBlkioState *s = bs->opaque;
> +    int ret;
> +    struct blkio_mem_region region = (struct blkio_mem_region){
> +        .addr = host,
> +        .len = size,
> +        .fd = -1,
> +    };
> +
> +    if (((uintptr_t)host | size) % s->mem_region_alignment) {
> +        return; /* skip unaligned */
> +    }
> +
> +    WITH_QEMU_LOCK_GUARD(&s->lock) {
> +        ret = blkio_unmap_mem_region(s->blkio, &region);
> +    }

The documentation of libblkio says that “memory regions must be 
unmapped/freed with exactly the same `region` field values that they 
were mapped/allocated with.”  We don’t set .fd here, though.

It’s also unclear whether it’s allowed to unmap a region that wasn’t 
mapped, but I’ll trust libblkio to detect that.

> +
> +    if (ret < 0) {
> +        error_report_once("Failed to delete blkio mem region %p with size %zu: %s",
> +                          host, size, blkio_get_error_msg());
> +    }
> +}
> +
>   static void blkio_parse_filename_io_uring(const char *filename, QDict *options,
>                                             Error **errp)
>   {

[...]

> @@ -459,7 +553,7 @@ static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
>           return ret;
>       }
>   
> -    bs->supported_write_flags = BDRV_REQ_FUA;
> +    bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;

Shouldn’t we also report it as a supported read flag then?

Hanna

>       bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP |
>                                  BDRV_REQ_NO_FALLBACK;


Re: [RFC v3 7/8] blkio: implement BDRV_REQ_REGISTERED_BUF optimization
Posted by Stefan Hajnoczi 3 years, 5 months ago
On Thu, Jul 14, 2022 at 12:13:53PM +0200, Hanna Reitz wrote:
> On 08.07.22 06:17, Stefan Hajnoczi wrote:
> > Avoid bounce buffers when QEMUIOVector elements are within previously
> > registered bdrv_register_buf() buffers.
> > 
> > The idea is that emulated storage controllers will register guest RAM
> > using bdrv_register_buf() and set the BDRV_REQ_REGISTERED_BUF on I/O
> > requests. Therefore no blkio_map_mem_region() calls are necessary in the
> > performance-critical I/O code path.
> > 
> > This optimization doesn't apply if the I/O buffer is internally
> > allocated by QEMU (e.g. qcow2 metadata). There we still take the slow
> > path because BDRV_REQ_REGISTERED_BUF is not set.
> 
> Which keeps the question relevant of how slow the slow path is, i.e. whether
> it wouldn’t make sense to keep some of the mem regions allocated there in a
> cache instead of allocating/freeing them on every I/O request.

Yes, bounce buffer reuse would be possible, but let's keep it simple for
now.

> > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > ---
> >   block/blkio.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++--
> >   1 file changed, 101 insertions(+), 3 deletions(-)
> > 
> > diff --git a/block/blkio.c b/block/blkio.c
> > index 7fbdbd7fae..37d593a20c 100644
> > --- a/block/blkio.c
> > +++ b/block/blkio.c
> 
> [...]
> 
> > @@ -198,6 +203,8 @@ static BlockAIOCB *blkio_aio_preadv(BlockDriverState *bs, int64_t offset,
> >           BlockCompletionFunc *cb, void *opaque)
> >   {
> >       BDRVBlkioState *s = bs->opaque;
> > +    bool needs_mem_regions =
> > +        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
> 
> Is that condition sufficient?  bdrv_register_buf() has no way of returning
> an error, so it’s possible that buffers are silently not registered.  (And
> there are conditions in blkio_register_buf() where the buffer will not be
> registered, e.g. because it isn’t aligned.)
> 
> The caller knows nothing of this and will still pass
> BDRV_REQ_REGISTERED_BUF, and then we’ll assume the region is mapped but it
> won’t be.
> 
> >       struct iovec *iov = qiov->iov;
> >       int iovcnt = qiov->niov;
> >       BlkioAIOCB *acb;
> 
> [...]
> 
> > @@ -324,6 +333,80 @@ static void blkio_io_unplug(BlockDriverState *bs)
> >       }
> >   }
> > +static void blkio_register_buf(BlockDriverState *bs, void *host, size_t size)
> > +{
> > +    BDRVBlkioState *s = bs->opaque;
> > +    int ret;
> > +    struct blkio_mem_region region = (struct blkio_mem_region){
> > +        .addr = host,
> > +        .len = size,
> > +        .fd = -1,
> > +    };
> > +
> > +    if (((uintptr_t)host | size) % s->mem_region_alignment) {
> > +        error_report_once("%s: skipping unaligned buf %p with size %zu",
> > +                          __func__, host, size);
> > +        return; /* skip unaligned */
> > +    }
> 
> How big is mem-region-alignment generally?  Is it like 4k or is it going to
> be a real issue?

Yes, it's usually the page size of the MMU/IOMMU. vhost-user and VFIO
have the same requirements so I don't think anything special is
necessary.

> (Also, we could probably register a truncated region.  I know, that’ll break
> the BDRV_REQ_REGISTERED_BUF idea because the caller won’t know we’ve
> truncated it, but that’s no different than just not registering the buffer
> at all.)
> 
> > +
> > +    /* Attempt to find the fd for a MemoryRegion */
> > +    if (s->needs_mem_region_fd) {
> > +        int fd = -1;
> > +        ram_addr_t offset;
> > +        MemoryRegion *mr;
> > +
> > +        /*
> > +         * bdrv_register_buf() is called with the BQL held so mr lives at least
> > +         * until this function returns.
> > +         */
> > +        mr = memory_region_from_host(host, &offset);
> > +        if (mr) {
> > +            fd = memory_region_get_fd(mr);
> > +        }
> 
> I don’t think it’s specified that buffers registered with
> bdrv_register_buf() must be within a single memory region, is it? So can we
> somehow verify that the memory region covers the whole buffer?

You are right, there is no guarantee. However, the range will always be
within a RAMBlock at the moment because the bdrv_register_buf() calls
are driven by a RAMBlock notifier and match the boundaries of the
RAMBlocks.

I will add a check so this starts failing when that assumption is
violated.

> 
> > +        if (fd == -1) {
> > +            error_report_once("%s: skipping fd-less buf %p with size %zu",
> > +                              __func__, host, size);
> > +            return; /* skip if there is no fd */
> > +        }
> > +
> > +        region.fd = fd;
> > +        region.fd_offset = offset;
> > +    }
> > +
> > +    WITH_QEMU_LOCK_GUARD(&s->lock) {
> > +        ret = blkio_map_mem_region(s->blkio, &region);
> > +    }
> > +
> > +    if (ret < 0) {
> > +        error_report_once("Failed to add blkio mem region %p with size %zu: %s",
> > +                          host, size, blkio_get_error_msg());
> > +    }
> > +}
> > +
> > +static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
> > +{
> > +    BDRVBlkioState *s = bs->opaque;
> > +    int ret;
> > +    struct blkio_mem_region region = (struct blkio_mem_region){
> > +        .addr = host,
> > +        .len = size,
> > +        .fd = -1,
> > +    };
> > +
> > +    if (((uintptr_t)host | size) % s->mem_region_alignment) {
> > +        return; /* skip unaligned */
> > +    }
> > +
> > +    WITH_QEMU_LOCK_GUARD(&s->lock) {
> > +        ret = blkio_unmap_mem_region(s->blkio, &region);
> > +    }
> 
> The documentation of libblkio says that “memory regions must be
> unmapped/freed with exactly the same `region` field values that they were
> mapped/allocated with.”  We don’t set .fd here, though.

That's a bug. The memory region will not be unmapped because libblkio's
HashSet won't match. I'll fix the QEMU code to pass the exact same
struct blkio_mem_region fields.

> 
> It’s also unclear whether it’s allowed to unmap a region that wasn’t mapped,
> but I’ll trust libblkio to detect that.

Yes, it's a nop.

> 
> > +
> > +    if (ret < 0) {
> > +        error_report_once("Failed to delete blkio mem region %p with size %zu: %s",
> > +                          host, size, blkio_get_error_msg());
> > +    }
> > +}
> > +
> >   static void blkio_parse_filename_io_uring(const char *filename, QDict *options,
> >                                             Error **errp)
> >   {
> 
> [...]
> 
> > @@ -459,7 +553,7 @@ static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
> >           return ret;
> >       }
> > -    bs->supported_write_flags = BDRV_REQ_FUA;
> > +    bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
> 
> Shouldn’t we also report it as a supported read flag then?

Yes, thank you!

Stefan
Re: [RFC v3 7/8] blkio: implement BDRV_REQ_REGISTERED_BUF optimization
Posted by Stefano Garzarella 3 years, 7 months ago
On Fri, Jul 08, 2022 at 05:17:36AM +0100, Stefan Hajnoczi wrote:
>Avoid bounce buffers when QEMUIOVector elements are within previously
>registered bdrv_register_buf() buffers.
>
>The idea is that emulated storage controllers will register guest RAM
>using bdrv_register_buf() and set the BDRV_REQ_REGISTERED_BUF on I/O
>requests. Therefore no blkio_map_mem_region() calls are necessary in the
>performance-critical I/O code path.
>
>This optimization doesn't apply if the I/O buffer is internally
>allocated by QEMU (e.g. qcow2 metadata). There we still take the slow
>path because BDRV_REQ_REGISTERED_BUF is not set.
>
>Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
>---
> block/blkio.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 101 insertions(+), 3 deletions(-)
>
>diff --git a/block/blkio.c b/block/blkio.c
>index 7fbdbd7fae..37d593a20c 100644
>--- a/block/blkio.c
>+++ b/block/blkio.c
>@@ -1,7 +1,9 @@
> #include "qemu/osdep.h"
> #include <blkio.h>
> #include "block/block_int.h"
>+#include "exec/memory.h"
> #include "qapi/error.h"
>+#include "qemu/error-report.h"
> #include "qapi/qmp/qdict.h"
> #include "qemu/module.h"
>
>@@ -28,6 +30,9 @@ typedef struct {
>
>     /* Can we skip adding/deleting blkio_mem_regions? */
>     bool needs_mem_regions;
>+
>+    /* Are file descriptors necessary for blkio_mem_regions? */
>+    bool needs_mem_region_fd;
> } BDRVBlkioState;
>
> static void blkio_aiocb_complete(BlkioAIOCB *acb, int ret)
>@@ -198,6 +203,8 @@ static BlockAIOCB *blkio_aio_preadv(BlockDriverState *bs, int64_t offset,
>         BlockCompletionFunc *cb, void *opaque)
> {
>     BDRVBlkioState *s = bs->opaque;
>+    bool needs_mem_regions =
>+        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
>     struct iovec *iov = qiov->iov;
>     int iovcnt = qiov->niov;
>     BlkioAIOCB *acb;
>@@ -206,7 +213,7 @@ static BlockAIOCB *blkio_aio_preadv(BlockDriverState *bs, int64_t offset,
>
>     acb = blkio_aiocb_get(bs, cb, opaque);
>
>-    if (s->needs_mem_regions) {
>+    if (needs_mem_regions) {
>         if (blkio_aiocb_init_mem_region_locked(acb, bytes) < 0) {
>             qemu_aio_unref(&acb->common);
>             return NULL;
>@@ -230,6 +237,8 @@ static BlockAIOCB *blkio_aio_pwritev(BlockDriverState *bs, int64_t offset,
> {
>     uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
>     BDRVBlkioState *s = bs->opaque;
>+    bool needs_mem_regions =
>+        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
>     struct iovec *iov = qiov->iov;
>     int iovcnt = qiov->niov;
>     BlkioAIOCB *acb;
>@@ -238,7 +247,7 @@ static BlockAIOCB *blkio_aio_pwritev(BlockDriverState *bs, int64_t offset,
>
>     acb = blkio_aiocb_get(bs, cb, opaque);
>
>-    if (s->needs_mem_regions) {
>+    if (needs_mem_regions) {
>         if (blkio_aiocb_init_mem_region_locked(acb, bytes) < 0) {
>             qemu_aio_unref(&acb->common);
>             return NULL;
>@@ -324,6 +333,80 @@ static void blkio_io_unplug(BlockDriverState *bs)
>     }
> }
>
>+static void blkio_register_buf(BlockDriverState *bs, void *host, size_t size)
>+{
>+    BDRVBlkioState *s = bs->opaque;
>+    int ret;
>+    struct blkio_mem_region region = (struct blkio_mem_region){
>+        .addr = host,
>+        .len = size,
>+        .fd = -1,
>+    };
>+
>+    if (((uintptr_t)host | size) % s->mem_region_alignment) {
>+        error_report_once("%s: skipping unaligned buf %p with size %zu",
>+                          __func__, host, size);
>+        return; /* skip unaligned */
>+    }
>+
>+    /* Attempt to find the fd for a MemoryRegion */
>+    if (s->needs_mem_region_fd) {
>+        int fd = -1;
>+        ram_addr_t offset;
>+        MemoryRegion *mr;
>+
>+        /*
>+         * bdrv_register_buf() is called with the BQL held so mr lives at least
>+         * until this function returns.
>+         */
>+        mr = memory_region_from_host(host, &offset);
>+        if (mr) {
>+            fd = memory_region_get_fd(mr);

If s->needs_mem_region_fd is true, memory_region_get_fd() crashes I 
think because mr->ram_block is not yet set, indeed from the stack trace 
blkio_register_buf() is called inside qemu_ram_alloc_resizeable(), and 
its result is used to set mr->ram_block in 
memory_region_init_resizeable_ram():

Program terminated with signal SIGSEGV, Segmentation fault.
#0  0x000056235bf1f7a3 in memory_region_get_fd (mr=<optimized out>) at ../softmmu/memory.c:2309
#1  0x000056235c07e54d in blkio_register_buf (bs=<optimized out>, host=0x7f824e200000, size=2097152)
     at ../block/blkio.c:364
#2  0x000056235c0246c6 in bdrv_register_buf (bs=0x56235d606b40, host=0x7f824e200000, size=2097152)
     at ../block/io.c:3362
#3  0x000056235bea44e6 in ram_block_notify_add (host=0x7f824e200000, size=131072, max_size=2097152)
     at ../hw/core/numa.c:863
#4  0x000056235bf22c00 in ram_block_add (new_block=<optimized out>, errp=<optimized out>)
     at ../softmmu/physmem.c:2057
#5  0x000056235bf232e4 in qemu_ram_alloc_internal (size=size@entry=131072, 
     max_size=max_size@entry=2097152, resized=resized@entry=0x56235bc0f920 <fw_cfg_resized>, 
     host=host@entry=0x0, ram_flags=ram_flags@entry=4, mr=mr@entry=0x56235dc3fe00, 
     errp=0x7ffcb21f1be0) at ../softmmu/physmem.c:2180
#6  0x000056235bf26426 in qemu_ram_alloc_resizeable (size=size@entry=131072, 
     maxsz=maxsz@entry=2097152, resized=resized@entry=0x56235bc0f920 <fw_cfg_resized>, 
     mr=mr@entry=0x56235dc3fe00, errp=errp@entry=0x7ffcb21f1be0) at ../softmmu/physmem.c:2209
#7  0x000056235bf1cc99 in memory_region_init_resizeable_ram (mr=0x56235dc3fe00, 
     owner=owner@entry=0x56235d93ffc0, name=name@entry=0x7ffcb21f1ca0 "/rom@etc/acpi/tables", 
     size=131072, max_size=2097152, resized=resized@entry=0x56235bc0f920 <fw_cfg_resized>, 
     errp=0x56235c996490 <error_fatal>) at ../softmmu/memory.c:1586
#8  0x000056235bc0f99c in rom_set_mr (rom=rom@entry=0x56235ddd0200, owner=0x56235d93ffc0, 
     name=name@entry=0x7ffcb21f1ca0 "/rom@etc/acpi/tables", ro=ro@entry=true)
     at ../hw/core/loader.c:961
#9  0x000056235bc12a65 in rom_add_blob (name=name@entry=0x56235c1a2a09 "etc/acpi/tables", 
     blob=0x56235df4f4b0, len=<optimized out>, max_len=max_len@entry=2097152, 
     addr=addr@entry=18446744073709551615, 
     fw_file_name=fw_file_name@entry=0x56235c1a2a09 "etc/acpi/tables", 
     fw_callback=0x56235be47f90 <acpi_build_update>, callback_opaque=0x56235d817830, as=0x0, 
     read_only=true) at ../hw/core/loader.c:1102
#10 0x000056235bbe0990 in acpi_add_rom_blob (
     update=update@entry=0x56235be47f90 <acpi_build_update>, opaque=opaque@entry=0x56235d817830, 
     blob=0x56235d3ab750, name=name@entry=0x56235c1a2a09 "etc/acpi/tables") at ../hw/acpi/utils.c:46
#11 0x000056235be481e6 in acpi_setup () at ../hw/i386/acpi-build.c:2805
#12 0x000056235be3e209 in pc_machine_done (notifier=0x56235d5efce8, data=<optimized out>)
     at ../hw/i386/pc.c:758
#13 0x000056235c12e4a7 in notifier_list_notify (
     list=list@entry=0x56235c963790 <machine_init_done_notifiers>, data=data@entry=0x0)
     at ../util/notify.c:39

Thanks,
Stefano
Re: [RFC v3 7/8] blkio: implement BDRV_REQ_REGISTERED_BUF optimization
Posted by Stefan Hajnoczi 3 years, 5 months ago
On Tue, Jul 12, 2022 at 04:28:02PM +0200, Stefano Garzarella wrote:
> On Fri, Jul 08, 2022 at 05:17:36AM +0100, Stefan Hajnoczi wrote:
> > Avoid bounce buffers when QEMUIOVector elements are within previously
> > registered bdrv_register_buf() buffers.
> > 
> > The idea is that emulated storage controllers will register guest RAM
> > using bdrv_register_buf() and set the BDRV_REQ_REGISTERED_BUF on I/O
> > requests. Therefore no blkio_map_mem_region() calls are necessary in the
> > performance-critical I/O code path.
> > 
> > This optimization doesn't apply if the I/O buffer is internally
> > allocated by QEMU (e.g. qcow2 metadata). There we still take the slow
> > path because BDRV_REQ_REGISTERED_BUF is not set.
> > 
> > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > ---
> > block/blkio.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++--
> > 1 file changed, 101 insertions(+), 3 deletions(-)
> > 
> > diff --git a/block/blkio.c b/block/blkio.c
> > index 7fbdbd7fae..37d593a20c 100644
> > --- a/block/blkio.c
> > +++ b/block/blkio.c
> > @@ -1,7 +1,9 @@
> > #include "qemu/osdep.h"
> > #include <blkio.h>
> > #include "block/block_int.h"
> > +#include "exec/memory.h"
> > #include "qapi/error.h"
> > +#include "qemu/error-report.h"
> > #include "qapi/qmp/qdict.h"
> > #include "qemu/module.h"
> > 
> > @@ -28,6 +30,9 @@ typedef struct {
> > 
> >     /* Can we skip adding/deleting blkio_mem_regions? */
> >     bool needs_mem_regions;
> > +
> > +    /* Are file descriptors necessary for blkio_mem_regions? */
> > +    bool needs_mem_region_fd;
> > } BDRVBlkioState;
> > 
> > static void blkio_aiocb_complete(BlkioAIOCB *acb, int ret)
> > @@ -198,6 +203,8 @@ static BlockAIOCB *blkio_aio_preadv(BlockDriverState *bs, int64_t offset,
> >         BlockCompletionFunc *cb, void *opaque)
> > {
> >     BDRVBlkioState *s = bs->opaque;
> > +    bool needs_mem_regions =
> > +        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
> >     struct iovec *iov = qiov->iov;
> >     int iovcnt = qiov->niov;
> >     BlkioAIOCB *acb;
> > @@ -206,7 +213,7 @@ static BlockAIOCB *blkio_aio_preadv(BlockDriverState *bs, int64_t offset,
> > 
> >     acb = blkio_aiocb_get(bs, cb, opaque);
> > 
> > -    if (s->needs_mem_regions) {
> > +    if (needs_mem_regions) {
> >         if (blkio_aiocb_init_mem_region_locked(acb, bytes) < 0) {
> >             qemu_aio_unref(&acb->common);
> >             return NULL;
> > @@ -230,6 +237,8 @@ static BlockAIOCB *blkio_aio_pwritev(BlockDriverState *bs, int64_t offset,
> > {
> >     uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
> >     BDRVBlkioState *s = bs->opaque;
> > +    bool needs_mem_regions =
> > +        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
> >     struct iovec *iov = qiov->iov;
> >     int iovcnt = qiov->niov;
> >     BlkioAIOCB *acb;
> > @@ -238,7 +247,7 @@ static BlockAIOCB *blkio_aio_pwritev(BlockDriverState *bs, int64_t offset,
> > 
> >     acb = blkio_aiocb_get(bs, cb, opaque);
> > 
> > -    if (s->needs_mem_regions) {
> > +    if (needs_mem_regions) {
> >         if (blkio_aiocb_init_mem_region_locked(acb, bytes) < 0) {
> >             qemu_aio_unref(&acb->common);
> >             return NULL;
> > @@ -324,6 +333,80 @@ static void blkio_io_unplug(BlockDriverState *bs)
> >     }
> > }
> > 
> > +static void blkio_register_buf(BlockDriverState *bs, void *host, size_t size)
> > +{
> > +    BDRVBlkioState *s = bs->opaque;
> > +    int ret;
> > +    struct blkio_mem_region region = (struct blkio_mem_region){
> > +        .addr = host,
> > +        .len = size,
> > +        .fd = -1,
> > +    };
> > +
> > +    if (((uintptr_t)host | size) % s->mem_region_alignment) {
> > +        error_report_once("%s: skipping unaligned buf %p with size %zu",
> > +                          __func__, host, size);
> > +        return; /* skip unaligned */
> > +    }
> > +
> > +    /* Attempt to find the fd for a MemoryRegion */
> > +    if (s->needs_mem_region_fd) {
> > +        int fd = -1;
> > +        ram_addr_t offset;
> > +        MemoryRegion *mr;
> > +
> > +        /*
> > +         * bdrv_register_buf() is called with the BQL held so mr lives at least
> > +         * until this function returns.
> > +         */
> > +        mr = memory_region_from_host(host, &offset);
> > +        if (mr) {
> > +            fd = memory_region_get_fd(mr);
> 
> If s->needs_mem_region_fd is true, memory_region_get_fd() crashes I think
> because mr->ram_block is not yet set, indeed from the stack trace
> blkio_register_buf() is called inside qemu_ram_alloc_resizeable(), and its
> result is used to set mr->ram_block in memory_region_init_resizeable_ram():
> 
> Program terminated with signal SIGSEGV, Segmentation fault.
> #0  0x000056235bf1f7a3 in memory_region_get_fd (mr=<optimized out>) at ../softmmu/memory.c:2309
> #1  0x000056235c07e54d in blkio_register_buf (bs=<optimized out>, host=0x7f824e200000, size=2097152)
>     at ../block/blkio.c:364
> #2  0x000056235c0246c6 in bdrv_register_buf (bs=0x56235d606b40, host=0x7f824e200000, size=2097152)
>     at ../block/io.c:3362
> #3  0x000056235bea44e6 in ram_block_notify_add (host=0x7f824e200000, size=131072, max_size=2097152)
>     at ../hw/core/numa.c:863
> #4  0x000056235bf22c00 in ram_block_add (new_block=<optimized out>, errp=<optimized out>)
>     at ../softmmu/physmem.c:2057
> #5  0x000056235bf232e4 in qemu_ram_alloc_internal (size=size@entry=131072,
> max_size=max_size@entry=2097152, resized=resized@entry=0x56235bc0f920
> <fw_cfg_resized>,     host=host@entry=0x0, ram_flags=ram_flags@entry=4,
> mr=mr@entry=0x56235dc3fe00,     errp=0x7ffcb21f1be0) at
> ../softmmu/physmem.c:2180
> #6  0x000056235bf26426 in qemu_ram_alloc_resizeable (size=size@entry=131072,
> maxsz=maxsz@entry=2097152, resized=resized@entry=0x56235bc0f920
> <fw_cfg_resized>,     mr=mr@entry=0x56235dc3fe00,
> errp=errp@entry=0x7ffcb21f1be0) at ../softmmu/physmem.c:2209
> #7  0x000056235bf1cc99 in memory_region_init_resizeable_ram
> (mr=0x56235dc3fe00,     owner=owner@entry=0x56235d93ffc0,
> name=name@entry=0x7ffcb21f1ca0 "/rom@etc/acpi/tables",     size=131072,
> max_size=2097152, resized=resized@entry=0x56235bc0f920 <fw_cfg_resized>,
> errp=0x56235c996490 <error_fatal>) at ../softmmu/memory.c:1586
> #8  0x000056235bc0f99c in rom_set_mr (rom=rom@entry=0x56235ddd0200,
> owner=0x56235d93ffc0,     name=name@entry=0x7ffcb21f1ca0
> "/rom@etc/acpi/tables", ro=ro@entry=true)
>     at ../hw/core/loader.c:961
> #9  0x000056235bc12a65 in rom_add_blob (name=name@entry=0x56235c1a2a09
> "etc/acpi/tables",     blob=0x56235df4f4b0, len=<optimized out>,
> max_len=max_len@entry=2097152,     addr=addr@entry=18446744073709551615,
> fw_file_name=fw_file_name@entry=0x56235c1a2a09 "etc/acpi/tables",
> fw_callback=0x56235be47f90 <acpi_build_update>,
> callback_opaque=0x56235d817830, as=0x0,     read_only=true) at
> ../hw/core/loader.c:1102
> #10 0x000056235bbe0990 in acpi_add_rom_blob (
>     update=update@entry=0x56235be47f90 <acpi_build_update>,
> opaque=opaque@entry=0x56235d817830,     blob=0x56235d3ab750,
> name=name@entry=0x56235c1a2a09 "etc/acpi/tables") at ../hw/acpi/utils.c:46
> #11 0x000056235be481e6 in acpi_setup () at ../hw/i386/acpi-build.c:2805
> #12 0x000056235be3e209 in pc_machine_done (notifier=0x56235d5efce8, data=<optimized out>)
>     at ../hw/i386/pc.c:758
> #13 0x000056235c12e4a7 in notifier_list_notify (
>     list=list@entry=0x56235c963790 <machine_init_done_notifiers>, data=data@entry=0x0)
>     at ../util/notify.c:39

Hi Stefano,
I have fixed this by using RAMBlock instead of MemoryRegion. The next
revision will call qemu_ram_block_from_host() to fetch the RAMBlock's
fd.

Stefan