Series comparison

-[PULL for-6.1 0/3] Block patches
+[PULL 0/8] Block patches
-The following changes since commit 801f3db7564dcce8a37a70833c0abe40ec19f8ce:
+The following changes since commit c6a5fc2ac76c5ab709896ee1b0edd33685a67ed1:
-  Merge remote-tracking branch 'remotes/philmd/tags/kconfig-20210720' into staging (2021-07-20 19:30:28 +0100)
+  decodetree: Add --output-null for meson testing (2023-05-31 19:56:42 -0700)
 are available in the Git repository at:
   https://gitlab.com/stefanha/qemu.git tags/block-pull-request
-for you to fetch changes up to d7ddd0a1618a75b31dc308bb37365ce1da972154:
+for you to fetch changes up to 98b126f5e3228a346c774e569e26689943b401dd:
-  linux-aio: limit the batch size using `aio-max-batch` parameter (2021-07-21 13:47:50 +0100)
+  qapi: add '@fdset' feature for BlockdevOptionsVirtioBlkVhostVdpa (2023-06-01 11:08:21 -0400)
 ----------------------------------------------------------------
 Pull request
-Stefano's performance regression fix for commit 2558cb8dd4 ("linux-aio:
+- Stefano Garzarella's blkio block driver 'fd' parameter
-increasing MAX_EVENTS to a larger hardcoded value").
+- My thread-local blk_io_plug() series
 ----------------------------------------------------------------
-Stefano Garzarella (3):
+Stefan Hajnoczi (6):
-  iothread: generalize iothread_set_param/iothread_get_param
+  block: add blk_io_plug_call() API
-  iothread: add aio-max-batch parameter
+  block/nvme: convert to blk_io_plug_call() API
-  linux-aio: limit the batch size using `aio-max-batch` parameter
+  block/blkio: convert to blk_io_plug_call() API
   block/io_uring: convert to blk_io_plug_call() API
   block/linux-aio: convert to blk_io_plug_call() API
   block: remove bdrv_co_io_plug() API
- qapi/misc.json            |  6 ++-
+Stefano Garzarella (2):
- qapi/qom.json             |  7 +++-
+  block/blkio: use qemu_open() to support fd passing for virtio-blk
- include/block/aio.h       | 12 ++++++
+  qapi: add '@fdset' feature for BlockdevOptionsVirtioBlkVhostVdpa
- include/sysemu/iothread.h |  3 ++
- block/linux-aio.c         |  9 ++++-
+ MAINTAINERS                       |   1 +
- iothread.c                | 82 ++++++++++++++++++++++++++++++++++-----
+ qapi/block-core.json              |   6 ++
- monitor/hmp-cmds.c        |  2 +
+ meson.build                       |   4 +
- util/aio-posix.c          | 12 ++++++
+ include/block/block-io.h          |   3 -
- util/aio-win32.c          |  5 +++
+ include/block/block_int-common.h  |  11 ---
- util/async.c              |  2 +
+ include/block/raw-aio.h           |  14 ---
- qemu-options.hx           |  8 +++-
+ include/sysemu/block-backend-io.h |  13 +--
-files changed, 134 insertions(+), 14 deletions(-)
+ block/blkio.c                     |  96 ++++++++++++------
  block/block-backend.c             |  22 -----
  block/file-posix.c                |  38 -------
  block/io.c                        |  37 -------
  block/io_uring.c                  |  44 ++++-----
  block/linux-aio.c                 |  41 +++-----
  block/nvme.c                      |  44 +++------
  block/plug.c                      | 159 ++++++++++++++++++++++++++++++
  hw/block/dataplane/xen-block.c    |   8 +-
  hw/block/virtio-blk.c             |   4 +-
  hw/scsi/virtio-scsi.c             |   6 +-
  block/meson.build                 |   1 +
  block/trace-events                |   6 +-
 files changed, 293 insertions(+), 265 deletions(-)
  create mode 100644 block/plug.c
 --
-.31.1
+.40.1

-New patch
+[PULL 1/8] block: add blk_io_plug_call() API
+Introduce a new API for thread-local blk_io_plug() that does not
 traverse the block graph. The goal is to make blk_io_plug() multi-queue
 friendly.
 Instead of having block drivers track whether or not we're in a plugged
 section, provide an API that allows them to defer a function call until
 we're unplugged: blk_io_plug_call(fn, opaque). If blk_io_plug_call() is
 called multiple times with the same fn/opaque pair, then fn() is only
 called once at the end of the plugged section - resulting in batching.
 This patch introduces the API and changes blk_io_plug()/blk_io_unplug().
 blk_io_plug()/blk_io_unplug() no longer require a BlockBackend argument
 because the plug state is now thread-local.
 Later patches convert block drivers to blk_io_plug_call() and then we
 can finally remove .bdrv_co_io_plug() once all block drivers have been
 converted.
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
 Acked-by: Kevin Wolf <kwolf@redhat.com>
 Message-id: 20230530180959.1108766-2-stefanha@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  MAINTAINERS                       |   1 +
  include/sysemu/block-backend-io.h |  13 +--
  block/block-backend.c             |  22 -----
  block/plug.c                      | 159 ++++++++++++++++++++++++++++++
  hw/block/dataplane/xen-block.c    |   8 +-
  hw/block/virtio-blk.c             |   4 +-
  hw/scsi/virtio-scsi.c             |   6 +-
  block/meson.build                 |   1 +
 files changed, 173 insertions(+), 41 deletions(-)
  create mode 100644 block/plug.c
 diff --git a/MAINTAINERS b/MAINTAINERS
 index XXXXXXX..XXXXXXX 100644
 --- a/MAINTAINERS
 +++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ F: util/aio-*.c
  F: util/aio-*.h
  F: util/fdmon-*.c
  F: block/io.c
 +F: block/plug.c
  F: migration/block*
  F: include/block/aio.h
  F: include/block/aio-wait.h
 diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/sysemu/block-backend-io.h
 +++ b/include/sysemu/block-backend-io.h
@@ -XXX,XX +XXX,XX @@ void blk_iostatus_set_err(BlockBackend *blk, int error);
  int blk_get_max_iov(BlockBackend *blk);
  int blk_get_max_hw_iov(BlockBackend *blk);
 -/*
 - * blk_io_plug/unplug are thread-local operations. This means that multiple
 - * IOThreads can simultaneously call plug/unplug, but the caller must ensure
 - * that each unplug() is called in the same IOThread of the matching plug().
 - */
 -void coroutine_fn blk_co_io_plug(BlockBackend *blk);
 -void co_wrapper blk_io_plug(BlockBackend *blk);
 -
 -void coroutine_fn blk_co_io_unplug(BlockBackend *blk);
 -void co_wrapper blk_io_unplug(BlockBackend *blk);
 +void blk_io_plug(void);
 +void blk_io_unplug(void);
 +void blk_io_plug_call(void (*fn)(void *), void *opaque);
  AioContext *blk_get_aio_context(BlockBackend *blk);
  BlockAcctStats *blk_get_stats(BlockBackend *blk);
 diff --git a/block/block-backend.c b/block/block-backend.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/block-backend.c
 +++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
      notifier_list_add(&blk->insert_bs_notifiers, notify);
  }
 -void coroutine_fn blk_co_io_plug(BlockBackend *blk)
 -{
 -    BlockDriverState *bs = blk_bs(blk);
 -    IO_CODE();
 -    GRAPH_RDLOCK_GUARD();
 -
 -    if (bs) {
 -        bdrv_co_io_plug(bs);
 -    }
 -}
 -
 -void coroutine_fn blk_co_io_unplug(BlockBackend *blk)
 -{
 -    BlockDriverState *bs = blk_bs(blk);
 -    IO_CODE();
 -    GRAPH_RDLOCK_GUARD();
 -
 -    if (bs) {
 -        bdrv_co_io_unplug(bs);
 -    }
 -}
 -
  BlockAcctStats *blk_get_stats(BlockBackend *blk)
  {
      IO_CODE();
 diff --git a/block/plug.c b/block/plug.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/block/plug.c
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: GPL-2.0-or-later */
 +/*
 + * Block I/O plugging
 + *
 + * Copyright Red Hat.
 + *
 + * This API defers a function call within a blk_io_plug()/blk_io_unplug()
 + * section, allowing multiple calls to batch up. This is a performance
 + * optimization that is used in the block layer to submit several I/O requests
 + * at once instead of individually:
 + *
 + *   blk_io_plug(); <-- start of plugged region
 + *   ...
 + *   blk_io_plug_call(my_func, my_obj); <-- deferred my_func(my_obj) call
 + *   blk_io_plug_call(my_func, my_obj); <-- another
 + *   blk_io_plug_call(my_func, my_obj); <-- another
 + *   ...
 + *   blk_io_unplug(); <-- end of plugged region, my_func(my_obj) is called once
 + *
 + * This code is actually generic and not tied to the block layer. If another
 + * subsystem needs this functionality, it could be renamed.
 + */
 +
 +#include "qemu/osdep.h"
 +#include "qemu/coroutine-tls.h"
 +#include "qemu/notify.h"
 +#include "qemu/thread.h"
 +#include "sysemu/block-backend.h"
 +
 +/* A function call that has been deferred until unplug() */
 +typedef struct {
 +    void (*fn)(void *);
 +    void *opaque;
 +} UnplugFn;
 +
 +/* Per-thread state */
 +typedef struct {
 +    unsigned count;       /* plug() nesting depth in this thread */
 +    GArray *unplug_fns;   /* deferred UnplugFn calls, lazily allocated */
 +} Plug;
 +
 +/* Use get_ptr_plug() to fetch this thread-local value */
 +QEMU_DEFINE_STATIC_CO_TLS(Plug, plug);
 +
 +/* Called at thread cleanup time to free this thread's deferred-call array */
 +static void blk_io_plug_atexit(Notifier *n, void *value)
 +{
 +    Plug *plug = get_ptr_plug();
 +    g_array_free(plug->unplug_fns, TRUE);
 +}
 +
 +/* This won't involve coroutines, so use __thread */
 +static __thread Notifier blk_io_plug_atexit_notifier;
 +
 +/**
 + * blk_io_plug_call:
 + * @fn: a function pointer to be invoked
 + * @opaque: a user-defined argument to @fn()
 + *
 + * Call @fn(@opaque) immediately if not within a blk_io_plug()/blk_io_unplug()
 + * section.
 + *
 + * Otherwise defer the call until the end of the outermost
 + * blk_io_plug()/blk_io_unplug() section in this thread. If the same
 + * @fn/@opaque pair has already been deferred, it will only be called once upon
 + * blk_io_unplug() so that accumulated calls are batched into a single call.
 + *
 + * The caller must ensure that @opaque is not freed before @fn() is invoked.
 + */
 +void blk_io_plug_call(void (*fn)(void *), void *opaque)
 +{
 +    Plug *plug = get_ptr_plug();
 +
 +    /* Call immediately if we're not plugged */
 +    if (plug->count == 0) {
 +        fn(opaque);
 +        return;
 +    }
 +
 +    GArray *array = plug->unplug_fns;
 +    if (!array) {
 +        array = g_array_new(FALSE, FALSE, sizeof(UnplugFn));
 +        plug->unplug_fns = array;
 +        blk_io_plug_atexit_notifier.notify = blk_io_plug_atexit;
 +        qemu_thread_atexit_add(&blk_io_plug_atexit_notifier);
 +    }
 +
 +    UnplugFn *fns = (UnplugFn *)array->data;
 +    UnplugFn new_fn = {
 +        .fn = fn,
 +        .opaque = opaque,
 +    };
 +
 +    /*
 +     * Deduplicate on the exact fn/opaque pair. There won't be many entries, so
 +     * do a linear search. If this becomes a bottleneck then a binary search
 +     * (glib 2.62+) or different data structure could be used.
 +     */
 +    for (guint i = 0; i < array->len; i++) {
 +        if (memcmp(&fns[i], &new_fn, sizeof(new_fn)) == 0) {
 +            return; /* already exists */
 +        }
 +    }
 +
 +    g_array_append_val(array, new_fn);
 +}
 +
 +/**
 + * blk_io_plug: Defer blk_io_plug_call() functions until blk_io_unplug()
 + *
 + * blk_io_plug/unplug are thread-local operations. This means that multiple
 + * threads can simultaneously call plug/unplug, but the caller must ensure that
 + * each unplug() is called in the same thread of the matching plug().
 + *
 + * Nesting is supported. blk_io_plug_call() functions are only called at the
 + * outermost blk_io_unplug().
 + */
 +void blk_io_plug(void)
 +{
 +    Plug *plug = get_ptr_plug();
 +
 +    assert(plug->count < UINT_MAX);
 +
 +    plug->count++;
 +}
 +
 +/**
 + * blk_io_unplug: Run pending blk_io_plug_call() functions if outermost
 + *
 + * There must have been a matching blk_io_plug() call in the same thread prior
 + * to this blk_io_unplug() call. Nested calls only decrement the plug count.
 + */
 +void blk_io_unplug(void)
 +{
 +    Plug *plug = get_ptr_plug();
 +
 +    assert(plug->count > 0);
 +
 +    if (--plug->count > 0) {
 +        return;
 +    }
 +
 +    GArray *array = plug->unplug_fns;
 +    if (!array) {
 +        return;
 +    }
 +
 +    UnplugFn *fns = (UnplugFn *)array->data;
 +
 +    for (guint i = 0; i < array->len; i++) {
 +        fns[i].fn(fns[i].opaque);
 +    }
 +
 +    /*
 +     * This resets the array without freeing memory so that appending is cheap
 +     * in the future.
 +     */
 +    g_array_set_size(array, 0);
 +}
 diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/block/dataplane/xen-block.c
 +++ b/hw/block/dataplane/xen-block.c
@@ -XXX,XX +XXX,XX @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)
       * is below us.
       */
      if (inflight_atstart > IO_PLUG_THRESHOLD) {
 -        blk_io_plug(dataplane->blk);
 +        blk_io_plug();
      }
      while (rc != rp) {
          /* pull request from ring */
@@ -XXX,XX +XXX,XX @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)
          if (inflight_atstart > IO_PLUG_THRESHOLD &&
              batched >= inflight_atstart) {
 -            blk_io_unplug(dataplane->blk);
 +            blk_io_unplug();
          }
          xen_block_do_aio(request);
          if (inflight_atstart > IO_PLUG_THRESHOLD) {
              if (batched >= inflight_atstart) {
 -                blk_io_plug(dataplane->blk);
 +                blk_io_plug();
                  batched = 0;
              } else {
                  batched++;
@@ -XXX,XX +XXX,XX @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)
          }
      }
      if (inflight_atstart > IO_PLUG_THRESHOLD) {
 -        blk_io_unplug(dataplane->blk);
 +        blk_io_unplug();
      }
      return done_something;
 diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/block/virtio-blk.c
 +++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
      bool suppress_notifications = virtio_queue_get_notification(vq);
      aio_context_acquire(blk_get_aio_context(s->blk));
 -    blk_io_plug(s->blk);
 +    blk_io_plug();
      do {
          if (suppress_notifications) {
@@ -XXX,XX +XXX,XX @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
          virtio_blk_submit_multireq(s, &mrb);
      }
 -    blk_io_unplug(s->blk);
 +    blk_io_unplug();
      aio_context_release(blk_get_aio_context(s->blk));
  }
 diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/scsi/virtio-scsi.c
 +++ b/hw/scsi/virtio-scsi.c
@@ -XXX,XX +XXX,XX @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req)
          return -ENOBUFS;
      }
      scsi_req_ref(req->sreq);
 -    blk_io_plug(d->conf.blk);
 +    blk_io_plug();
      object_unref(OBJECT(d));
      return 0;
  }
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_cmd_req_submit(VirtIOSCSI *s, VirtIOSCSIReq *req)
      if (scsi_req_enqueue(sreq)) {
          scsi_req_continue(sreq);
      }
 -    blk_io_unplug(sreq->dev->conf.blk);
 +    blk_io_unplug();
      scsi_req_unref(sreq);
  }
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
                  while (!QTAILQ_EMPTY(&reqs)) {
                      req = QTAILQ_FIRST(&reqs);
                      QTAILQ_REMOVE(&reqs, req, next);
 -                    blk_io_unplug(req->sreq->dev->conf.blk);
 +                    blk_io_unplug();
                      scsi_req_unref(req->sreq);
                      virtqueue_detach_element(req->vq, &req->elem, 0);
                      virtio_scsi_free_req(req);
 diff --git a/block/meson.build b/block/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/block/meson.build
 +++ b/block/meson.build
@@ -XXX,XX +XXX,XX @@ block_ss.add(files(
    'mirror.c',
    'nbd.c',
    'null.c',
 +  'plug.c',
    'qapi.c',
    'qcow2-bitmap.c',
    'qcow2-cache.c',
 --
 .40.1

-New patch
+[PULL 2/8] block/nvme: convert to blk_io_plug_call() API
+Stop using the .bdrv_co_io_plug() API because it is not multi-queue
+block layer friendly. Use the new blk_io_plug_call() API to batch I/O
+submission instead.
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Reviewed-by: Eric Blake <eblake@redhat.com>
+Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
+Acked-by: Kevin Wolf <kwolf@redhat.com>
+Message-id: 20230530180959.1108766-3-stefanha@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ block/nvme.c       | 44 ++++++++++++--------------------------------
+ block/trace-events |  1 -
+files changed, 12 insertions(+), 33 deletions(-)
+diff --git a/block/nvme.c b/block/nvme.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nvme.c
++++ b/block/nvme.c
+@@ -XXX,XX +XXX,XX @@
+ #include "qemu/vfio-helpers.h"
+ #include "block/block-io.h"
+ #include "block/block_int.h"
++#include "sysemu/block-backend.h"
+ #include "sysemu/replay.h"
+ #include "trace.h"
+@@ -XXX,XX +XXX,XX @@ struct BDRVNVMeState {
+     int blkshift;
+     uint64_t max_transfer;
+-    bool plugged;
+     bool supports_write_zeroes;
+     bool supports_discard;
+@@ -XXX,XX +XXX,XX @@ static void nvme_kick(NVMeQueuePair *q)
+ {
+     BDRVNVMeState *s = q->s;
+-    if (s->plugged || !q->need_kick) {
++    if (!q->need_kick) {
+         return;
+     }
+     trace_nvme_kick(s, q->index);
+@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
+     NvmeCqe *c;
+     trace_nvme_process_completion(s, q->index, q->inflight);
+-    if (s->plugged) {
+-        trace_nvme_process_completion_queue_plugged(s, q->index);
+-        return false;
+-    }
+     /*
+      * Support re-entrancy when a request cb() function invokes aio_poll().
+@@ -XXX,XX +XXX,XX @@ static void nvme_trace_command(const NvmeCmd *cmd)
+     }
+ }
++static void nvme_unplug_fn(void *opaque)
++{
++    NVMeQueuePair *q = opaque;
++
++    QEMU_LOCK_GUARD(&q->lock);
++    nvme_kick(q);
++    nvme_process_completion(q);
++}
++
+ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
+                                 NvmeCmd *cmd, BlockCompletionFunc cb,
+                                 void *opaque)
+@@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
+            q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
+     q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
+     q->need_kick++;
+-    nvme_kick(q);
+-    nvme_process_completion(q);
++    blk_io_plug_call(nvme_unplug_fn, q);
+     qemu_mutex_unlock(&q->lock);
+ }
+@@ -XXX,XX +XXX,XX @@ static void nvme_attach_aio_context(BlockDriverState *bs,
+     }
+ }
+-static void coroutine_fn nvme_co_io_plug(BlockDriverState *bs)
+-{
+-    BDRVNVMeState *s = bs->opaque;
+-    assert(!s->plugged);
+-    s->plugged = true;
+-}
+-
+-static void coroutine_fn nvme_co_io_unplug(BlockDriverState *bs)
+-{
+-    BDRVNVMeState *s = bs->opaque;
+-    assert(s->plugged);
+-    s->plugged = false;
+-    for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
+-        NVMeQueuePair *q = s->queues[i];
+-        qemu_mutex_lock(&q->lock);
+-        nvme_kick(q);
+-        nvme_process_completion(q);
+-        qemu_mutex_unlock(&q->lock);
+-    }
+-}
+-
+ static bool nvme_register_buf(BlockDriverState *bs, void *host, size_t size,
+                               Error **errp)
+ {
+@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_nvme = {
+     .bdrv_detach_aio_context  = nvme_detach_aio_context,
+     .bdrv_attach_aio_context  = nvme_attach_aio_context,
+-    .bdrv_co_io_plug          = nvme_co_io_plug,
+-    .bdrv_co_io_unplug        = nvme_co_io_unplug,
+-
+     .bdrv_register_buf        = nvme_register_buf,
+     .bdrv_unregister_buf      = nvme_unregister_buf,
+ };
+diff --git a/block/trace-events b/block/trace-events
+index XXXXXXX..XXXXXXX 100644
+--- a/block/trace-events
++++ b/block/trace-events
+@@ -XXX,XX +XXX,XX @@ nvme_kick(void *s, unsigned q_index) "s %p q #%u"
+ nvme_dma_flush_queue_wait(void *s) "s %p"
+ nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
+ nvme_process_completion(void *s, unsigned q_index, int inflight) "s %p q #%u inflight %d"
+-nvme_process_completion_queue_plugged(void *s, unsigned q_index) "s %p q #%u"
+ nvme_complete_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
+ nvme_submit_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
+ nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
+--
+.40.1

-New patch
+[PULL 3/8] block/blkio: convert to blk_io_plug_call() API
+Stop using the .bdrv_co_io_plug() API because it is not multi-queue
+block layer friendly. Use the new blk_io_plug_call() API to batch I/O
+submission instead.
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Reviewed-by: Eric Blake <eblake@redhat.com>
+Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
+Acked-by: Kevin Wolf <kwolf@redhat.com>
+Message-id: 20230530180959.1108766-4-stefanha@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ block/blkio.c | 43 ++++++++++++++++++++++++-------------------
+file changed, 24 insertions(+), 19 deletions(-)
+diff --git a/block/blkio.c b/block/blkio.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/blkio.c
++++ b/block/blkio.c
+@@ -XXX,XX +XXX,XX @@
+ #include "qemu/error-report.h"
+ #include "qapi/qmp/qdict.h"
+ #include "qemu/module.h"
++#include "sysemu/block-backend.h"
+ #include "exec/memory.h" /* for ram_block_discard_disable() */
+ #include "block/block-io.h"
+@@ -XXX,XX +XXX,XX @@ static void blkio_detach_aio_context(BlockDriverState *bs)
+                        NULL, NULL, NULL);
+ }
+-/* Call with s->blkio_lock held to submit I/O after enqueuing a new request */
+-static void blkio_submit_io(BlockDriverState *bs)
++/*
++ * Called by blk_io_unplug() or immediately if not plugged. Called without
++ * blkio_lock.
++ */
++static void blkio_unplug_fn(void *opaque)
+ {
+-    if (qatomic_read(&bs->io_plugged) == 0) {
+-        BDRVBlkioState *s = bs->opaque;
++    BDRVBlkioState *s = opaque;
++    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
+         blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
+     }
+ }
++/*
++ * Schedule I/O submission after enqueuing a new request. Called without
++ * blkio_lock.
++ */
++static void blkio_submit_io(BlockDriverState *bs)
++{
++    BDRVBlkioState *s = bs->opaque;
++
++    blk_io_plug_call(blkio_unplug_fn, s);
++}
++
+ static int coroutine_fn
+ blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
+ {
+@@ -XXX,XX +XXX,XX @@ blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
+     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
+         blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
+-        blkio_submit_io(bs);
+     }
++    blkio_submit_io(bs);
+     qemu_coroutine_yield();
+     return cod.ret;
+ }
+@@ -XXX,XX +XXX,XX @@ blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
+     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
+         blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
+-        blkio_submit_io(bs);
+     }
++    blkio_submit_io(bs);
+     qemu_coroutine_yield();
+     if (use_bounce_buffer) {
+@@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
+     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
+         blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
+-        blkio_submit_io(bs);
+     }
++    blkio_submit_io(bs);
+     qemu_coroutine_yield();
+     if (use_bounce_buffer) {
+@@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
+     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
+         blkioq_flush(s->blkioq, &cod, 0);
+-        blkio_submit_io(bs);
+     }
++    blkio_submit_io(bs);
+     qemu_coroutine_yield();
+     return cod.ret;
+ }
+@@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
+     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
+         blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
+-        blkio_submit_io(bs);
+     }
++    blkio_submit_io(bs);
+     qemu_coroutine_yield();
+     return cod.ret;
+ }
+-static void coroutine_fn blkio_co_io_unplug(BlockDriverState *bs)
+-{
+-    BDRVBlkioState *s = bs->opaque;
+-
+-    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
+-        blkio_submit_io(bs);
+-    }
+-}
+-
+ typedef enum {
+     BMRR_OK,
+     BMRR_SKIP,
+@@ -XXX,XX +XXX,XX @@ static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
+         .bdrv_co_pwritev         = blkio_co_pwritev, \
+         .bdrv_co_flush_to_disk   = blkio_co_flush, \
+         .bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
+-        .bdrv_co_io_unplug       = blkio_co_io_unplug, \
+         .bdrv_refresh_limits     = blkio_refresh_limits, \
+         .bdrv_register_buf       = blkio_register_buf, \
+         .bdrv_unregister_buf     = blkio_unregister_buf, \
+--
+.40.1

-New patch
+[PULL 4/8] block/io_uring: convert to blk_io_plug_call() API
+Stop using the .bdrv_co_io_plug() API because it is not multi-queue
+block layer friendly. Use the new blk_io_plug_call() API to batch I/O
+submission instead.
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Reviewed-by: Eric Blake <eblake@redhat.com>
+Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
+Acked-by: Kevin Wolf <kwolf@redhat.com>
+Message-id: 20230530180959.1108766-5-stefanha@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ include/block/raw-aio.h |  7 -------
+ block/file-posix.c      | 10 ----------
+ block/io_uring.c        | 44 ++++++++++++++++-------------------------
+ block/trace-events      |  5 ++---
+files changed, 19 insertions(+), 47 deletions(-)
+diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/block/raw-aio.h
++++ b/include/block/raw-aio.h
+@@ -XXX,XX +XXX,XX @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
+                                   QEMUIOVector *qiov, int type);
+ void luring_detach_aio_context(LuringState *s, AioContext *old_context);
+ void luring_attach_aio_context(LuringState *s, AioContext *new_context);
+-
+-/*
+- * luring_io_plug/unplug work in the thread's current AioContext, therefore the
+- * caller must ensure that they are paired in the same IOThread.
+- */
+-void luring_io_plug(void);
+-void luring_io_unplug(void);
+ #endif
+ #ifdef _WIN32
+diff --git a/block/file-posix.c b/block/file-posix.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/file-posix.c
++++ b/block/file-posix.c
+@@ -XXX,XX +XXX,XX @@ static void coroutine_fn raw_co_io_plug(BlockDriverState *bs)
+         laio_io_plug();
+     }
+ #endif
+-#ifdef CONFIG_LINUX_IO_URING
+-    if (s->use_linux_io_uring) {
+-        luring_io_plug();
+-    }
+-#endif
+ }
+ static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
+@@ -XXX,XX +XXX,XX @@ static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
+         laio_io_unplug(s->aio_max_batch);
+     }
+ #endif
+-#ifdef CONFIG_LINUX_IO_URING
+-    if (s->use_linux_io_uring) {
+-        luring_io_unplug();
+-    }
+-#endif
+ }
+ static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
+diff --git a/block/io_uring.c b/block/io_uring.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/io_uring.c
++++ b/block/io_uring.c
+@@ -XXX,XX +XXX,XX @@
+ #include "block/raw-aio.h"
+ #include "qemu/coroutine.h"
+ #include "qapi/error.h"
++#include "sysemu/block-backend.h"
+ #include "trace.h"
+ /* Only used for assertions.  */
+@@ -XXX,XX +XXX,XX @@ typedef struct LuringAIOCB {
+ } LuringAIOCB;
+ typedef struct LuringQueue {
+-    int plugged;
+     unsigned int in_queue;
+     unsigned int in_flight;
+     bool blocked;
+@@ -XXX,XX +XXX,XX @@ static void luring_process_completions_and_submit(LuringState *s)
+ {
+     luring_process_completions(s);
+-    if (!s->io_q.plugged && s->io_q.in_queue > 0) {
++    if (s->io_q.in_queue > 0) {
+         ioq_submit(s);
+     }
+ }
+@@ -XXX,XX +XXX,XX @@ static void qemu_luring_poll_ready(void *opaque)
+ static void ioq_init(LuringQueue *io_q)
+ {
+     QSIMPLEQ_INIT(&io_q->submit_queue);
+-    io_q->plugged = 0;
+     io_q->in_queue = 0;
+     io_q->in_flight = 0;
+     io_q->blocked = false;
+ }
+-void luring_io_plug(void)
++static void luring_unplug_fn(void *opaque)
+ {
+-    AioContext *ctx = qemu_get_current_aio_context();
+-    LuringState *s = aio_get_linux_io_uring(ctx);
+-    trace_luring_io_plug(s);
+-    s->io_q.plugged++;
+-}
+-
+-void luring_io_unplug(void)
+-{
+-    AioContext *ctx = qemu_get_current_aio_context();
+-    LuringState *s = aio_get_linux_io_uring(ctx);
+-    assert(s->io_q.plugged);
+-    trace_luring_io_unplug(s, s->io_q.blocked, s->io_q.plugged,
+-                           s->io_q.in_queue, s->io_q.in_flight);
+-    if (--s->io_q.plugged == 0 &&
+-        !s->io_q.blocked && s->io_q.in_queue > 0) {
++    LuringState *s = opaque;
++    trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue,
++                           s->io_q.in_flight);
++    if (!s->io_q.blocked && s->io_q.in_queue > 0) {
+         ioq_submit(s);
+     }
+ }
+@@ -XXX,XX +XXX,XX @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
+     QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
+     s->io_q.in_queue++;
+-    trace_luring_do_submit(s, s->io_q.blocked, s->io_q.plugged,
+-                           s->io_q.in_queue, s->io_q.in_flight);
+-    if (!s->io_q.blocked &&
+-        (!s->io_q.plugged ||
+-         s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES)) {
+-        ret = ioq_submit(s);
+-        trace_luring_do_submit_done(s, ret);
+-        return ret;
++    trace_luring_do_submit(s, s->io_q.blocked, s->io_q.in_queue,
++                           s->io_q.in_flight);
++    if (!s->io_q.blocked) {
++        if (s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES) {
++            ret = ioq_submit(s);
++            trace_luring_do_submit_done(s, ret);
++            return ret;
++        }
++
++        blk_io_plug_call(luring_unplug_fn, s);
+     }
+     return 0;
+ }
+diff --git a/block/trace-events b/block/trace-events
+index XXXXXXX..XXXXXXX 100644
+--- a/block/trace-events
++++ b/block/trace-events
+@@ -XXX,XX +XXX,XX @@ file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "
+ # io_uring.c
+ luring_init_state(void *s, size_t size) "s %p size %zu"
+ luring_cleanup_state(void *s) "%p freed"
+-luring_io_plug(void *s) "LuringState %p plug"
+-luring_io_unplug(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blocked %d plugged %d queued %d inflight %d"
+-luring_do_submit(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blocked %d plugged %d queued %d inflight %d"
++luring_unplug_fn(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
++luring_do_submit(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
+ luring_do_submit_done(void *s, int ret) "LuringState %p submitted to kernel %d"
+ luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, size_t nbytes, int type) "bs %p s %p luringcb %p fd %d offset %" PRId64 " nbytes %zd type %d"
+ luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d"
+--
+.40.1

-[PULL for-6.1 3/3] linux-aio: limit the batch size using `aio-max-batch` parameter
+[PULL 5/8] block/linux-aio: convert to blk_io_plug_call() API
-From: Stefano Garzarella <sgarzare@redhat.com>
+Stop using the .bdrv_co_io_plug() API because it is not multi-queue
+block layer friendly. Use the new blk_io_plug_call() API to batch I/O
-When there are multiple queues attached to the same AIO context,
+submission instead.
-some requests may experience high latency, since in the worst case
-the AIO engine queue is only flushed when it is full (MAX_EVENTS) or
+Note that a dev_max_batch check is dropped in laio_io_unplug() because
-there are no more queues plugged.
+the semantics of unplug_fn() are different from .bdrv_co_io_unplug():
+1. unplug_fn() is only called when the last blk_io_unplug() call occurs,
-Commit 2558cb8dd4 ("linux-aio: increasing MAX_EVENTS to a larger
+   not every time blk_io_unplug() is called.
-hardcoded value") changed MAX_EVENTS from 128 to 1024, to increase
+2. unplug_fn() is per-thread, not per-BlockDriverState, so there is no
-the number of in-flight requests. But this change also increased
+   way to get per-BlockDriverState fields like dev_max_batch.
-the potential maximum batch to 1024 elements.
+Therefore this condition cannot be moved to laio_unplug_fn(). It is not
-When there is a single queue attached to the AIO context, the issue
+obvious that this condition affects performance in practice, so I am
-is mitigated from laio_io_unplug() that will flush the queue every
+removing it instead of trying to come up with a more complex mechanism
-time is invoked since there can't be others queue plugged.
+to preserve the condition.
-Let's use the new `aio-max-batch` IOThread parameter to mitigate
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-this issue, limiting the number of requests in a batch.
+Reviewed-by: Eric Blake <eblake@redhat.com>
+Acked-by: Kevin Wolf <kwolf@redhat.com>
-We also define a default value (32): this value is obtained running
+Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
-some benchmarks and it represents a good tradeoff between the latency
+Message-id: 20230530180959.1108766-6-stefanha@redhat.com
 increase while a request is queued and the cost of the io_submit(2)
 system call.
 Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
 Message-id: 20210721094211.69853-4-sgarzare@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- block/linux-aio.c | 9 ++++++++-
+ include/block/raw-aio.h |  7 -------
-file changed, 8 insertions(+), 1 deletion(-)
+ block/file-posix.c      | 28 ----------------------------
+ block/linux-aio.c       | 41 +++++++++++------------------------------
 files changed, 11 insertions(+), 65 deletions(-)
 diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/block/raw-aio.h
 +++ b/include/block/raw-aio.h
@@ -XXX,XX +XXX,XX @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
  void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
  void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
 -
 -/*
 - * laio_io_plug/unplug work in the thread's current AioContext, therefore the
 - * caller must ensure that they are paired in the same IOThread.
 - */
 -void laio_io_plug(void);
 -void laio_io_unplug(uint64_t dev_max_batch);
  #endif
  /* io_uring.c - Linux io_uring implementation */
  #ifdef CONFIG_LINUX_IO_URING
 diff --git a/block/file-posix.c b/block/file-posix.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/file-posix.c
 +++ b/block/file-posix.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
      return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
  }
 -static void coroutine_fn raw_co_io_plug(BlockDriverState *bs)
 -{
 -    BDRVRawState __attribute__((unused)) *s = bs->opaque;
 -#ifdef CONFIG_LINUX_AIO
 -    if (s->use_linux_aio) {
 -        laio_io_plug();
 -    }
 -#endif
 -}
 -
 -static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
 -{
 -    BDRVRawState __attribute__((unused)) *s = bs->opaque;
 -#ifdef CONFIG_LINUX_AIO
 -    if (s->use_linux_aio) {
 -        laio_io_unplug(s->aio_max_batch);
 -    }
 -#endif
 -}
 -
  static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
  {
      BDRVRawState *s = bs->opaque;
@@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_file = {
      .bdrv_co_copy_range_from = raw_co_copy_range_from,
      .bdrv_co_copy_range_to  = raw_co_copy_range_to,
      .bdrv_refresh_limits = raw_refresh_limits,
 -    .bdrv_co_io_plug        = raw_co_io_plug,
 -    .bdrv_co_io_unplug      = raw_co_io_unplug,
      .bdrv_attach_aio_context = raw_aio_attach_aio_context,
      .bdrv_co_truncate                   = raw_co_truncate,
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_device = {
      .bdrv_co_copy_range_from = raw_co_copy_range_from,
      .bdrv_co_copy_range_to  = raw_co_copy_range_to,
      .bdrv_refresh_limits = raw_refresh_limits,
 -    .bdrv_co_io_plug        = raw_co_io_plug,
 -    .bdrv_co_io_unplug      = raw_co_io_unplug,
      .bdrv_attach_aio_context = raw_aio_attach_aio_context,
      .bdrv_co_truncate                   = raw_co_truncate,
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_cdrom = {
      .bdrv_co_pwritev        = raw_co_pwritev,
      .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
      .bdrv_refresh_limits    = cdrom_refresh_limits,
 -    .bdrv_co_io_plug        = raw_co_io_plug,
 -    .bdrv_co_io_unplug      = raw_co_io_unplug,
      .bdrv_attach_aio_context = raw_aio_attach_aio_context,
      .bdrv_co_truncate                   = raw_co_truncate,
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_cdrom = {
      .bdrv_co_pwritev        = raw_co_pwritev,
      .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
      .bdrv_refresh_limits    = cdrom_refresh_limits,
 -    .bdrv_co_io_plug        = raw_co_io_plug,
 -    .bdrv_co_io_unplug      = raw_co_io_unplug,
      .bdrv_attach_aio_context = raw_aio_attach_aio_context,
      .bdrv_co_truncate                   = raw_co_truncate,
 diff --git a/block/linux-aio.c b/block/linux-aio.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/linux-aio.c
 +++ b/block/linux-aio.c
 @@ -XXX,XX +XXX,XX @@
-  */
+ #include "qemu/event_notifier.h"
- #define MAX_EVENTS 1024
+ #include "qemu/coroutine.h"
+ #include "qapi/error.h"
-+/* Maximum number of requests in a batch. (default value) */
++#include "sysemu/block-backend.h"
-+#define DEFAULT_MAX_BATCH 32
-+
+ /* Only used for assertions.  */
- struct qemu_laiocb {
+ #include "qemu/coroutine_int.h"
-     Coroutine *co;
+@@ -XXX,XX +XXX,XX @@ struct qemu_laiocb {
-     LinuxAioState *ctx;
+ };
-@@ -XXX,XX +XXX,XX @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
-     LinuxAioState *s = laiocb->ctx;
+ typedef struct {
-     struct iocb *iocbs = &laiocb->iocb;
+-    int plugged;
-     QEMUIOVector *qiov = laiocb->qiov;
+     unsigned int in_queue;
-+    int64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH;
+     unsigned int in_flight;
-+
+     bool blocked;
-+    /* limit the batch with the number of available events */
+@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
-+    max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch);
+ {
+     qemu_laio_process_completions(s);
-     switch (type) {
-     case QEMU_AIO_WRITE:
+-    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
-@@ -XXX,XX +XXX,XX @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
++    if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
      s->io_q.in_queue++;
      if (!s->io_q.blocked &&
          (!s->io_q.plugged ||
 -         s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) {
 +         s->io_q.in_queue >= max_batch)) {
          ioq_submit(s);
      }
+ }
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_poll_ready(EventNotifier *opaque)
  static void ioq_init(LaioQueue *io_q)
  {
      QSIMPLEQ_INIT(&io_q->pending);
 -    io_q->plugged = 0;
      io_q->in_queue = 0;
      io_q->in_flight = 0;
      io_q->blocked = false;
@@ -XXX,XX +XXX,XX @@ static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
      return max_batch;
  }
 -void laio_io_plug(void)
 +static void laio_unplug_fn(void *opaque)
  {
 -    AioContext *ctx = qemu_get_current_aio_context();
 -    LinuxAioState *s = aio_get_linux_aio(ctx);
 +    LinuxAioState *s = opaque;
 -    s->io_q.plugged++;
 -}
 -
 -void laio_io_unplug(uint64_t dev_max_batch)
 -{
 -    AioContext *ctx = qemu_get_current_aio_context();
 -    LinuxAioState *s = aio_get_linux_aio(ctx);
 -
 -    assert(s->io_q.plugged);
 -    s->io_q.plugged--;
 -
 -    /*
 -     * Why max batch checking is performed here:
 -     * Another BDS may have queued requests with a higher dev_max_batch and
 -     * therefore in_queue could now exceed our dev_max_batch. Re-check the max
 -     * batch so we can honor our device's dev_max_batch.
 -     */
 -    if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||
 -        (!s->io_q.plugged &&
 -         !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) {
 +    if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
          ioq_submit(s);
      }
  }
@@ -XXX,XX +XXX,XX @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
      QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
      s->io_q.in_queue++;
 -    if (!s->io_q.blocked &&
 -        (!s->io_q.plugged ||
 -         s->io_q.in_queue >= laio_max_batch(s, dev_max_batch))) {
 -        ioq_submit(s);
 +    if (!s->io_q.blocked) {
 +        if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) {
 +            ioq_submit(s);
 +        } else {
 +            blk_io_plug_call(laio_unplug_fn, s);
 +        }
      }
      return 0;
 --
-.31.1
+.40.1

-New patch
+[PULL 6/8] block: remove bdrv_co_io_plug() API
+No block driver implements .bdrv_co_io_plug() anymore. Get rid of the
+function pointers.
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Reviewed-by: Eric Blake <eblake@redhat.com>
+Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
+Acked-by: Kevin Wolf <kwolf@redhat.com>
+Message-id: 20230530180959.1108766-7-stefanha@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ include/block/block-io.h         |  3 ---
+ include/block/block_int-common.h | 11 ----------
+ block/io.c                       | 37 --------------------------------
+files changed, 51 deletions(-)
+diff --git a/include/block/block-io.h b/include/block/block-io.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/block/block-io.h
++++ b/include/block/block-io.h
+@@ -XXX,XX +XXX,XX @@ void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx);
+ AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c);
+-void coroutine_fn GRAPH_RDLOCK bdrv_co_io_plug(BlockDriverState *bs);
+-void coroutine_fn GRAPH_RDLOCK bdrv_co_io_unplug(BlockDriverState *bs);
+-
+ bool coroutine_fn GRAPH_RDLOCK
+ bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
+                                    uint32_t granularity, Error **errp);
+diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/block/block_int-common.h
++++ b/include/block/block_int-common.h
+@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
+     void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_debug_event)(
+         BlockDriverState *bs, BlkdebugEvent event);
+-    /* io queue for linux-aio */
+-    void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_io_plug)(BlockDriverState *bs);
+-    void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_io_unplug)(
+-        BlockDriverState *bs);
+-
+     bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs);
+     bool coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_can_store_new_dirty_bitmap)(
+@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
+     unsigned int in_flight;
+     unsigned int serialising_in_flight;
+-    /*
+-     * counter for nested bdrv_io_plug.
+-     * Accessed with atomic ops.
+-     */
+-    unsigned io_plugged;
+-
+     /* do we need to tell the quest if we have a volatile write cache? */
+     int enable_write_cache;
+diff --git a/block/io.c b/block/io.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/io.c
++++ b/block/io.c
+@@ -XXX,XX +XXX,XX @@ void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
+     return mem;
+ }
+-void coroutine_fn bdrv_co_io_plug(BlockDriverState *bs)
+-{
+-    BdrvChild *child;
+-    IO_CODE();
+-    assert_bdrv_graph_readable();
+-
+-    QLIST_FOREACH(child, &bs->children, next) {
+-        bdrv_co_io_plug(child->bs);
+-    }
+-
+-    if (qatomic_fetch_inc(&bs->io_plugged) == 0) {
+-        BlockDriver *drv = bs->drv;
+-        if (drv && drv->bdrv_co_io_plug) {
+-            drv->bdrv_co_io_plug(bs);
+-        }
+-    }
+-}
+-
+-void coroutine_fn bdrv_co_io_unplug(BlockDriverState *bs)
+-{
+-    BdrvChild *child;
+-    IO_CODE();
+-    assert_bdrv_graph_readable();
+-
+-    assert(bs->io_plugged);
+-    if (qatomic_fetch_dec(&bs->io_plugged) == 1) {
+-        BlockDriver *drv = bs->drv;
+-        if (drv && drv->bdrv_co_io_unplug) {
+-            drv->bdrv_co_io_unplug(bs);
+-        }
+-    }
+-
+-    QLIST_FOREACH(child, &bs->children, next) {
+-        bdrv_co_io_unplug(child->bs);
+-    }
+-}
+-
+ /* Helper that undoes bdrv_register_buf() when it fails partway through */
+ static void GRAPH_RDLOCK
+ bdrv_register_buf_rollback(BlockDriverState *bs, void *host, size_t size,
+--
+.40.1

-[PULL for-6.1 2/3] iothread: add aio-max-batch parameter
+[PULL 7/8] block/blkio: use qemu_open() to support fd passing for virtio-blk
 From: Stefano Garzarella <sgarzare@redhat.com>
-The `aio-max-batch` parameter will be propagated to AIO engines
+Some virtio-blk drivers (e.g. virtio-blk-vhost-vdpa) support fd
-and it will be used to control the maximum number of queued requests.
+passing. Let's expose this to the user, so the management layer
 can pass the file descriptor of an already opened path.
-When there are in queue a number of requests equal to `aio-max-batch`,
+If the libblkio virtio-blk driver supports fd passing, let's always
-the engine invokes the system call to forward the requests to the kernel.
+use qemu_open() to open the `path`, so we can handle fd passing
 from the management layer through the "/dev/fdset/N" special path.
-This parameter allows us to control the maximum batch size to reduce
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 the latency that requests might accumulate while queued in the AIO
 engine queue.
 If `aio-max-batch` is equal to 0 (default value), the AIO engine will
 use its default maximum batch size value.
 Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
-Message-id: 20210721094211.69853-3-sgarzare@redhat.com
+Message-id: 20230530071941.8954-2-sgarzare@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- qapi/misc.json            |  6 ++++-
+ block/blkio.c | 53 ++++++++++++++++++++++++++++++++++++++++++---------
- qapi/qom.json             |  7 ++++-
+file changed, 44 insertions(+), 9 deletions(-)
  include/block/aio.h       | 12 +++++++++
  include/sysemu/iothread.h |  3 +++
  iothread.c                | 55 +++++++++++++++++++++++++++++++++++----
  monitor/hmp-cmds.c        |  2 ++
  util/aio-posix.c          | 12 +++++++++
  util/aio-win32.c          |  5 ++++
  util/async.c              |  2 ++
  qemu-options.hx           |  8 ++++--
 files changed, 103 insertions(+), 9 deletions(-)
-diff --git a/qapi/misc.json b/qapi/misc.json
+diff --git a/block/blkio.c b/block/blkio.c
 index XXXXXXX..XXXXXXX 100644
---- a/qapi/misc.json
+--- a/block/blkio.c
-+++ b/qapi/misc.json
++++ b/block/blkio.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static int blkio_virtio_blk_common_open(BlockDriverState *bs,
- # @poll-shrink: how many ns will be removed from polling time, 0 means that
+ {
- #               it's not configured (since 2.9)
+     const char *path = qdict_get_try_str(options, "path");
- #
+     BDRVBlkioState *s = bs->opaque;
-+# @aio-max-batch: maximum number of requests in a batch for the AIO engine,
+-    int ret;
-+#                 0 means that the engine will use its default (since 6.1)
++    bool fd_supported = false;
-+#
++    int fd, ret;
- # Since: 2.0
- ##
+     if (!path) {
- { 'struct': 'IOThreadInfo',
+         error_setg(errp, "missing 'path' option");
-@@ -XXX,XX +XXX,XX @@
+         return -EINVAL;
-            'thread-id': 'int',
+     }
-            'poll-max-ns': 'int',
-            'poll-grow': 'int',
+-    ret = blkio_set_str(s->blkio, "path", path);
--           'poll-shrink': 'int' } }
+-    qdict_del(options, "path");
-+           'poll-shrink': 'int',
+-    if (ret < 0) {
-+           'aio-max-batch': 'int' } }
+-        error_setg_errno(errp, -ret, "failed to set path: %s",
+-                         blkio_get_error_msg());
- ##
+-        return ret;
- # @query-iothreads:
+-    }
-diff --git a/qapi/qom.json b/qapi/qom.json
+-
-index XXXXXXX..XXXXXXX 100644
+     if (!(flags & BDRV_O_NOCACHE)) {
---- a/qapi/qom.json
+         error_setg(errp, "cache.direct=off is not supported");
-+++ b/qapi/qom.json
+         return -EINVAL;
-@@ -XXX,XX +XXX,XX @@
+     }
  #               algorithm detects it is spending too long polling without
  #               encountering events. 0 selects a default behaviour (default: 0)
  #
 +# @aio-max-batch: maximum number of requests in a batch for the AIO engine,
 +#                 0 means that the engine will use its default
 +#                 (default:0, since 6.1)
 +#
  # Since: 2.0
  ##
  { 'struct': 'IothreadProperties',
    'data': { '*poll-max-ns': 'int',
              '*poll-grow': 'int',
 -            '*poll-shrink': 'int' } }
 +            '*poll-shrink': 'int',
 +            '*aio-max-batch': 'int' } }
  ##
  # @MemoryBackendProperties:
 diff --git a/include/block/aio.h b/include/block/aio.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/block/aio.h
 +++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ struct AioContext {
      int64_t poll_grow;      /* polling time growth factor */
      int64_t poll_shrink;    /* polling time shrink factor */
 +    /* AIO engine parameters */
 +    int64_t aio_max_batch;  /* maximum number of requests in a batch */
 +
-     /*
++    if (blkio_get_int(s->blkio, "fd", &fd) == 0) {
-      * List of handlers participating in userspace polling.  Protected by
++        fd_supported = true;
       * ctx->list_lock.  Iterated and modified mostly by the event loop thread
@@ -XXX,XX +XXX,XX @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
                                   int64_t grow, int64_t shrink,
                                   Error **errp);
 +/**
 + * aio_context_set_aio_params:
 + * @ctx: the aio context
 + * @max_batch: maximum number of requests in a batch, 0 means that the
 + *             engine will use its default
 + */
 +void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
 +                                Error **errp);
 +
  #endif
 diff --git a/include/sysemu/iothread.h b/include/sysemu/iothread.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/sysemu/iothread.h
 +++ b/include/sysemu/iothread.h
@@ -XXX,XX +XXX,XX @@ struct IOThread {
      int64_t poll_max_ns;
      int64_t poll_grow;
      int64_t poll_shrink;
 +
 +    /* AioContext AIO engine parameters */
 +    int64_t aio_max_batch;
  };
  typedef struct IOThread IOThread;
 diff --git a/iothread.c b/iothread.c
 index XXXXXXX..XXXXXXX 100644
 --- a/iothread.c
 +++ b/iothread.c
@@ -XXX,XX +XXX,XX @@ static void iothread_init_gcontext(IOThread *iothread)
      iothread->main_loop = g_main_loop_new(iothread->worker_context, TRUE);
  }
 +static void iothread_set_aio_context_params(IOThread *iothread, Error **errp)
 +{
 +    ERRP_GUARD();
 +
 +    aio_context_set_poll_params(iothread->ctx,
 +                                iothread->poll_max_ns,
 +                                iothread->poll_grow,
 +                                iothread->poll_shrink,
 +                                errp);
 +    if (*errp) {
 +        return;
 +    }
 +
-+    aio_context_set_aio_params(iothread->ctx,
++    /*
-+                               iothread->aio_max_batch,
++     * If the libblkio driver supports fd passing, let's always use qemu_open()
-+                               errp);
++     * to open the `path`, so we can handle fd passing from the management
-+}
++     * layer through the "/dev/fdset/N" special path.
 +     */
 +    if (fd_supported) {
 +        int open_flags;
 +
- static void iothread_complete(UserCreatable *obj, Error **errp)
++        if (flags & BDRV_O_RDWR) {
- {
++            open_flags = O_RDWR;
-     Error *local_error = NULL;
++        } else {
-@@ -XXX,XX +XXX,XX @@ static void iothread_complete(UserCreatable *obj, Error **errp)
++            open_flags = O_RDONLY;
-      */
++        }
      iothread_init_gcontext(iothread);
 -    aio_context_set_poll_params(iothread->ctx,
 -                                iothread->poll_max_ns,
 -                                iothread->poll_grow,
 -                                iothread->poll_shrink,
 -                                &local_error);
 +    iothread_set_aio_context_params(iothread, &local_error);
      if (local_error) {
          error_propagate(errp, local_error);
          aio_context_unref(iothread->ctx);
@@ -XXX,XX +XXX,XX @@ static PollParamInfo poll_grow_info = {
  static PollParamInfo poll_shrink_info = {
      "poll-shrink", offsetof(IOThread, poll_shrink),
  };
 +static PollParamInfo aio_max_batch_info = {
 +    "aio-max-batch", offsetof(IOThread, aio_max_batch),
 +};
  static void iothread_get_param(Object *obj, Visitor *v,
          const char *name, void *opaque, Error **errp)
@@ -XXX,XX +XXX,XX @@ static void iothread_set_poll_param(Object *obj, Visitor *v,
      }
  }
 +static void iothread_get_aio_param(Object *obj, Visitor *v,
 +        const char *name, void *opaque, Error **errp)
 +{
 +
-+    iothread_get_param(obj, v, name, opaque, errp);
++        fd = qemu_open(path, open_flags, errp);
-+}
++        if (fd < 0) {
 +            return -EINVAL;
 +        }
 +
-+static void iothread_set_aio_param(Object *obj, Visitor *v,
++        ret = blkio_set_int(s->blkio, "fd", fd);
-+        const char *name, void *opaque, Error **errp)
++        if (ret < 0) {
-+{
++            error_setg_errno(errp, -ret, "failed to set fd: %s",
-+    IOThread *iothread = IOTHREAD(obj);
++                             blkio_get_error_msg());
-+
++            qemu_close(fd);
-+    if (!iothread_set_param(obj, v, name, opaque, errp)) {
++            return ret;
-+        return;
++        }
 +    } else {
 +        ret = blkio_set_str(s->blkio, "path", path);
 +        if (ret < 0) {
 +            error_setg_errno(errp, -ret, "failed to set path: %s",
 +                             blkio_get_error_msg());
 +            return ret;
 +        }
 +    }
 +
-+    if (iothread->ctx) {
++    qdict_del(options, "path");
 +        aio_context_set_aio_params(iothread->ctx,
 +                                   iothread->aio_max_batch,
 +                                   errp);
 +    }
 +}
 +
- static void iothread_class_init(ObjectClass *klass, void *class_data)
+     return 0;
  {
      UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass);
@@ -XXX,XX +XXX,XX @@ static void iothread_class_init(ObjectClass *klass, void *class_data)
                                iothread_get_poll_param,
                                iothread_set_poll_param,
                                NULL, &poll_shrink_info);
 +    object_class_property_add(klass, "aio-max-batch", "int",
 +                              iothread_get_aio_param,
 +                              iothread_set_aio_param,
 +                              NULL, &aio_max_batch_info);
  }
- static const TypeInfo iothread_info = {
-@@ -XXX,XX +XXX,XX @@ static int query_one_iothread(Object *object, void *opaque)
-     info->poll_max_ns = iothread->poll_max_ns;
-     info->poll_grow = iothread->poll_grow;
-     info->poll_shrink = iothread->poll_shrink;
-+    info->aio_max_batch = iothread->aio_max_batch;
-     QAPI_LIST_APPEND(*tail, info);
-     return 0;
-diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
-index XXXXXXX..XXXXXXX 100644
---- a/monitor/hmp-cmds.c
-+++ b/monitor/hmp-cmds.c
-@@ -XXX,XX +XXX,XX @@ void hmp_info_iothreads(Monitor *mon, const QDict *qdict)
-         monitor_printf(mon, "  poll-max-ns=%" PRId64 "\n", value->poll_max_ns);
-         monitor_printf(mon, "  poll-grow=%" PRId64 "\n", value->poll_grow);
-         monitor_printf(mon, "  poll-shrink=%" PRId64 "\n", value->poll_shrink);
-+        monitor_printf(mon, "  aio-max-batch=%" PRId64 "\n",
-+                       value->aio_max_batch);
-     }
-     qapi_free_IOThreadInfoList(info_list);
-diff --git a/util/aio-posix.c b/util/aio-posix.c
-index XXXXXXX..XXXXXXX 100644
---- a/util/aio-posix.c
-+++ b/util/aio-posix.c
-@@ -XXX,XX +XXX,XX @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
-     aio_notify(ctx);
- }
-+
-+void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
-+                                Error **errp)
-+{
-+    /*
-+     * No thread synchronization here, it doesn't matter if an incorrect value
-+     * is used once.
-+     */
-+    ctx->aio_max_batch = max_batch;
-+
-+    aio_notify(ctx);
-+}
-diff --git a/util/aio-win32.c b/util/aio-win32.c
-index XXXXXXX..XXXXXXX 100644
---- a/util/aio-win32.c
-+++ b/util/aio-win32.c
-@@ -XXX,XX +XXX,XX @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
-         error_setg(errp, "AioContext polling is not implemented on Windows");
-     }
- }
-+
-+void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
-+                                Error **errp)
-+{
-+}
-diff --git a/util/async.c b/util/async.c
-index XXXXXXX..XXXXXXX 100644
---- a/util/async.c
-+++ b/util/async.c
-@@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp)
-     ctx->poll_grow = 0;
-     ctx->poll_shrink = 0;
-+    ctx->aio_max_batch = 0;
-+
-     return ctx;
- fail:
-     g_source_destroy(&ctx->source);
-diff --git a/qemu-options.hx b/qemu-options.hx
-index XXXXXXX..XXXXXXX 100644
---- a/qemu-options.hx
-+++ b/qemu-options.hx
-@@ -XXX,XX +XXX,XX @@ SRST
-             CN=laptop.example.com,O=Example Home,L=London,ST=London,C=GB
--    ``-object iothread,id=id,poll-max-ns=poll-max-ns,poll-grow=poll-grow,poll-shrink=poll-shrink``
-+    ``-object iothread,id=id,poll-max-ns=poll-max-ns,poll-grow=poll-grow,poll-shrink=poll-shrink,aio-max-batch=aio-max-batch``
-         Creates a dedicated event loop thread that devices can be
-         assigned to. This is known as an IOThread. By default device
-         emulation happens in vCPU threads or the main event loop thread.
-@@ -XXX,XX +XXX,XX @@ SRST
-         the polling time when the algorithm detects it is spending too
-         long polling without encountering events.
--        The polling parameters can be modified at run-time using the
-+        The ``aio-max-batch`` parameter is the maximum number of requests
-+        in a batch for the AIO engine, 0 means that the engine will use
-+        its default.
-+
-+        The IOThread parameters can be modified at run-time using the
-         ``qom-set`` command (where ``iothread1`` is the IOThread's
-         ``id``):
 --
-.31.1
+.40.1

-[PULL for-6.1 1/3] iothread: generalize iothread_set_param/iothread_get_param
+[PULL 8/8] qapi: add '@fdset' feature for BlockdevOptionsVirtioBlkVhostVdpa
 From: Stefano Garzarella <sgarzare@redhat.com>
-Changes in preparation for next patches where we add a new
+The virtio-blk-vhost-vdpa driver in libblkio 1.3.0 supports fd
-parameter not related to the poll mechanism.
+passing through the new 'fd' property.
-Let's add two new generic functions (iothread_set_param and
+Since now we are using qemu_open() on '@path' if the virtio-blk driver
-iothread_get_param) that we use to set and get IOThread
+supports fd passing, let's announce it.
-parameters.
+In this way, the management layer can pass the file descriptor of an
 already opened vhost-vdpa character device. This is useful especially
 when the device can only be accessed with certain privileges.
+Add the '@fdset' feature only when the virtio-blk-vhost-vdpa driver
+in libblkio supports it.
+Suggested-by: Markus Armbruster <armbru@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
-Message-id: 20210721094211.69853-2-sgarzare@redhat.com
+Message-id: 20230530071941.8954-3-sgarzare@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- iothread.c | 27 +++++++++++++++++++++++----
+ qapi/block-core.json | 6 ++++++
-file changed, 23 insertions(+), 4 deletions(-)
+ meson.build          | 4 ++++
 files changed, 10 insertions(+)
-diff --git a/iothread.c b/iothread.c
+diff --git a/qapi/block-core.json b/qapi/block-core.json
 index XXXXXXX..XXXXXXX 100644
---- a/iothread.c
+--- a/qapi/block-core.json
-+++ b/iothread.c
++++ b/qapi/block-core.json
-@@ -XXX,XX +XXX,XX @@ static PollParamInfo poll_shrink_info = {
+@@ -XXX,XX +XXX,XX @@
-     "poll-shrink", offsetof(IOThread, poll_shrink),
+ #
- };
+ # @path: path to the vhost-vdpa character device.
+ #
--static void iothread_get_poll_param(Object *obj, Visitor *v,
++# Features:
-+static void iothread_get_param(Object *obj, Visitor *v,
++# @fdset: Member @path supports the special "/dev/fdset/N" path
-         const char *name, void *opaque, Error **errp)
++#     (since 8.1)
- {
++#
-     IOThread *iothread = IOTHREAD(obj);
+ # Since: 7.2
-@@ -XXX,XX +XXX,XX @@ static void iothread_get_poll_param(Object *obj, Visitor *v,
+ ##
-     visit_type_int64(v, name, field, errp);
+ { 'struct': 'BlockdevOptionsVirtioBlkVhostVdpa',
- }
+   'data': { 'path': 'str' },
++  'features': [ { 'name' :'fdset',
--static void iothread_set_poll_param(Object *obj, Visitor *v,
++                  'if': 'CONFIG_BLKIO_VHOST_VDPA_FD' } ],
-+static bool iothread_set_param(Object *obj, Visitor *v,
+   'if': 'CONFIG_BLKIO' }
-         const char *name, void *opaque, Error **errp)
- {
+ ##
-     IOThread *iothread = IOTHREAD(obj);
+diff --git a/meson.build b/meson.build
-@@ -XXX,XX +XXX,XX @@ static void iothread_set_poll_param(Object *obj, Visitor *v,
+index XXXXXXX..XXXXXXX 100644
-     int64_t value;
+--- a/meson.build
++++ b/meson.build
-     if (!visit_type_int64(v, name, &value, errp)) {
+@@ -XXX,XX +XXX,XX @@ config_host_data.set('CONFIG_LZO', lzo.found())
--        return;
+ config_host_data.set('CONFIG_MPATH', mpathpersist.found())
-+        return false;
+ config_host_data.set('CONFIG_MPATH_NEW_API', mpathpersist_new_api)
-     }
+ config_host_data.set('CONFIG_BLKIO', blkio.found())
++if blkio.found()
-     if (value < 0) {
++  config_host_data.set('CONFIG_BLKIO_VHOST_VDPA_FD',
-         error_setg(errp, "%s value must be in range [0, %" PRId64 "]",
++                       blkio.version().version_compare('>=1.3.0'))
-                    info->name, INT64_MAX);
++endif
--        return;
+ config_host_data.set('CONFIG_CURL', curl.found())
-+        return false;
+ config_host_data.set('CONFIG_CURSES', curses.found())
-     }
+ config_host_data.set('CONFIG_GBM', gbm.found())
      *field = value;
 +    return true;
 +}
 +
 +static void iothread_get_poll_param(Object *obj, Visitor *v,
 +        const char *name, void *opaque, Error **errp)
 +{
 +
 +    iothread_get_param(obj, v, name, opaque, errp);
 +}
 +
 +static void iothread_set_poll_param(Object *obj, Visitor *v,
 +        const char *name, void *opaque, Error **errp)
 +{
 +    IOThread *iothread = IOTHREAD(obj);
 +
 +    if (!iothread_set_param(obj, v, name, opaque, errp)) {
 +        return;
 +    }
 +
      if (iothread->ctx) {
          aio_context_set_poll_params(iothread->ctx,
                                      iothread->poll_max_ns,
 --
-.31.1
+.40.1

The following changes since commit 801f3db7564dcce8a37a70833c0abe40ec19f8ce:

Merge remote-tracking branch 'remotes/philmd/tags/kconfig-20210720' into staging (2021-07-20 19:30:28 +0100)

are available in the Git repository at:

https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to d7ddd0a1618a75b31dc308bb37365ce1da972154:

linux-aio: limit the batch size using `aio-max-batch` parameter (2021-07-21 13:47:50 +0100)

----------------------------------------------------------------
Pull request

Stefano's performance regression fix for commit 2558cb8dd4 ("linux-aio:
increasing MAX_EVENTS to a larger hardcoded value").

----------------------------------------------------------------

Stefano Garzarella (3):
  iothread: generalize iothread_set_param/iothread_get_param
  iothread: add aio-max-batch parameter
  linux-aio: limit the batch size using `aio-max-batch` parameter

-- 
2.31.1

From: Stefano Garzarella <sgarzare@redhat.com>

Changes in preparation for next patches where we add a new
parameter not related to the poll mechanism.

Let's add two new generic functions (iothread_set_param and
iothread_get_param) that we use to set and get IOThread
parameters.

Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-id: 20210721094211.69853-2-sgarzare@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 iothread.c | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/iothread.c b/iothread.c
index XXXXXXX..XXXXXXX 100644
--- a/iothread.c
+++ b/iothread.c
@@ -XXX,XX +XXX,XX @@ static PollParamInfo poll_shrink_info = {
     "poll-shrink", offsetof(IOThread, poll_shrink),
 };
 
-static void iothread_get_poll_param(Object *obj, Visitor *v,
+static void iothread_get_param(Object *obj, Visitor *v,
         const char *name, void *opaque, Error **errp)
 {
     IOThread *iothread = IOTHREAD(obj);
@@ -XXX,XX +XXX,XX @@ static void iothread_get_poll_param(Object *obj, Visitor *v,
     visit_type_int64(v, name, field, errp);
 }
 
-static void iothread_set_poll_param(Object *obj, Visitor *v,
+static bool iothread_set_param(Object *obj, Visitor *v,
         const char *name, void *opaque, Error **errp)
 {
     IOThread *iothread = IOTHREAD(obj);
@@ -XXX,XX +XXX,XX @@ static void iothread_set_poll_param(Object *obj, Visitor *v,
     int64_t value;
 
     if (!visit_type_int64(v, name, &value, errp)) {
-        return;
+        return false;
     }
 
     if (value < 0) {
         error_setg(errp, "%s value must be in range [0, %" PRId64 "]",
                    info->name, INT64_MAX);
-        return;
+        return false;
     }
 
     *field = value;
 
+    return true;
+}
+
+static void iothread_get_poll_param(Object *obj, Visitor *v,
+        const char *name, void *opaque, Error **errp)
+{
+
+    iothread_get_param(obj, v, name, opaque, errp);
+}
+
+static void iothread_set_poll_param(Object *obj, Visitor *v,
+        const char *name, void *opaque, Error **errp)
+{
+    IOThread *iothread = IOTHREAD(obj);
+
+    if (!iothread_set_param(obj, v, name, opaque, errp)) {
+        return;
+    }
+
     if (iothread->ctx) {
         aio_context_set_poll_params(iothread->ctx,
                                     iothread->poll_max_ns,
-- 
2.31.1

From: Stefano Garzarella <sgarzare@redhat.com>

The `aio-max-batch` parameter will be propagated to AIO engines
and it will be used to control the maximum number of queued requests.

When the number of queued requests reaches `aio-max-batch`, the engine
invokes the system call to forward the requests to the kernel.

This parameter allows us to control the maximum batch size to reduce
the latency that requests might accumulate while queued in the AIO
engine queue.

If `aio-max-batch` is equal to 0 (default value), the AIO engine will
use its default maximum batch size value.

Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-id: 20210721094211.69853-3-sgarzare@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 qapi/misc.json            |  6 ++++-
 qapi/qom.json             |  7 ++++-
 include/block/aio.h       | 12 +++++++++
 include/sysemu/iothread.h |  3 +++
 iothread.c                | 55 +++++++++++++++++++++++++++++++++++----
 monitor/hmp-cmds.c        |  2 ++
 util/aio-posix.c          | 12 +++++++++
 util/aio-win32.c          |  5 ++++
 util/async.c              |  2 ++
 qemu-options.hx           |  8 ++++--
 10 files changed, 103 insertions(+), 9 deletions(-)

diff --git a/qapi/misc.json b/qapi/misc.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/misc.json
+++ b/qapi/misc.json
@@ -XXX,XX +XXX,XX @@
 # @poll-shrink: how many ns will be removed from polling time, 0 means that
 #               it's not configured (since 2.9)
 #
+# @aio-max-batch: maximum number of requests in a batch for the AIO engine,
+#                 0 means that the engine will use its default (since 6.1)
+#
 # Since: 2.0
 ##
 { 'struct': 'IOThreadInfo',
@@ -XXX,XX +XXX,XX @@
            'thread-id': 'int',
            'poll-max-ns': 'int',
            'poll-grow': 'int',
-           'poll-shrink': 'int' } }
+           'poll-shrink': 'int',
+           'aio-max-batch': 'int' } }
 
 ##
 # @query-iothreads:
diff --git a/qapi/qom.json b/qapi/qom.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -XXX,XX +XXX,XX @@
 #               algorithm detects it is spending too long polling without
 #               encountering events. 0 selects a default behaviour (default: 0)
 #
+# @aio-max-batch: maximum number of requests in a batch for the AIO engine,
+#                 0 means that the engine will use its default
+#                 (default:0, since 6.1)
+#
 # Since: 2.0
 ##
 { 'struct': 'IothreadProperties',
   'data': { '*poll-max-ns': 'int',
             '*poll-grow': 'int',
-            '*poll-shrink': 'int' } }
+            '*poll-shrink': 'int',
+            '*aio-max-batch': 'int' } }
 
 ##
 # @MemoryBackendProperties:
diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ struct AioContext {
     int64_t poll_grow;      /* polling time growth factor */
     int64_t poll_shrink;    /* polling time shrink factor */
 
+    /* AIO engine parameters */
+    int64_t aio_max_batch;  /* maximum number of requests in a batch */
+
     /*
      * List of handlers participating in userspace polling.  Protected by
      * ctx->list_lock.  Iterated and modified mostly by the event loop thread
@@ -XXX,XX +XXX,XX @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
                                  int64_t grow, int64_t shrink,
                                  Error **errp);
 
+/**
+ * aio_context_set_aio_params:
+ * @ctx: the aio context
+ * @max_batch: maximum number of requests in a batch, 0 means that the
+ *             engine will use its default
+ */
+void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
+                                Error **errp);
+
 #endif
diff --git a/include/sysemu/iothread.h b/include/sysemu/iothread.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/iothread.h
+++ b/include/sysemu/iothread.h
@@ -XXX,XX +XXX,XX @@ struct IOThread {
     int64_t poll_max_ns;
     int64_t poll_grow;
     int64_t poll_shrink;
+
+    /* AioContext AIO engine parameters */
+    int64_t aio_max_batch;
 };
 typedef struct IOThread IOThread;
 
diff --git a/iothread.c b/iothread.c
index XXXXXXX..XXXXXXX 100644
--- a/iothread.c
+++ b/iothread.c
@@ -XXX,XX +XXX,XX @@ static void iothread_init_gcontext(IOThread *iothread)
     iothread->main_loop = g_main_loop_new(iothread->worker_context, TRUE);
 }
 
+static void iothread_set_aio_context_params(IOThread *iothread, Error **errp)
+{
+    ERRP_GUARD();
+
+    aio_context_set_poll_params(iothread->ctx,
+                                iothread->poll_max_ns,
+                                iothread->poll_grow,
+                                iothread->poll_shrink,
+                                errp);
+    if (*errp) {
+        return;
+    }
+
+    aio_context_set_aio_params(iothread->ctx,
+                               iothread->aio_max_batch,
+                               errp);
+}
+
 static void iothread_complete(UserCreatable *obj, Error **errp)
 {
     Error *local_error = NULL;
@@ -XXX,XX +XXX,XX @@ static void iothread_complete(UserCreatable *obj, Error **errp)
      */
     iothread_init_gcontext(iothread);
 
-    aio_context_set_poll_params(iothread->ctx,
-                                iothread->poll_max_ns,
-                                iothread->poll_grow,
-                                iothread->poll_shrink,
-                                &local_error);
+    iothread_set_aio_context_params(iothread, &local_error);
     if (local_error) {
         error_propagate(errp, local_error);
         aio_context_unref(iothread->ctx);
@@ -XXX,XX +XXX,XX @@ static PollParamInfo poll_grow_info = {
 static PollParamInfo poll_shrink_info = {
     "poll-shrink", offsetof(IOThread, poll_shrink),
 };
+static PollParamInfo aio_max_batch_info = {
+    "aio-max-batch", offsetof(IOThread, aio_max_batch),
+};
 
 static void iothread_get_param(Object *obj, Visitor *v,
         const char *name, void *opaque, Error **errp)
@@ -XXX,XX +XXX,XX @@ static void iothread_set_poll_param(Object *obj, Visitor *v,
     }
 }
 
+static void iothread_get_aio_param(Object *obj, Visitor *v,
+        const char *name, void *opaque, Error **errp)
+{
+
+    iothread_get_param(obj, v, name, opaque, errp);
+}
+
+static void iothread_set_aio_param(Object *obj, Visitor *v,
+        const char *name, void *opaque, Error **errp)
+{
+    IOThread *iothread = IOTHREAD(obj);
+
+    if (!iothread_set_param(obj, v, name, opaque, errp)) {
+        return;
+    }
+
+    if (iothread->ctx) {
+        aio_context_set_aio_params(iothread->ctx,
+                                   iothread->aio_max_batch,
+                                   errp);
+    }
+}
+
 static void iothread_class_init(ObjectClass *klass, void *class_data)
 {
     UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass);
@@ -XXX,XX +XXX,XX @@ static void iothread_class_init(ObjectClass *klass, void *class_data)
                               iothread_get_poll_param,
                               iothread_set_poll_param,
                               NULL, &poll_shrink_info);
+    object_class_property_add(klass, "aio-max-batch", "int",
+                              iothread_get_aio_param,
+                              iothread_set_aio_param,
+                              NULL, &aio_max_batch_info);
 }
 
 static const TypeInfo iothread_info = {
@@ -XXX,XX +XXX,XX @@ static int query_one_iothread(Object *object, void *opaque)
     info->poll_max_ns = iothread->poll_max_ns;
     info->poll_grow = iothread->poll_grow;
     info->poll_shrink = iothread->poll_shrink;
+    info->aio_max_batch = iothread->aio_max_batch;
 
     QAPI_LIST_APPEND(*tail, info);
     return 0;
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index XXXXXXX..XXXXXXX 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -XXX,XX +XXX,XX @@ void hmp_info_iothreads(Monitor *mon, const QDict *qdict)
         monitor_printf(mon, "  poll-max-ns=%" PRId64 "\n", value->poll_max_ns);
         monitor_printf(mon, "  poll-grow=%" PRId64 "\n", value->poll_grow);
         monitor_printf(mon, "  poll-shrink=%" PRId64 "\n", value->poll_shrink);
+        monitor_printf(mon, "  aio-max-batch=%" PRId64 "\n",
+                       value->aio_max_batch);
     }
 
     qapi_free_IOThreadInfoList(info_list);
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
 
     aio_notify(ctx);
 }
+
+void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
+                                Error **errp)
+{
+    /*
+     * No thread synchronization here, it doesn't matter if an incorrect value
+     * is used once.
+     */
+    ctx->aio_max_batch = max_batch;
+
+    aio_notify(ctx);
+}
diff --git a/util/aio-win32.c b/util/aio-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
         error_setg(errp, "AioContext polling is not implemented on Windows");
     }
 }
+
+void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
+                                Error **errp)
+{
+}
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp)
     ctx->poll_grow = 0;
     ctx->poll_shrink = 0;
 
+    ctx->aio_max_batch = 0;
+
     return ctx;
 fail:
     g_source_destroy(&ctx->source);
diff --git a/qemu-options.hx b/qemu-options.hx
index XXXXXXX..XXXXXXX 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -XXX,XX +XXX,XX @@ SRST
 
             CN=laptop.example.com,O=Example Home,L=London,ST=London,C=GB
 
-    ``-object iothread,id=id,poll-max-ns=poll-max-ns,poll-grow=poll-grow,poll-shrink=poll-shrink``
+    ``-object iothread,id=id,poll-max-ns=poll-max-ns,poll-grow=poll-grow,poll-shrink=poll-shrink,aio-max-batch=aio-max-batch``
         Creates a dedicated event loop thread that devices can be
         assigned to. This is known as an IOThread. By default device
         emulation happens in vCPU threads or the main event loop thread.
@@ -XXX,XX +XXX,XX @@ SRST
         the polling time when the algorithm detects it is spending too
         long polling without encountering events.
 
-        The polling parameters can be modified at run-time using the
+        The ``aio-max-batch`` parameter is the maximum number of requests
+        in a batch for the AIO engine, 0 means that the engine will use
+        its default.
+
+        The IOThread parameters can be modified at run-time using the
         ``qom-set`` command (where ``iothread1`` is the IOThread's
         ``id``):
 
-- 
2.31.1

From: Stefano Garzarella <sgarzare@redhat.com>

When there are multiple queues attached to the same AIO context,
some requests may experience high latency, since in the worst case
the AIO engine queue is only flushed when it is full (MAX_EVENTS) or
there are no more queues plugged.

Commit 2558cb8dd4 ("linux-aio: increasing MAX_EVENTS to a larger
hardcoded value") changed MAX_EVENTS from 128 to 1024, to increase
the number of in-flight requests. But this change also increased
the potential maximum batch to 1024 elements.

When there is a single queue attached to the AIO context, the issue
is mitigated by laio_io_unplug(), which flushes the queue every time
it is invoked, since there can't be other queues plugged.

Let's use the new `aio-max-batch` IOThread parameter to mitigate
this issue, limiting the number of requests in a batch.

We also define a default value (32): this value was obtained by running
some benchmarks and it represents a good tradeoff between the latency
increase while a request is queued and the cost of the io_submit(2)
system call.

Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-id: 20210721094211.69853-4-sgarzare@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/linux-aio.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/block/linux-aio.c b/block/linux-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -XXX,XX +XXX,XX @@
  */
 #define MAX_EVENTS 1024
 
+/* Maximum number of requests in a batch. (default value) */
+#define DEFAULT_MAX_BATCH 32
+
 struct qemu_laiocb {
     Coroutine *co;
     LinuxAioState *ctx;
@@ -XXX,XX +XXX,XX @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
     LinuxAioState *s = laiocb->ctx;
     struct iocb *iocbs = &laiocb->iocb;
     QEMUIOVector *qiov = laiocb->qiov;
+    int64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH;
+
+    /* limit the batch with the number of available events */
+    max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch);
 
     switch (type) {
     case QEMU_AIO_WRITE:
@@ -XXX,XX +XXX,XX @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
     s->io_q.in_queue++;
     if (!s->io_q.blocked &&
         (!s->io_q.plugged ||
-         s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) {
+         s->io_q.in_queue >= max_batch)) {
         ioq_submit(s);
     }
 
-- 
2.31.1

The following changes since commit c6a5fc2ac76c5ab709896ee1b0edd33685a67ed1:

decodetree: Add --output-null for meson testing (2023-05-31 19:56:42 -0700)

are available in the Git repository at:

https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to 98b126f5e3228a346c774e569e26689943b401dd:

qapi: add '@fdset' feature for BlockdevOptionsVirtioBlkVhostVdpa (2023-06-01 11:08:21 -0400)

----------------------------------------------------------------
Pull request

- Stefano Garzarella's blkio block driver 'fd' parameter
- My thread-local blk_io_plug() series

----------------------------------------------------------------

Stefan Hajnoczi (6):
  block: add blk_io_plug_call() API
  block/nvme: convert to blk_io_plug_call() API
  block/blkio: convert to blk_io_plug_call() API
  block/io_uring: convert to blk_io_plug_call() API
  block/linux-aio: convert to blk_io_plug_call() API
  block: remove bdrv_co_io_plug() API

Stefano Garzarella (2):
  block/blkio: use qemu_open() to support fd passing for virtio-blk
  qapi: add '@fdset' feature for BlockdevOptionsVirtioBlkVhostVdpa

-- 
2.40.1

Introduce a new API for thread-local blk_io_plug() that does not
traverse the block graph. The goal is to make blk_io_plug() multi-queue
friendly.

Instead of having block drivers track whether or not we're in a plugged
section, provide an API that allows them to defer a function call until
we're unplugged: blk_io_plug_call(fn, opaque). If blk_io_plug_call() is
called multiple times with the same fn/opaque pair, then fn() is only
called once at the end of the outermost plugged section - resulting in
batching.

This patch introduces the API and changes blk_io_plug()/blk_io_unplug().
blk_io_plug()/blk_io_unplug() no longer require a BlockBackend argument
because the plug state is now thread-local.

Later patches convert block drivers to blk_io_plug_call() and then we
can finally remove .bdrv_co_io_plug() once all block drivers have been
converted.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Message-id: 20230530180959.1108766-2-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 MAINTAINERS                       |   1 +
 include/sysemu/block-backend-io.h |  13 +--
 block/block-backend.c             |  22 -----
 block/plug.c                      | 159 ++++++++++++++++++++++++++++++
 hw/block/dataplane/xen-block.c    |   8 +-
 hw/block/virtio-blk.c             |   4 +-
 hw/scsi/virtio-scsi.c             |   6 +-
 block/meson.build                 |   1 +
 8 files changed, 173 insertions(+), 41 deletions(-)
 create mode 100644 block/plug.c

diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ F: util/aio-*.c
 F: util/aio-*.h
 F: util/fdmon-*.c
 F: block/io.c
+F: block/plug.c
 F: migration/block*
 F: include/block/aio.h
 F: include/block/aio-wait.h
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -XXX,XX +XXX,XX @@ void blk_iostatus_set_err(BlockBackend *blk, int error);
 int blk_get_max_iov(BlockBackend *blk);
 int blk_get_max_hw_iov(BlockBackend *blk);
 
-/*
- * blk_io_plug/unplug are thread-local operations. This means that multiple
- * IOThreads can simultaneously call plug/unplug, but the caller must ensure
- * that each unplug() is called in the same IOThread of the matching plug().
- */
-void coroutine_fn blk_co_io_plug(BlockBackend *blk);
-void co_wrapper blk_io_plug(BlockBackend *blk);
-
-void coroutine_fn blk_co_io_unplug(BlockBackend *blk);
-void co_wrapper blk_io_unplug(BlockBackend *blk);
+void blk_io_plug(void);
+void blk_io_unplug(void);
+void blk_io_plug_call(void (*fn)(void *), void *opaque);
 
 AioContext *blk_get_aio_context(BlockBackend *blk);
 BlockAcctStats *blk_get_stats(BlockBackend *blk);
diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
     notifier_list_add(&blk->insert_bs_notifiers, notify);
 }
 
-void coroutine_fn blk_co_io_plug(BlockBackend *blk)
-{
-    BlockDriverState *bs = blk_bs(blk);
-    IO_CODE();
-    GRAPH_RDLOCK_GUARD();
-
-    if (bs) {
-        bdrv_co_io_plug(bs);
-    }
-}
-
-void coroutine_fn blk_co_io_unplug(BlockBackend *blk)
-{
-    BlockDriverState *bs = blk_bs(blk);
-    IO_CODE();
-    GRAPH_RDLOCK_GUARD();
-
-    if (bs) {
-        bdrv_co_io_unplug(bs);
-    }
-}
-
 BlockAcctStats *blk_get_stats(BlockBackend *blk)
 {
     IO_CODE();
diff --git a/block/plug.c b/block/plug.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/block/plug.c
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Block I/O plugging
+ *
+ * Copyright Red Hat.
+ *
+ * This API defers a function call within a blk_io_plug()/blk_io_unplug()
+ * section, allowing multiple calls to batch up. This is a performance
+ * optimization that is used in the block layer to submit several I/O requests
+ * at once instead of individually:
+ *
+ *   blk_io_plug(); <-- start of plugged region
+ *   ...
+ *   blk_io_plug_call(my_func, my_obj); <-- deferred my_func(my_obj) call
+ *   blk_io_plug_call(my_func, my_obj); <-- another
+ *   blk_io_plug_call(my_func, my_obj); <-- another
+ *   ...
+ *   blk_io_unplug(); <-- end of plugged region, my_func(my_obj) is called once
+ *
+ * This code is actually generic and not tied to the block layer. If another
+ * subsystem needs this functionality, it could be renamed.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/coroutine-tls.h"
+#include "qemu/notify.h"
+#include "qemu/thread.h"
+#include "sysemu/block-backend.h"
+
+/* A function call that has been deferred until unplug() */
+typedef struct {
+    void (*fn)(void *);
+    void *opaque;
+} UnplugFn;
+
+/* Per-thread state */
+typedef struct {
+    unsigned count;       /* how many times has plug() been called? */
+    GArray *unplug_fns;   /* functions to call at unplug time */
+} Plug;
+
+/* Use get_ptr_plug() to fetch this thread-local value */
+QEMU_DEFINE_STATIC_CO_TLS(Plug, plug);
+
+/* Called at thread cleanup time */
+static void blk_io_plug_atexit(Notifier *n, void *value)
+{
+    Plug *plug = get_ptr_plug();
+    g_array_free(plug->unplug_fns, TRUE);
+}
+
+/* This won't involve coroutines, so use __thread */
+static __thread Notifier blk_io_plug_atexit_notifier;
+
+/**
+ * blk_io_plug_call:
+ * @fn: a function pointer to be invoked
+ * @opaque: a user-defined argument to @fn()
+ *
+ * Call @fn(@opaque) immediately if not within a blk_io_plug()/blk_io_unplug()
+ * section.
+ *
+ * Otherwise defer the call until the end of the outermost
+ * blk_io_plug()/blk_io_unplug() section in this thread. If the same
+ * @fn/@opaque pair has already been deferred, it will only be called once upon
+ * blk_io_unplug() so that accumulated calls are batched into a single call.
+ *
+ * The caller must ensure that @opaque is not freed before @fn() is invoked.
+ */
+void blk_io_plug_call(void (*fn)(void *), void *opaque)
+{
+    Plug *plug = get_ptr_plug();
+
+    /* Call immediately if we're not plugged */
+    if (plug->count == 0) {
+        fn(opaque);
+        return;
+    }
+
+    GArray *array = plug->unplug_fns;
+    if (!array) {
+        array = g_array_new(FALSE, FALSE, sizeof(UnplugFn));
+        plug->unplug_fns = array;
+        blk_io_plug_atexit_notifier.notify = blk_io_plug_atexit;
+        qemu_thread_atexit_add(&blk_io_plug_atexit_notifier);
+    }
+
+    UnplugFn *fns = (UnplugFn *)array->data;
+    UnplugFn new_fn = {
+        .fn = fn,
+        .opaque = opaque,
+    };
+
+    /*
+     * There won't be many, so do a linear search. If this becomes a bottleneck
+     * then a binary search (glib 2.62+) or different data structure could be
+     * used.
+     */
+    for (guint i = 0; i < array->len; i++) {
+        if (memcmp(&fns[i], &new_fn, sizeof(new_fn)) == 0) {
+            return; /* already exists */
+        }
+    }
+
+    g_array_append_val(array, new_fn);
+}
+
+/**
+ * blk_io_plug: Defer blk_io_plug_call() functions until blk_io_unplug()
+ *
+ * blk_io_plug/unplug are thread-local operations. This means that multiple
+ * threads can simultaneously call plug/unplug, but the caller must ensure that
+ * each unplug() is called in the same thread of the matching plug().
+ *
+ * Nesting is supported. blk_io_plug_call() functions are only called at the
+ * outermost blk_io_unplug().
+ */
+void blk_io_plug(void)
+{
+    Plug *plug = get_ptr_plug();
+
+    assert(plug->count < UINT32_MAX);
+
+    plug->count++;
+}
+
+/**
+ * blk_io_unplug: Run any pending blk_io_plug_call() functions
+ *
+ * There must have been a matching blk_io_plug() call in the same thread prior
+ * to this blk_io_unplug() call.
+ */
+void blk_io_unplug(void)
+{
+    Plug *plug = get_ptr_plug();
+
+    assert(plug->count > 0);
+
+    if (--plug->count > 0) {
+        return;
+    }
+
+    GArray *array = plug->unplug_fns;
+    if (!array) {
+        return;
+    }
+
+    UnplugFn *fns = (UnplugFn *)array->data;
+
+    for (guint i = 0; i < array->len; i++) {
+        fns[i].fn(fns[i].opaque);
+    }
+
+    /*
+     * This resets the array without freeing memory so that appending is cheap
+     * in the future.
+     */
+    g_array_set_size(array, 0);
+}
diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/dataplane/xen-block.c
+++ b/hw/block/dataplane/xen-block.c
@@ -XXX,XX +XXX,XX @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)
      * is below us.
      */
     if (inflight_atstart > IO_PLUG_THRESHOLD) {
-        blk_io_plug(dataplane->blk);
+        blk_io_plug();
     }
     while (rc != rp) {
         /* pull request from ring */
@@ -XXX,XX +XXX,XX @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)
 
         if (inflight_atstart > IO_PLUG_THRESHOLD &&
             batched >= inflight_atstart) {
-            blk_io_unplug(dataplane->blk);
+            blk_io_unplug();
         }
         xen_block_do_aio(request);
         if (inflight_atstart > IO_PLUG_THRESHOLD) {
             if (batched >= inflight_atstart) {
-                blk_io_plug(dataplane->blk);
+                blk_io_plug();
                 batched = 0;
             } else {
                 batched++;
@@ -XXX,XX +XXX,XX @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)
         }
     }
     if (inflight_atstart > IO_PLUG_THRESHOLD) {
-        blk_io_unplug(dataplane->blk);
+        blk_io_unplug();
     }
 
     return done_something;
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
     bool suppress_notifications = virtio_queue_get_notification(vq);
 
     aio_context_acquire(blk_get_aio_context(s->blk));
-    blk_io_plug(s->blk);
+    blk_io_plug();
 
     do {
         if (suppress_notifications) {
@@ -XXX,XX +XXX,XX @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
         virtio_blk_submit_multireq(s, &mrb);
     }
 
-    blk_io_unplug(s->blk);
+    blk_io_unplug();
     aio_context_release(blk_get_aio_context(s->blk));
 }
 
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -XXX,XX +XXX,XX @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req)
         return -ENOBUFS;
     }
     scsi_req_ref(req->sreq);
-    blk_io_plug(d->conf.blk);
+    blk_io_plug();
     object_unref(OBJECT(d));
     return 0;
 }
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_cmd_req_submit(VirtIOSCSI *s, VirtIOSCSIReq *req)
     if (scsi_req_enqueue(sreq)) {
         scsi_req_continue(sreq);
     }
-    blk_io_unplug(sreq->dev->conf.blk);
+    blk_io_unplug();
     scsi_req_unref(sreq);
 }
 
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
                 while (!QTAILQ_EMPTY(&reqs)) {
                     req = QTAILQ_FIRST(&reqs);
                     QTAILQ_REMOVE(&reqs, req, next);
-                    blk_io_unplug(req->sreq->dev->conf.blk);
+                    blk_io_unplug();
                     scsi_req_unref(req->sreq);
                     virtqueue_detach_element(req->vq, &req->elem, 0);
                     virtio_scsi_free_req(req);
diff --git a/block/meson.build b/block/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -XXX,XX +XXX,XX @@ block_ss.add(files(
   'mirror.c',
   'nbd.c',
   'null.c',
+  'plug.c',
   'qapi.c',
   'qcow2-bitmap.c',
   'qcow2-cache.c',
-- 
2.40.1

Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/vfio-helpers.h"
 #include "block/block-io.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "sysemu/replay.h"
 #include "trace.h"
 
@@ -XXX,XX +XXX,XX @@ struct BDRVNVMeState {
     int blkshift;
 
     uint64_t max_transfer;
-    bool plugged;
 
     bool supports_write_zeroes;
     bool supports_discard;
@@ -XXX,XX +XXX,XX @@ static void nvme_kick(NVMeQueuePair *q)
 {
     BDRVNVMeState *s = q->s;
 
-    if (s->plugged || !q->need_kick) {
+    if (!q->need_kick) {
         return;
     }
     trace_nvme_kick(s, q->index);
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
     NvmeCqe *c;
 
     trace_nvme_process_completion(s, q->index, q->inflight);
-    if (s->plugged) {
-        trace_nvme_process_completion_queue_plugged(s, q->index);
-        return false;
-    }
 
     /*
      * Support re-entrancy when a request cb() function invokes aio_poll().
@@ -XXX,XX +XXX,XX @@ static void nvme_trace_command(const NvmeCmd *cmd)
     }
 }
 
+static void nvme_unplug_fn(void *opaque)
+{
+    NVMeQueuePair *q = opaque;
+
+    QEMU_LOCK_GUARD(&q->lock);
+    nvme_kick(q);
+    nvme_process_completion(q);
+}
+
 static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
                                 NvmeCmd *cmd, BlockCompletionFunc cb,
                                 void *opaque)
@@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
            q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
     q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
     q->need_kick++;
-    nvme_kick(q);
-    nvme_process_completion(q);
+    blk_io_plug_call(nvme_unplug_fn, q);
     qemu_mutex_unlock(&q->lock);
 }
 
@@ -XXX,XX +XXX,XX @@ static void nvme_attach_aio_context(BlockDriverState *bs,
     }
 }
 
-static void coroutine_fn nvme_co_io_plug(BlockDriverState *bs)
-{
-    BDRVNVMeState *s = bs->opaque;
-    assert(!s->plugged);
-    s->plugged = true;
-}
-
-static void coroutine_fn nvme_co_io_unplug(BlockDriverState *bs)
-{
-    BDRVNVMeState *s = bs->opaque;
-    assert(s->plugged);
-    s->plugged = false;
-    for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
-        NVMeQueuePair *q = s->queues[i];
-        qemu_mutex_lock(&q->lock);
-        nvme_kick(q);
-        nvme_process_completion(q);
-        qemu_mutex_unlock(&q->lock);
-    }
-}
-
 static bool nvme_register_buf(BlockDriverState *bs, void *host, size_t size,
                               Error **errp)
 {
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_nvme = {
     .bdrv_detach_aio_context  = nvme_detach_aio_context,
     .bdrv_attach_aio_context  = nvme_attach_aio_context,
 
-    .bdrv_co_io_plug          = nvme_co_io_plug,
-    .bdrv_co_io_unplug        = nvme_co_io_unplug,
-
     .bdrv_register_buf        = nvme_register_buf,
     .bdrv_unregister_buf      = nvme_unregister_buf,
 };
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ nvme_kick(void *s, unsigned q_index) "s %p q #%u"
 nvme_dma_flush_queue_wait(void *s) "s %p"
 nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
 nvme_process_completion(void *s, unsigned q_index, int inflight) "s %p q #%u inflight %d"
-nvme_process_completion_queue_plugged(void *s, unsigned q_index) "s %p q #%u"
 nvme_complete_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
 nvme_submit_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
 nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
-- 
2.40.1

Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

diff --git a/block/blkio.c b/block/blkio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/error-report.h"
 #include "qapi/qmp/qdict.h"
 #include "qemu/module.h"
+#include "sysemu/block-backend.h"
 #include "exec/memory.h" /* for ram_block_discard_disable() */
 
 #include "block/block-io.h"
@@ -XXX,XX +XXX,XX @@ static void blkio_detach_aio_context(BlockDriverState *bs)
                        NULL, NULL, NULL);
 }
 
-/* Call with s->blkio_lock held to submit I/O after enqueuing a new request */
-static void blkio_submit_io(BlockDriverState *bs)
+/*
+ * Called by blk_io_unplug() or immediately if not plugged. Called without
+ * blkio_lock.
+ */
+static void blkio_unplug_fn(void *opaque)
 {
-    if (qatomic_read(&bs->io_plugged) == 0) {
-        BDRVBlkioState *s = bs->opaque;
+    BDRVBlkioState *s = opaque;
 
+    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
         blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
     }
 }
 
+/*
+ * Schedule I/O submission after enqueuing a new request. Called without
+ * blkio_lock.
+ */
+static void blkio_submit_io(BlockDriverState *bs)
+{
+    BDRVBlkioState *s = bs->opaque;
+
+    blk_io_plug_call(blkio_unplug_fn, s);
+}
+
 static int coroutine_fn
 blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
 {
@@ -XXX,XX +XXX,XX @@ blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
 
     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
         blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
-        blkio_submit_io(bs);
     }
 
+    blkio_submit_io(bs);
     qemu_coroutine_yield();
     return cod.ret;
 }
@@ -XXX,XX +XXX,XX @@ blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
 
     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
         blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
-        blkio_submit_io(bs);
     }
 
+    blkio_submit_io(bs);
     qemu_coroutine_yield();
 
     if (use_bounce_buffer) {
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
 
     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
         blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
-        blkio_submit_io(bs);
     }
 
+    blkio_submit_io(bs);
     qemu_coroutine_yield();
 
     if (use_bounce_buffer) {
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
 
     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
         blkioq_flush(s->blkioq, &cod, 0);
-        blkio_submit_io(bs);
     }
 
+    blkio_submit_io(bs);
     qemu_coroutine_yield();
     return cod.ret;
 }
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
 
     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
         blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
-        blkio_submit_io(bs);
     }
 
+    blkio_submit_io(bs);
     qemu_coroutine_yield();
     return cod.ret;
 }
 
-static void coroutine_fn blkio_co_io_unplug(BlockDriverState *bs)
-{
-    BDRVBlkioState *s = bs->opaque;
-
-    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
-        blkio_submit_io(bs);
-    }
-}
-
 typedef enum {
     BMRR_OK,
     BMRR_SKIP,
@@ -XXX,XX +XXX,XX @@ static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
         .bdrv_co_pwritev         = blkio_co_pwritev, \
         .bdrv_co_flush_to_disk   = blkio_co_flush, \
         .bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
-        .bdrv_co_io_unplug       = blkio_co_io_unplug, \
         .bdrv_refresh_limits     = blkio_refresh_limits, \
         .bdrv_register_buf       = blkio_register_buf, \
         .bdrv_unregister_buf     = blkio_unregister_buf, \
-- 
2.40.1

Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Message-id: 20230530180959.1108766-5-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/block/raw-aio.h |  7 -------
 block/file-posix.c      | 10 ----------
 block/io_uring.c        | 44 ++++++++++++++++-------------------------
 block/trace-events      |  5 ++---
 4 files changed, 19 insertions(+), 47 deletions(-)

diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -XXX,XX +XXX,XX @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
                                   QEMUIOVector *qiov, int type);
 void luring_detach_aio_context(LuringState *s, AioContext *old_context);
 void luring_attach_aio_context(LuringState *s, AioContext *new_context);
-
-/*
- * luring_io_plug/unplug work in the thread's current AioContext, therefore the
- * caller must ensure that they are paired in the same IOThread.
- */
-void luring_io_plug(void);
-void luring_io_unplug(void);
 #endif
 
 #ifdef _WIN32
diff --git a/block/file-posix.c b/block/file-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn raw_co_io_plug(BlockDriverState *bs)
         laio_io_plug();
     }
 #endif
-#ifdef CONFIG_LINUX_IO_URING
-    if (s->use_linux_io_uring) {
-        luring_io_plug();
-    }
-#endif
 }
 
 static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
         laio_io_unplug(s->aio_max_batch);
     }
 #endif
-#ifdef CONFIG_LINUX_IO_URING
-    if (s->use_linux_io_uring) {
-        luring_io_unplug();
-    }
-#endif
 }
 
 static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
diff --git a/block/io_uring.c b/block/io_uring.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -XXX,XX +XXX,XX @@
 #include "block/raw-aio.h"
 #include "qemu/coroutine.h"
 #include "qapi/error.h"
+#include "sysemu/block-backend.h"
 #include "trace.h"
 
 /* Only used for assertions.  */
@@ -XXX,XX +XXX,XX @@ typedef struct LuringAIOCB {
 } LuringAIOCB;
 
 typedef struct LuringQueue {
-    int plugged;
     unsigned int in_queue;
     unsigned int in_flight;
     bool blocked;
@@ -XXX,XX +XXX,XX @@ static void luring_process_completions_and_submit(LuringState *s)
 {
     luring_process_completions(s);
 
-    if (!s->io_q.plugged && s->io_q.in_queue > 0) {
+    if (s->io_q.in_queue > 0) {
         ioq_submit(s);
     }
 }
@@ -XXX,XX +XXX,XX @@ static void qemu_luring_poll_ready(void *opaque)
 static void ioq_init(LuringQueue *io_q)
 {
     QSIMPLEQ_INIT(&io_q->submit_queue);
-    io_q->plugged = 0;
     io_q->in_queue = 0;
     io_q->in_flight = 0;
     io_q->blocked = false;
 }
 
-void luring_io_plug(void)
+static void luring_unplug_fn(void *opaque)
 {
-    AioContext *ctx = qemu_get_current_aio_context();
-    LuringState *s = aio_get_linux_io_uring(ctx);
-    trace_luring_io_plug(s);
-    s->io_q.plugged++;
-}
-
-void luring_io_unplug(void)
-{
-    AioContext *ctx = qemu_get_current_aio_context();
-    LuringState *s = aio_get_linux_io_uring(ctx);
-    assert(s->io_q.plugged);
-    trace_luring_io_unplug(s, s->io_q.blocked, s->io_q.plugged,
-                           s->io_q.in_queue, s->io_q.in_flight);
-    if (--s->io_q.plugged == 0 &&
-        !s->io_q.blocked && s->io_q.in_queue > 0) {
+    LuringState *s = opaque;
+    trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue,
+                           s->io_q.in_flight);
+    if (!s->io_q.blocked && s->io_q.in_queue > 0) {
         ioq_submit(s);
     }
 }
@@ -XXX,XX +XXX,XX @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
 
     QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
     s->io_q.in_queue++;
-    trace_luring_do_submit(s, s->io_q.blocked, s->io_q.plugged,
-                           s->io_q.in_queue, s->io_q.in_flight);
-    if (!s->io_q.blocked &&
-        (!s->io_q.plugged ||
-         s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES)) {
-        ret = ioq_submit(s);
-        trace_luring_do_submit_done(s, ret);
-        return ret;
+    trace_luring_do_submit(s, s->io_q.blocked, s->io_q.in_queue,
+                           s->io_q.in_flight);
+    if (!s->io_q.blocked) {
+        if (s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES) {
+            ret = ioq_submit(s);
+            trace_luring_do_submit_done(s, ret);
+            return ret;
+        }
+
+        blk_io_plug_call(luring_unplug_fn, s);
     }
     return 0;
 }
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "
 # io_uring.c
 luring_init_state(void *s, size_t size) "s %p size %zu"
 luring_cleanup_state(void *s) "%p freed"
-luring_io_plug(void *s) "LuringState %p plug"
-luring_io_unplug(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blocked %d plugged %d queued %d inflight %d"
-luring_do_submit(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blocked %d plugged %d queued %d inflight %d"
+luring_unplug_fn(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
+luring_do_submit(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
 luring_do_submit_done(void *s, int ret) "LuringState %p submitted to kernel %d"
 luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, size_t nbytes, int type) "bs %p s %p luringcb %p fd %d offset %" PRId64 " nbytes %zd type %d"
 luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d"
-- 
2.40.1

Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

Note that a dev_max_batch check is dropped in laio_io_unplug() because
the semantics of unplug_fn() are different from .bdrv_co_io_unplug():
1. unplug_fn() is only called when the last blk_io_unplug() call occurs,
   not every time blk_io_unplug() is called.
2. unplug_fn() is per-thread, not per-BlockDriverState, so there is no
   way to get per-BlockDriverState fields like dev_max_batch.

Therefore this condition cannot be moved to laio_unplug_fn(). It is not
obvious that this condition affects performance in practice, so I am
removing it instead of trying to come up with a more complex mechanism
to preserve the condition.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Message-id: 20230530180959.1108766-6-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/block/raw-aio.h |  7 -------
 block/file-posix.c      | 28 ----------------------------
 block/linux-aio.c       | 41 +++++++++++------------------------------
 3 files changed, 11 insertions(+), 65 deletions(-)

diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -XXX,XX +XXX,XX @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
 
 void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
 void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
-
-/*
- * laio_io_plug/unplug work in the thread's current AioContext, therefore the
- * caller must ensure that they are paired in the same IOThread.
- */
-void laio_io_plug(void);
-void laio_io_unplug(uint64_t dev_max_batch);
 #endif
 /* io_uring.c - Linux io_uring implementation */
 #ifdef CONFIG_LINUX_IO_URING
diff --git a/block/file-posix.c b/block/file-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
 }
 
-static void coroutine_fn raw_co_io_plug(BlockDriverState *bs)
-{
-    BDRVRawState __attribute__((unused)) *s = bs->opaque;
-#ifdef CONFIG_LINUX_AIO
-    if (s->use_linux_aio) {
-        laio_io_plug();
-    }
-#endif
-}
-
-static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
-{
-    BDRVRawState __attribute__((unused)) *s = bs->opaque;
-#ifdef CONFIG_LINUX_AIO
-    if (s->use_linux_aio) {
-        laio_io_unplug(s->aio_max_batch);
-    }
-#endif
-}
-
 static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
 {
     BDRVRawState *s = bs->opaque;
@@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_file = {
     .bdrv_co_copy_range_from = raw_co_copy_range_from,
     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
     .bdrv_refresh_limits = raw_refresh_limits,
-    .bdrv_co_io_plug        = raw_co_io_plug,
-    .bdrv_co_io_unplug      = raw_co_io_unplug,
     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
     .bdrv_co_truncate                   = raw_co_truncate,
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_device = {
     .bdrv_co_copy_range_from = raw_co_copy_range_from,
     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
     .bdrv_refresh_limits = raw_refresh_limits,
-    .bdrv_co_io_plug        = raw_co_io_plug,
-    .bdrv_co_io_unplug      = raw_co_io_unplug,
     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
     .bdrv_co_truncate                   = raw_co_truncate,
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_cdrom = {
     .bdrv_co_pwritev        = raw_co_pwritev,
     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
     .bdrv_refresh_limits    = cdrom_refresh_limits,
-    .bdrv_co_io_plug        = raw_co_io_plug,
-    .bdrv_co_io_unplug      = raw_co_io_unplug,
     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
     .bdrv_co_truncate                   = raw_co_truncate,
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_cdrom = {
     .bdrv_co_pwritev        = raw_co_pwritev,
     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
     .bdrv_refresh_limits    = cdrom_refresh_limits,
-    .bdrv_co_io_plug        = raw_co_io_plug,
-    .bdrv_co_io_unplug      = raw_co_io_unplug,
     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
     .bdrv_co_truncate                   = raw_co_truncate,
diff --git a/block/linux-aio.c b/block/linux-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/event_notifier.h"
 #include "qemu/coroutine.h"
 #include "qapi/error.h"
+#include "sysemu/block-backend.h"
 
 /* Only used for assertions.  */
 #include "qemu/coroutine_int.h"
@@ -XXX,XX +XXX,XX @@ struct qemu_laiocb {
 };
 
 typedef struct {
-    int plugged;
     unsigned int in_queue;
     unsigned int in_flight;
     bool blocked;
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
 {
     qemu_laio_process_completions(s);
 
-    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
+    if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
         ioq_submit(s);
     }
 }
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_poll_ready(EventNotifier *opaque)
 static void ioq_init(LaioQueue *io_q)
 {
     QSIMPLEQ_INIT(&io_q->pending);
-    io_q->plugged = 0;
     io_q->in_queue = 0;
     io_q->in_flight = 0;
     io_q->blocked = false;
@@ -XXX,XX +XXX,XX @@ static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
     return max_batch;
 }
 
-void laio_io_plug(void)
+static void laio_unplug_fn(void *opaque)
 {
-    AioContext *ctx = qemu_get_current_aio_context();
-    LinuxAioState *s = aio_get_linux_aio(ctx);
+    LinuxAioState *s = opaque;
 
-    s->io_q.plugged++;
-}
-
-void laio_io_unplug(uint64_t dev_max_batch)
-{
-    AioContext *ctx = qemu_get_current_aio_context();
-    LinuxAioState *s = aio_get_linux_aio(ctx);
-
-    assert(s->io_q.plugged);
-    s->io_q.plugged--;
-
-    /*
-     * Why max batch checking is performed here:
-     * Another BDS may have queued requests with a higher dev_max_batch and
-     * therefore in_queue could now exceed our dev_max_batch. Re-check the max
-     * batch so we can honor our device's dev_max_batch.
-     */
-    if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||
-        (!s->io_q.plugged &&
-         !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) {
+    if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
         ioq_submit(s);
     }
 }
@@ -XXX,XX +XXX,XX @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
 
     QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
     s->io_q.in_queue++;
-    if (!s->io_q.blocked &&
-        (!s->io_q.plugged ||
-         s->io_q.in_queue >= laio_max_batch(s, dev_max_batch))) {
-        ioq_submit(s);
+    if (!s->io_q.blocked) {
+        if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) {
+            ioq_submit(s);
+        } else {
+            blk_io_plug_call(laio_unplug_fn, s);
+        }
     }
 
     return 0;
-- 
2.40.1

No block driver implements .bdrv_co_io_plug() anymore. Get rid of the
function pointers.

diff --git a/include/block/block-io.h b/include/block/block-io.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -XXX,XX +XXX,XX @@ void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx);
 
 AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c);
 
-void coroutine_fn GRAPH_RDLOCK bdrv_co_io_plug(BlockDriverState *bs);
-void coroutine_fn GRAPH_RDLOCK bdrv_co_io_unplug(BlockDriverState *bs);
-
 bool coroutine_fn GRAPH_RDLOCK
 bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
                                    uint32_t granularity, Error **errp);
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
     void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_debug_event)(
         BlockDriverState *bs, BlkdebugEvent event);
 
-    /* io queue for linux-aio */
-    void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_io_plug)(BlockDriverState *bs);
-    void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_io_unplug)(
-        BlockDriverState *bs);
-
     bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs);
 
     bool coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_can_store_new_dirty_bitmap)(
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
     unsigned int in_flight;
     unsigned int serialising_in_flight;
 
-    /*
-     * counter for nested bdrv_io_plug.
-     * Accessed with atomic ops.
-     */
-    unsigned io_plugged;
-
     /* do we need to tell the quest if we have a volatile write cache? */
     int enable_write_cache;
 
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
     return mem;
 }
 
-void coroutine_fn bdrv_co_io_plug(BlockDriverState *bs)
-{
-    BdrvChild *child;
-    IO_CODE();
-    assert_bdrv_graph_readable();
-
-    QLIST_FOREACH(child, &bs->children, next) {
-        bdrv_co_io_plug(child->bs);
-    }
-
-    if (qatomic_fetch_inc(&bs->io_plugged) == 0) {
-        BlockDriver *drv = bs->drv;
-        if (drv && drv->bdrv_co_io_plug) {
-            drv->bdrv_co_io_plug(bs);
-        }
-    }
-}
-
-void coroutine_fn bdrv_co_io_unplug(BlockDriverState *bs)
-{
-    BdrvChild *child;
-    IO_CODE();
-    assert_bdrv_graph_readable();
-
-    assert(bs->io_plugged);
-    if (qatomic_fetch_dec(&bs->io_plugged) == 1) {
-        BlockDriver *drv = bs->drv;
-        if (drv && drv->bdrv_co_io_unplug) {
-            drv->bdrv_co_io_unplug(bs);
-        }
-    }
-
-    QLIST_FOREACH(child, &bs->children, next) {
-        bdrv_co_io_unplug(child->bs);
-    }
-}
-
 /* Helper that undoes bdrv_register_buf() when it fails partway through */
 static void GRAPH_RDLOCK
 bdrv_register_buf_rollback(BlockDriverState *bs, void *host, size_t size,
-- 
2.40.1

From: Stefano Garzarella <sgarzare@redhat.com>

Some virtio-blk drivers (e.g. virtio-blk-vhost-vdpa) support fd
passing. Let's expose this to the user, so the management layer
can pass the file descriptor of an already opened path.

If the libblkio virtio-blk driver supports fd passing, let's always
use qemu_open() to open the `path`, so we can handle fd passing
from the management layer through the "/dev/fdset/N" special path.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-id: 20230530071941.8954-2-sgarzare@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/blkio.c | 53 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 44 insertions(+), 9 deletions(-)

diff --git a/block/blkio.c b/block/blkio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -XXX,XX +XXX,XX @@ static int blkio_virtio_blk_common_open(BlockDriverState *bs,
 {
     const char *path = qdict_get_try_str(options, "path");
     BDRVBlkioState *s = bs->opaque;
-    int ret;
+    bool fd_supported = false;
+    int fd, ret;
 
     if (!path) {
         error_setg(errp, "missing 'path' option");
         return -EINVAL;
     }
 
-    ret = blkio_set_str(s->blkio, "path", path);
-    qdict_del(options, "path");
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "failed to set path: %s",
-                         blkio_get_error_msg());
-        return ret;
-    }
-
     if (!(flags & BDRV_O_NOCACHE)) {
         error_setg(errp, "cache.direct=off is not supported");
         return -EINVAL;
     }
+
+    if (blkio_get_int(s->blkio, "fd", &fd) == 0) {
+        fd_supported = true;
+    }
+
+    /*
+     * If the libblkio driver supports fd passing, let's always use qemu_open()
+     * to open the `path`, so we can handle fd passing from the management
+     * layer through the "/dev/fdset/N" special path.
+     */
+    if (fd_supported) {
+        int open_flags;
+
+        if (flags & BDRV_O_RDWR) {
+            open_flags = O_RDWR;
+        } else {
+            open_flags = O_RDONLY;
+        }
+
+        fd = qemu_open(path, open_flags, errp);
+        if (fd < 0) {
+            return -EINVAL;
+        }
+
+        ret = blkio_set_int(s->blkio, "fd", fd);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "failed to set fd: %s",
+                             blkio_get_error_msg());
+            qemu_close(fd);
+            return ret;
+        }
+    } else {
+        ret = blkio_set_str(s->blkio, "path", path);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "failed to set path: %s",
+                             blkio_get_error_msg());
+            return ret;
+        }
+    }
+
+    qdict_del(options, "path");
+
     return 0;
 }
 
-- 
2.40.1

From: Stefano Garzarella <sgarzare@redhat.com>

The virtio-blk-vhost-vdpa driver in libblkio 1.3.0 supports fd
passing through the new 'fd' property.

Since we now use qemu_open() on '@path' if the virtio-blk driver
supports fd passing, let's announce it.
In this way, the management layer can pass the file descriptor of an
already opened vhost-vdpa character device. This is useful especially
when the device can only be accessed with certain privileges.

Add the '@fdset' feature only when the virtio-blk-vhost-vdpa driver
in libblkio supports it.

Suggested-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-id: 20230530071941.8954-3-sgarzare@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 qapi/block-core.json | 6 ++++++
 meson.build          | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
 #
 # @path: path to the vhost-vdpa character device.
 #
+# Features:
+# @fdset: Member @path supports the special "/dev/fdset/N" path
+#     (since 8.1)
+#
 # Since: 7.2
 ##
 { 'struct': 'BlockdevOptionsVirtioBlkVhostVdpa',
   'data': { 'path': 'str' },
+  'features': [ { 'name' :'fdset',
+                  'if': 'CONFIG_BLKIO_VHOST_VDPA_FD' } ],
   'if': 'CONFIG_BLKIO' }
 
 ##
diff --git a/meson.build b/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/meson.build
+++ b/meson.build
@@ -XXX,XX +XXX,XX @@ config_host_data.set('CONFIG_LZO', lzo.found())
 config_host_data.set('CONFIG_MPATH', mpathpersist.found())
 config_host_data.set('CONFIG_MPATH_NEW_API', mpathpersist_new_api)
 config_host_data.set('CONFIG_BLKIO', blkio.found())
+if blkio.found()
+  config_host_data.set('CONFIG_BLKIO_VHOST_VDPA_FD',
+                       blkio.version().version_compare('>=1.3.0'))
+endif
 config_host_data.set('CONFIG_CURL', curl.found())
 config_host_data.set('CONFIG_CURSES', curses.found())
 config_host_data.set('CONFIG_GBM', gbm.found())
-- 
2.40.1