--- old cover letter (2020-04-09 pull request)
+++ new cover letter (2023-07-12 pull request)
-The following changes since commit 8bac3ba57eecc466b7e73dabf7d19328a59f684e:
+The following changes since commit 887cba855bb6ff4775256f7968409281350b568c:
 
-  Merge remote-tracking branch 'remotes/rth/tags/pull-rx-20200408' into staging (2020-04-09 13:23:30 +0100)
+  configure: Fix cross-building for RISCV host (v5) (2023-07-11 17:56:09 +0100)
 
 are available in the Git repository at:
 
-  https://github.com/stefanha/qemu.git tags/block-pull-request
+  https://gitlab.com/stefanha/qemu.git tags/block-pull-request
 
-for you to fetch changes up to 5710a3e09f9b85801e5ce70797a4a511e5fc9e2c:
+for you to fetch changes up to 75dcb4d790bbe5327169fd72b185960ca58e2fa6:
 
-  async: use explicit memory barriers (2020-04-09 16:17:14 +0100)
+  virtio-blk: fix host notifier issues during dataplane start/stop (2023-07-12 15:20:32 -0400)
 
 ----------------------------------------------------------------
 Pull request
 
-Fixes for QEMU on aarch64 ARM hosts and fdmon-io_uring.
-
 ----------------------------------------------------------------
 
-Paolo Bonzini (2):
-      aio-wait: delegate polling of main AioContext if BQL not held
-      async: use explicit memory barriers
-
-Stefan Hajnoczi (1):
-      aio-posix: signal-proof fdmon-io_uring
-
- include/block/aio-wait.h | 22 ++++++++++++++++++++++
- include/block/aio.h      | 29 ++++++++++-------------------
- util/aio-posix.c         | 16 ++++++++++++++--
- util/aio-win32.c         | 17 ++++++++++++++---
- util/async.c             | 16 ++++++++++++----
- util/fdmon-io_uring.c    | 10 ++++++++--
- 6 files changed, 80 insertions(+), 30 deletions(-)
+Stefan Hajnoczi (1):
+      virtio-blk: fix host notifier issues during dataplane start/stop
+
+ hw/block/dataplane/virtio-blk.c | 67 +++++++++++++++++++--------------
+ 1 file changed, 38 insertions(+), 29 deletions(-)
 
 --
-2.25.1
+2.40.1
Deleted patch

The io_uring_enter(2) syscall returns with errno=EINTR when interrupted
by a signal. Retry the syscall in this case.

It's essential to do this in the io_uring_submit_and_wait() case. My
interpretation of the Linux v5.5 io_uring_enter(2) code is that it
shouldn't affect the io_uring_submit() case, but there is no guarantee
this will always be the case. Let's check for -EINTR around both APIs.

Note that the liburing APIs have -errno return values.
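For readers outside the QEMU tree, the same retry idiom can be seen in a
minimal standalone program. This is only an illustrative sketch, not QEMU
code; it assumes stock liburing, and the ring size and nop request are
arbitrary:

    /* eintr_retry.c - toy demonstration of retrying liburing calls on -EINTR.
     * Build with: gcc eintr_retry.c -o eintr_retry -luring
     */
    #include <errno.h>
    #include <liburing.h>
    #include <stdio.h>

    int main(void)
    {
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        int ret;

        if (io_uring_queue_init(8, &ring, 0) < 0) {
            return 1;
        }

        sqe = io_uring_get_sqe(&ring); /* fresh ring, so an sqe is free */
        io_uring_prep_nop(sqe);

        /* liburing returns -errno rather than setting the errno variable,
         * so the interrupted-by-signal case shows up as ret == -EINTR. */
        do {
            ret = io_uring_submit_and_wait(&ring, 1);
        } while (ret == -EINTR);

        if (ret >= 0 && io_uring_peek_cqe(&ring, &cqe) == 0) {
            printf("nop completed: res=%d\n", cqe->res);
            io_uring_cqe_seen(&ring, cqe);
        }

        io_uring_queue_exit(&ring);
        return 0;
    }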
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20200408091139.273851-1-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 util/fdmon-io_uring.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
index XXXXXXX..XXXXXXX 100644
--- a/util/fdmon-io_uring.c
+++ b/util/fdmon-io_uring.c
@@ -XXX,XX +XXX,XX @@ static struct io_uring_sqe *get_sqe(AioContext *ctx)
     }
 
     /* No free sqes left, submit pending sqes first */
-    ret = io_uring_submit(ring);
+    do {
+        ret = io_uring_submit(ring);
+    } while (ret == -EINTR);
+
     assert(ret > 1);
     sqe = io_uring_get_sqe(ring);
     assert(sqe);
@@ -XXX,XX +XXX,XX @@ static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
 
     fill_sq_ring(ctx);
 
-    ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
+    do {
+        ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
+    } while (ret == -EINTR);
+
     assert(ret >= 0);
 
     return process_cq_ring(ctx, ready_list);
--
2.25.1
Deleted patch

From: Paolo Bonzini <pbonzini@redhat.com>

Any thread that is not an iothread returns NULL for
qemu_get_current_aio_context(). As a result, it would also return true
for in_aio_context_home_thread(qemu_get_aio_context()), causing
AIO_WAIT_WHILE to invoke aio_poll() directly. This is incorrect if the
BQL is not held, because aio_poll() does not expect to run concurrently
from multiple threads, and it can actually happen when savevm writes to
the vmstate file from the migration thread.

Therefore, restrict in_aio_context_home_thread to return true for the
main AioContext only if the BQL is held.

The function is moved to aio-wait.h because it is mostly used there and
to avoid a circular reference between main-loop.h and block/aio.h.
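The rule being enforced can be pictured outside QEMU with a toy pthreads
sketch (all names here are hypothetical, only the pthread APIs are real):
a context counts as "ours" if we are its home thread, or if it is the main
context and we currently hold the global lock; any other thread must wait
for the home thread instead of calling the poll function itself.

    /* Build with: gcc home_thread.c -o home_thread -pthread */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    typedef struct {
        pthread_t home_thread; /* the thread that normally runs this context */
        bool is_main;          /* true for the main-loop context */
    } ToyContext;

    /* Toy stand-in for the BQL; real code tracks ownership per thread. */
    static _Thread_local bool big_lock_held;

    /* Mirrors the patched logic: the main context counts as "home" only
     * while the big lock is held, so an unlocked thread must delegate
     * rather than poll concurrently. */
    static bool in_context_home_thread(ToyContext *ctx)
    {
        if (pthread_equal(ctx->home_thread, pthread_self())) {
            return true;
        }
        return ctx->is_main && big_lock_held;
    }

    static void *migration_thread(void *opaque)
    {
        ToyContext *ctx = opaque;
        printf("other thread, no lock:   %d\n", in_context_home_thread(ctx)); /* 0 */
        big_lock_held = true; /* pretend we acquired the big lock */
        printf("other thread, with lock: %d\n", in_context_home_thread(ctx)); /* 1 */
        return NULL;
    }

    int main(void)
    {
        ToyContext main_ctx = { .home_thread = pthread_self(), .is_main = true };
        pthread_t t;

        pthread_create(&t, NULL, migration_thread, &main_ctx);
        pthread_join(t, NULL);
        return 0;
    }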
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20200407140746.8041-5-pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/block/aio-wait.h | 22 ++++++++++++++++++++++
 include/block/aio.h      | 29 ++++++++++-------------------
 2 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio-wait.h
+++ b/include/block/aio-wait.h
@@ -XXX,XX +XXX,XX @@
 #define QEMU_AIO_WAIT_H
 
 #include "block/aio.h"
+#include "qemu/main-loop.h"
 
 /**
  * AioWait:
@@ -XXX,XX +XXX,XX @@ void aio_wait_kick(void);
  */
 void aio_wait_bh_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque);
 
+/**
+ * in_aio_context_home_thread:
+ * @ctx: the aio context
+ *
+ * Return whether we are running in the thread that normally runs @ctx. Note
+ * that acquiring/releasing ctx does not affect the outcome, each AioContext
+ * still only has one home thread that is responsible for running it.
+ */
+static inline bool in_aio_context_home_thread(AioContext *ctx)
+{
+    if (ctx == qemu_get_current_aio_context()) {
+        return true;
+    }
+
+    if (ctx == qemu_get_aio_context()) {
+        return qemu_mutex_iothread_locked();
+    } else {
+        return false;
+    }
+}
+
 #endif /* QEMU_AIO_WAIT_H */
diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ struct AioContext {
     AioHandlerList deleted_aio_handlers;
 
     /* Used to avoid unnecessary event_notifier_set calls in aio_notify;
-     * accessed with atomic primitives. If this field is 0, everything
-     * (file descriptors, bottom halves, timers) will be re-evaluated
-     * before the next blocking poll(), thus the event_notifier_set call
-     * can be skipped. If it is non-zero, you may need to wake up a
-     * concurrent aio_poll or the glib main event loop, making
-     * event_notifier_set necessary.
+     * only written from the AioContext home thread, or under the BQL in
+     * the case of the main AioContext. However, it is read from any
+     * thread so it is still accessed with atomic primitives.
+     *
+     * If this field is 0, everything (file descriptors, bottom halves,
+     * timers) will be re-evaluated before the next blocking poll() or
+     * io_uring wait; therefore, the event_notifier_set call can be
+     * skipped. If it is non-zero, you may need to wake up a concurrent
+     * aio_poll or the glib main event loop, making event_notifier_set
+     * necessary.
      *
      * Bit 0 is reserved for GSource usage of the AioContext, and is 1
      * between a call to aio_ctx_prepare and the next call to aio_ctx_check.
@@ -XXX,XX +XXX,XX @@ void aio_co_enter(AioContext *ctx, struct Coroutine *co);
  */
 AioContext *qemu_get_current_aio_context(void);
 
-/**
- * in_aio_context_home_thread:
- * @ctx: the aio context
- *
- * Return whether we are running in the thread that normally runs @ctx. Note
- * that acquiring/releasing ctx does not affect the outcome, each AioContext
- * still only has one home thread that is responsible for running it.
- */
-static inline bool in_aio_context_home_thread(AioContext *ctx)
-{
-    return ctx == qemu_get_current_aio_context();
-}
-
 /**
  * aio_context_setup:
  * @ctx: the aio context
--
2.25.1
Deleted patch

From: Paolo Bonzini <pbonzini@redhat.com>

When using C11 atomics, non-seqcst reads and writes do not participate
in the total order of seqcst operations. In util/async.c and
util/aio-posix.c, in particular, the pattern that we use

          write ctx->notify_me                 write bh->scheduled
          read bh->scheduled                   read ctx->notify_me
          if !bh->scheduled, sleep             if ctx->notify_me, notify

needs to use seqcst operations for both the write and the read. In
general this is something that we do not want, because there can be
many sources that are polled in addition to bottom halves. The
alternative is to place a seqcst memory barrier between the write
and the read. This also comes with a disadvantage, in that the
memory barrier is implicit on strongly-ordered architectures and
it wastes a few dozen clock cycles.

Fortunately, ctx->notify_me is never written concurrently by two
threads, so we can assert that and relax the writes to ctx->notify_me.
The resulting solution works and performs well on both aarch64 and x86.

Note that the atomic_set/atomic_read combination is not an atomic
read-modify-write, and therefore it is even weaker than C11
ATOMIC_RELAXED; on x86, ATOMIC_RELAXED compiles to a locked operation.
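The lost-wakeup hazard in the pattern above can be reproduced in a
standalone program. The following toy (not QEMU code; plain C11 atomics
and pthreads, all names hypothetical) mirrors the barrier placement of
this patch: each thread makes a relaxed write, executes a full fence,
then reads the other thread's flag. With both fences in place at least
one thread is guaranteed to observe the other's write, so the wakeup
cannot be lost; drop either fence and both loads may return 0.

    /* Build with: gcc dekker.c -o dekker -pthread */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int notify_me; /* poller's flag: "I am about to sleep" */
    static atomic_int scheduled; /* notifier's flag: "work is pending" */

    static void *poller(void *arg)
    {
        (void)arg;
        /* Relaxed is enough for the store: only this thread writes notify_me. */
        atomic_store_explicit(&notify_me, 1, memory_order_relaxed);
        /* Full barrier: publish notify_me before reading scheduled.
         * Pairs with the fence in notifier(). */
        atomic_thread_fence(memory_order_seq_cst);
        if (!atomic_load_explicit(&scheduled, memory_order_relaxed)) {
            puts("poller: no work seen, would block in poll()");
        }
        return NULL;
    }

    static void *notifier(void *arg)
    {
        (void)arg;
        atomic_store_explicit(&scheduled, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_load_explicit(&notify_me, memory_order_relaxed)) {
            puts("notifier: poller may be sleeping, would wake it");
        }
        return NULL;
    }

    int main(void)
    {
        pthread_t a, b;

        pthread_create(&a, NULL, poller, NULL);
        pthread_create(&b, NULL, notifier, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
    }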
Analyzed-by: Ying Fang <fangying1@huawei.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Tested-by: Ying Fang <fangying1@huawei.com>
Message-Id: <20200407140746.8041-6-pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 util/aio-posix.c | 16 ++++++++++++++--
 util/aio-win32.c | 17 ++++++++++++++---
 util/async.c     | 16 ++++++++++++----
 3 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
     int64_t timeout;
     int64_t start = 0;
 
+    /*
+     * There cannot be two concurrent aio_poll calls for the same AioContext (or
+     * an aio_poll concurrent with a GSource prepare/check/dispatch callback).
+     * We rely on this below to avoid slow locked accesses to ctx->notify_me.
+     */
     assert(in_aio_context_home_thread(ctx));
 
     /* aio_notify can avoid the expensive event_notifier_set if
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
      * so disable the optimization now.
      */
     if (blocking) {
-        atomic_add(&ctx->notify_me, 2);
+        atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2);
+        /*
+         * Write ctx->notify_me before computing the timeout
+         * (reading bottom half flags, etc.). Pairs with
+         * smp_mb in aio_notify().
+         */
+        smp_mb();
    }
 
    qemu_lockcnt_inc(&ctx->list_lock);
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
    }
 
    if (blocking) {
-        atomic_sub(&ctx->notify_me, 2);
+        /* Finish the poll before clearing the flag. */
+        atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2);
        aio_notify_accept(ctx);
    }
 
diff --git a/util/aio-win32.c b/util/aio-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
    int count;
    int timeout;
 
+    /*
+     * There cannot be two concurrent aio_poll calls for the same AioContext (or
+     * an aio_poll concurrent with a GSource prepare/check/dispatch callback).
+     * We rely on this below to avoid slow locked accesses to ctx->notify_me.
+     */
+    assert(in_aio_context_home_thread(ctx));
    progress = false;
 
    /* aio_notify can avoid the expensive event_notifier_set if
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
     * so disable the optimization now.
     */
    if (blocking) {
-        atomic_add(&ctx->notify_me, 2);
+        atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2);
+        /*
+         * Write ctx->notify_me before computing the timeout
+         * (reading bottom half flags, etc.). Pairs with
+         * smp_mb in aio_notify().
+         */
+        smp_mb();
    }
 
    qemu_lockcnt_inc(&ctx->list_lock);
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
        ret = WaitForMultipleObjects(count, events, FALSE, timeout);
        if (blocking) {
            assert(first);
-            assert(in_aio_context_home_thread(ctx));
-            atomic_sub(&ctx->notify_me, 2);
+            atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2);
            aio_notify_accept(ctx);
        }
 
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ aio_ctx_prepare(GSource *source, gint *timeout)
 {
     AioContext *ctx = (AioContext *) source;
 
-    atomic_or(&ctx->notify_me, 1);
+    atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) | 1);
+
+    /*
+     * Write ctx->notify_me before computing the timeout
+     * (reading bottom half flags, etc.). Pairs with
+     * smp_mb in aio_notify().
+     */
+    smp_mb();
 
     /* We assume there is no timeout already supplied */
     *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx));
@@ -XXX,XX +XXX,XX @@ aio_ctx_check(GSource *source)
     QEMUBH *bh;
     BHListSlice *s;
 
-    atomic_and(&ctx->notify_me, ~1);
+    /* Finish computing the timeout before clearing the flag. */
+    atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) & ~1);
     aio_notify_accept(ctx);
 
     QSLIST_FOREACH_RCU(bh, &ctx->bh_list, next) {
@@ -XXX,XX +XXX,XX @@ LuringState *aio_get_linux_io_uring(AioContext *ctx)
 void aio_notify(AioContext *ctx)
 {
     /* Write e.g. bh->scheduled before reading ctx->notify_me. Pairs
-     * with atomic_or in aio_ctx_prepare or atomic_add in aio_poll.
+     * with smp_mb in aio_ctx_prepare or aio_poll.
      */
     smp_mb();
-    if (ctx->notify_me) {
+    if (atomic_read(&ctx->notify_me)) {
         event_notifier_set(&ctx->notifier);
         atomic_mb_set(&ctx->notified, true);
     }
--
2.25.1
New patch

The main loop thread can consume 100% CPU when using --device
virtio-blk-pci,iothread=<iothread>. ppoll() constantly returns but
reading virtqueue host notifiers fails with EAGAIN. The file descriptors
are stale and remain registered with the AioContext because of bugs in
the virtio-blk dataplane start/stop code.

The problem is that the dataplane start/stop code involves drain
operations, which call virtio_blk_drained_begin() and
virtio_blk_drained_end() at points where the host notifier is not
operational:
- In virtio_blk_data_plane_start(), blk_set_aio_context() drains after
  vblk->dataplane_started has been set to true but the host notifier has
  not been attached yet.
- In virtio_blk_data_plane_stop(), blk_drain() and blk_set_aio_context()
  drain after the host notifier has already been detached but with
  vblk->dataplane_started still set to true.

I would like to simplify ->ioeventfd_start/stop() to avoid interactions
with drain entirely, but couldn't find a way to do that. Instead, this
patch accepts the fragile nature of the code and reorders it so that
vblk->dataplane_started is false during drain operations. This way the
virtio_blk_drained_begin() and virtio_blk_drained_end() calls don't
touch the host notifier. The result is that
virtio_blk_data_plane_start() and virtio_blk_data_plane_stop() have
complete control over the host notifier, and stale file descriptors are
no longer left in the AioContext.

This patch fixes the 100% CPU consumption in the main loop thread and
correctly moves host notifier processing to the IOThread.
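The ordering requirement, finish the setup and only then publish the
started flag, is the classic publish pattern. A toy C11/pthreads
illustration follows (hypothetical names, not the QEMU APIs; the patch
itself pairs smp_wmb() with aio_notify_accept() rather than with an
acquire spin loop):

    /* Build with: gcc publish.c -o publish -pthread */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static int vq_state;                  /* stands in for virtqueue/notifier setup */
    static atomic_bool dataplane_started; /* stands in for vblk->dataplane_started */

    static void *iothread_fn(void *arg)
    {
        (void)arg;
        /* Acquire load pairs with the release store in main(): once we
         * observe started == true, the setup writes are visible too. */
        while (!atomic_load_explicit(&dataplane_started, memory_order_acquire)) {
            /* spin; a real IOThread would block in its event loop */
        }
        printf("iothread sees vq_state=%d\n", vq_state);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, iothread_fn, NULL);

        vq_state = 42; /* do all the setup first */
        /* Publish only after setup; analogous to smp_wmb() + plain store. */
        atomic_store_explicit(&dataplane_started, true, memory_order_release);

        pthread_join(t, NULL);
        return 0;
    }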
Fixes: 1665d9326fd2 ("virtio-blk: implement BlockDevOps->drained_begin()")
Reported-by: Lukáš Doktor <ldoktor@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Lukas Doktor <ldoktor@redhat.com>
Message-id: 20230704151527.193586-1-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 hw/block/dataplane/virtio-blk.c | 67 +++++++++++++++++++--------------
 1 file changed, 38 insertions(+), 29 deletions(-)

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
 
     memory_region_transaction_commit();
 
-    /*
-     * These fields are visible to the IOThread so we rely on implicit barriers
-     * in aio_context_acquire() on the write side and aio_notify_accept() on
-     * the read side.
-     */
-    s->starting = false;
-    vblk->dataplane_started = true;
     trace_virtio_blk_data_plane_start(s);
 
     old_context = blk_get_aio_context(s->conf->conf.blk);
@@ -XXX,XX +XXX,XX @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
         event_notifier_set(virtio_queue_get_host_notifier(vq));
     }
 
+    /*
+     * These fields must be visible to the IOThread when it processes the
+     * virtqueue, otherwise it will think dataplane has not started yet.
+     *
+     * Make sure ->dataplane_started is false when blk_set_aio_context() is
+     * called above so that draining does not cause the host notifier to be
+     * detached/attached prematurely.
+     */
+    s->starting = false;
+    vblk->dataplane_started = true;
+    smp_wmb(); /* paired with aio_notify_accept() on the read side */
+
     /* Get this show started by hooking up our callbacks */
     if (!blk_in_drain(s->conf->conf.blk)) {
         aio_context_acquire(s->ctx);
@@ -XXX,XX +XXX,XX @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
 fail_guest_notifiers:
     vblk->dataplane_disabled = true;
     s->starting = false;
-    vblk->dataplane_started = true;
     return -ENOSYS;
 }
 
@@ -XXX,XX +XXX,XX @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev)
         aio_wait_bh_oneshot(s->ctx, virtio_blk_data_plane_stop_bh, s);
     }
 
+    /*
+     * Batch all the host notifiers in a single transaction to avoid
+     * quadratic time complexity in address_space_update_ioeventfds().
+     */
+    memory_region_transaction_begin();
+
+    for (i = 0; i < nvqs; i++) {
+        virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
+    }
+
+    /*
+     * The transaction expects the ioeventfds to be open when it
+     * commits. Do it now, before the cleanup loop.
+     */
+    memory_region_transaction_commit();
+
+    for (i = 0; i < nvqs; i++) {
+        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i);
+    }
+
+    /*
+     * Set ->dataplane_started to false before draining so that host notifiers
+     * are not detached/attached anymore.
+     */
+    vblk->dataplane_started = false;
+
     aio_context_acquire(s->ctx);
 
     /* Wait for virtio_blk_dma_restart_bh() and in flight I/O to complete */
@@ -XXX,XX +XXX,XX @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev)
 
     aio_context_release(s->ctx);
 
-    /*
-     * Batch all the host notifiers in a single transaction to avoid
-     * quadratic time complexity in address_space_update_ioeventfds().
-     */
-    memory_region_transaction_begin();
-
-    for (i = 0; i < nvqs; i++) {
-        virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
-    }
-
-    /*
-     * The transaction expects the ioeventfds to be open when it
-     * commits. Do it now, before the cleanup loop.
-     */
-    memory_region_transaction_commit();
-
-    for (i = 0; i < nvqs; i++) {
-        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i);
-    }
-
     qemu_bh_cancel(s->bh);
     notify_guest_bh(s); /* final chance to notify guest */
 
     /* Clean up guest notifier (irq) */
     k->set_guest_notifiers(qbus->parent, nvqs, false);
 
-    vblk->dataplane_started = false;
     s->stopping = false;
 }
--
2.40.1