The following changes since commit 887cba855bb6ff4775256f7968409281350b568c:

  configure: Fix cross-building for RISCV host (v5) (2023-07-11 17:56:09 +0100)

are available in the Git repository at:

  https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to 75dcb4d790bbe5327169fd72b185960ca58e2fa6:

  virtio-blk: fix host notifier issues during dataplane start/stop (2023-07-12 15:20:32 -0400)

----------------------------------------------------------------
Pull request

----------------------------------------------------------------

Stefan Hajnoczi (1):
  virtio-blk: fix host notifier issues during dataplane start/stop

 hw/block/dataplane/virtio-blk.c | 67 +++++++++++++++++++--------------
 1 file changed, 38 insertions(+), 29 deletions(-)

--
2.40.1
The main loop thread can consume 100% CPU when using --device
virtio-blk-pci,iothread=<iothread>. ppoll() constantly returns but
reading virtqueue host notifiers fails with EAGAIN. The file descriptors
are stale and remain registered with the AioContext because of bugs in
the virtio-blk dataplane start/stop code.

The problem is that the dataplane start/stop code involves drain
operations, which call virtio_blk_drained_begin() and
virtio_blk_drained_end() at points where the host notifier is not
operational (see the sketch after this list):
- In virtio_blk_data_plane_start(), blk_set_aio_context() drains after
  vblk->dataplane_started has been set to true but the host notifier has
  not been attached yet.
- In virtio_blk_data_plane_stop(), blk_drain() and blk_set_aio_context()
  drain after the host notifier has already been detached but with
  vblk->dataplane_started still set to true.

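As a condensed sketch of the two buggy orderings (this is not the actual
QEMU source; attach_host_notifiers()/detach_host_notifiers() and the
simplified argument lists are hypothetical stand-ins for the
virtio_bus_set_host_notifier() loops):

    /* virtio_blk_data_plane_start(), before this patch (sketch) */
    vblk->dataplane_started = true;           /* set too early */
    blk_set_aio_context(blk, iothread_ctx, NULL);
                                              /* drains: the drain callbacks
                                               * touch a host notifier that
                                               * has not been attached yet */
    attach_host_notifiers();

    /* virtio_blk_data_plane_stop(), before this patch (sketch) */
    detach_host_notifiers();
    blk_drain(blk);                           /* drained_end() re-attaches
                                               * the notifier we just
                                               * detached */
    vblk->dataplane_started = false;          /* cleared too late */
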
I would like to simplify ->ioeventfd_start/stop() to avoid interactions
with drain entirely, but couldn't find a way to do that. Instead, this
patch accepts the fragile nature of the code and reorders it so that
vblk->dataplane_started is false during drain operations. This way the
virtio_blk_drained_begin() and virtio_blk_drained_end() calls don't
touch the host notifier. The result is that
virtio_blk_data_plane_start() and virtio_blk_data_plane_stop() have
complete control over the host notifier and stale file descriptors are
no longer left in the AioContext.

This patch fixes the 100% CPU consumption in the main loop thread and
correctly moves host notifier processing to the IOThread.

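In the same condensed form as above, the reordered code keeps
vblk->dataplane_started false whenever a drain can run (again a
hypothetical sketch, not the actual function bodies):

    /* virtio_blk_data_plane_start(), after this patch (sketch) */
    blk_set_aio_context(blk, iothread_ctx, NULL);
                                              /* drains while
                                               * dataplane_started is still
                                               * false, so the drain
                                               * callbacks do nothing */
    attach_host_notifiers();
    vblk->dataplane_started = true;           /* publish only once the host
                                               * notifier is fully set up */

    /* virtio_blk_data_plane_stop(), after this patch (sketch) */
    detach_host_notifiers();
    vblk->dataplane_started = false;          /* clear before draining */
    blk_drain(blk);                           /* drain callbacks leave the
                                               * host notifier alone */
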
Fixes: 1665d9326fd2 ("virtio-blk: implement BlockDevOps->drained_begin()")
Reported-by: Lukáš Doktor <ldoktor@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Lukas Doktor <ldoktor@redhat.com>
Message-id: 20230704151527.193586-1-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 hw/block/dataplane/virtio-blk.c | 67 +++++++++++++++++++--------------
 1 file changed, 38 insertions(+), 29 deletions(-)

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)

     memory_region_transaction_commit();

-    /*
-     * These fields are visible to the IOThread so we rely on implicit barriers
-     * in aio_context_acquire() on the write side and aio_notify_accept() on
-     * the read side.
-     */
-    s->starting = false;
-    vblk->dataplane_started = true;
     trace_virtio_blk_data_plane_start(s);

     old_context = blk_get_aio_context(s->conf->conf.blk);
@@ -XXX,XX +XXX,XX @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
         event_notifier_set(virtio_queue_get_host_notifier(vq));
     }

+    /*
+     * These fields must be visible to the IOThread when it processes the
+     * virtqueue, otherwise it will think dataplane has not started yet.
+     *
+     * Make sure ->dataplane_started is false when blk_set_aio_context() is
+     * called above so that draining does not cause the host notifier to be
+     * detached/attached prematurely.
+     */
+    s->starting = false;
+    vblk->dataplane_started = true;
+    smp_wmb(); /* paired with aio_notify_accept() on the read side */
+
     /* Get this show started by hooking up our callbacks */
     if (!blk_in_drain(s->conf->conf.blk)) {
         aio_context_acquire(s->ctx);
@@ -XXX,XX +XXX,XX @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
   fail_guest_notifiers:
     vblk->dataplane_disabled = true;
     s->starting = false;
-    vblk->dataplane_started = true;
     return -ENOSYS;
 }

@@ -XXX,XX +XXX,XX @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev)
         aio_wait_bh_oneshot(s->ctx, virtio_blk_data_plane_stop_bh, s);
     }

+    /*
+     * Batch all the host notifiers in a single transaction to avoid
+     * quadratic time complexity in address_space_update_ioeventfds().
+     */
+    memory_region_transaction_begin();
+
+    for (i = 0; i < nvqs; i++) {
+        virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
+    }
+
+    /*
+     * The transaction expects the ioeventfds to be open when it
+     * commits. Do it now, before the cleanup loop.
+     */
+    memory_region_transaction_commit();
+
+    for (i = 0; i < nvqs; i++) {
+        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i);
+    }
+
+    /*
+     * Set ->dataplane_started to false before draining so that host notifiers
+     * are not detached/attached anymore.
+     */
+    vblk->dataplane_started = false;
+
     aio_context_acquire(s->ctx);

     /* Wait for virtio_blk_dma_restart_bh() and in flight I/O to complete */
@@ -XXX,XX +XXX,XX @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev)

     aio_context_release(s->ctx);

-    /*
-     * Batch all the host notifiers in a single transaction to avoid
-     * quadratic time complexity in address_space_update_ioeventfds().
-     */
-    memory_region_transaction_begin();
-
-    for (i = 0; i < nvqs; i++) {
-        virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
-    }
-
-    /*
-     * The transaction expects the ioeventfds to be open when it
-     * commits. Do it now, before the cleanup loop.
-     */
-    memory_region_transaction_commit();
-
-    for (i = 0; i < nvqs; i++) {
-        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i);
-    }
-
     qemu_bh_cancel(s->bh);
     notify_guest_bh(s); /* final chance to notify guest */

     /* Clean up guest notifier (irq) */
     k->set_guest_notifiers(qbus->parent, nvqs, false);

-    vblk->dataplane_started = false;
     s->stopping = false;
 }
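
A note on the smp_wmb() added above: it publishes s->starting and
vblk->dataplane_started to the IOThread, whose read side is the implicit
barrier in aio_notify_accept(). The following self-contained C11 analogy
of that publish/consume pattern is illustrative only; it is not QEMU
code (QEMU's smp_wmb() corresponds to the release fence, and the acquire
fence stands in for aio_notify_accept()):

    #include <stdatomic.h>
    #include <stdbool.h>

    static int payload;            /* stands in for dataplane state */
    static atomic_bool started;    /* stands in for dataplane_started */

    static void publish(void)      /* writer: main loop thread */
    {
        payload = 42;                              /* plain stores... */
        atomic_thread_fence(memory_order_release); /* ...ordered before */
        atomic_store_explicit(&started, true, memory_order_relaxed);
    }

    static bool consume(int *out)  /* reader: IOThread */
    {
        if (!atomic_load_explicit(&started, memory_order_relaxed)) {
            return false;
        }
        atomic_thread_fence(memory_order_acquire); /* pairs with the
                                                    * release fence */
        *out = payload;            /* guaranteed to observe 42 */
        return true;
    }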
--
2.40.1
