The following changes since commit 6d40ce00c1166c317e298ad82ecf10e650c4f87d:

  Update version for v6.0.0-rc1 release (2021-03-30 18:19:07 +0100)

are available in the Git repository at:

  https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to b6489ac06695e257ea0a9841364577e247fdee30:

  test-coroutine: Add rwlock downgrade test (2021-03-31 10:44:21 +0100)

----------------------------------------------------------------
Pull request

A fix for VDI image files and more generally for CoRwlock.

----------------------------------------------------------------

David Edmondson (4):
  block/vdi: When writing new bmap entry fails, don't leak the buffer
  block/vdi: Don't assume that blocks are larger than VdiHeader
  coroutine-lock: Store the coroutine in the CoWaitRecord only once
  test-coroutine: Add rwlock downgrade test

Paolo Bonzini (2):
  coroutine-lock: Reimplement CoRwlock to fix downgrade bug
  test-coroutine: Add rwlock upgrade test

 include/qemu/coroutine.h    |  17 ++--
 block/vdi.c                 |  11 ++-
 tests/unit/test-coroutine.c | 161 +++++++++++++++++++++++++++++++++++
 util/qemu-coroutine-lock.c  | 165 +++++++++++++++++++++++-------------
 4 files changed, 282 insertions(+), 72 deletions(-)

--
2.30.2

From: David Edmondson <david.edmondson@oracle.com>

If a new bitmap entry is allocated, requiring the entire block to be
written, avoid leaking the buffer allocated for the block should the
write fail.
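
A minimal, self-contained sketch of the rule being enforced (not the QEMU
code; write_block() is a hypothetical stand-in for the bdrv_pwrite() call
chain): once the block buffer is allocated, every exit path must release
it, including the failure path.

    #include <glib.h>
    #include <stdint.h>

    /* Hypothetical stand-in for the real write path; it may fail. */
    static int write_block(const uint8_t *buf, size_t len)
    {
        (void)buf;
        return len > 0 ? 0 : -1;
    }

    static int allocate_and_write(size_t block_size)
    {
        uint8_t *block = g_malloc0(block_size);
        int ret = write_block(block, block_size);

        if (ret < 0) {
            g_free(block);  /* the fix: also free on the error path */
            return ret;
        }
        g_free(block);
        return 0;
    }

    int main(void)
    {
        return allocate_and_write(512) == 0 ? 0 : 1;
    }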

Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: David Edmondson <david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Max Reitz <mreitz@redhat.com>
Message-id: 20210325112941.365238-2-pbonzini@redhat.com
Message-Id: <20210309144015.557477-2-david.edmondson@oracle.com>
Acked-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/vdi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/vdi.c b/block/vdi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -XXX,XX +XXX,XX @@ nonallocating_write:
 
     logout("finished data write\n");
     if (ret < 0) {
+        g_free(block);
         return ret;
     }
 
--
2.30.2

From: David Edmondson <david.edmondson@oracle.com>

Given that the block size is read from the header of the VDI file, a
wide variety of sizes might be seen. Rather than reusing a block-sized
memory region when writing the VDI header, allocate an appropriately
sized buffer.
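
A sketch of the sizing rule (hypothetical DiskHeader type, not the QEMU
code): the buffer passed to the header write is sized from the header
type itself, so it stays correct no matter what block size the image
declares.

    #include <glib.h>
    #include <stdint.h>
    #include <string.h>

    typedef struct {
        uint8_t bytes[512];    /* hypothetical fixed-size on-disk header */
    } DiskHeader;

    static void write_header(const DiskHeader *src)
    {
        /* Size the buffer from the type, not from the image's block size. */
        DiskHeader *header = g_malloc(sizeof(*header));

        memcpy(header, src, sizeof(*header));
        /* ... byte-swap fields and write sizeof(*header) bytes here ... */
        g_free(header);
    }

    int main(void)
    {
        DiskHeader h = { { 0 } };
        write_header(&h);
        return 0;
    }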

Signed-off-by: David Edmondson <david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Max Reitz <mreitz@redhat.com>
Message-id: 20210325112941.365238-3-pbonzini@redhat.com
Message-Id: <20210309144015.557477-3-david.edmondson@oracle.com>
Acked-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/vdi.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/block/vdi.c b/block/vdi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -XXX,XX +XXX,XX @@ nonallocating_write:
 
     if (block) {
         /* One or more new blocks were allocated. */
-        VdiHeader *header = (VdiHeader *) block;
+        VdiHeader *header;
         uint8_t *base;
         uint64_t offset;
         uint32_t n_sectors;
 
+        g_free(block);
+        header = g_malloc(sizeof(*header));
+
         logout("now writing modified header\n");
         assert(VDI_IS_ALLOCATED(bmap_first));
         *header = s->header;
         vdi_header_to_le(header);
-        ret = bdrv_pwrite(bs->file, 0, block, sizeof(VdiHeader));
-        g_free(block);
-        block = NULL;
+        ret = bdrv_pwrite(bs->file, 0, header, sizeof(*header));
+        g_free(header);
 
         if (ret < 0) {
             return ret;
--
2.30.2

From: David Edmondson <david.edmondson@oracle.com>

When taking the slow path for mutex acquisition, set the coroutine
value in the CoWaitRecord in push_waiter(), rather than both there and
in the caller.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: David Edmondson <david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 20210325112941.365238-4-pbonzini@redhat.com
Message-Id: <20210309144015.557477-4-david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 util/qemu-coroutine-lock.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx,
     unsigned old_handoff;
 
     trace_qemu_co_mutex_lock_entry(mutex, self);
-    w.co = self;
     push_waiter(mutex, &w);
 
     /* This is the "Responsibility Hand-Off" protocol; a lock() picks from
--
2.30.2

From: Paolo Bonzini <pbonzini@redhat.com>

An invariant of the current rwlock is that if multiple coroutines hold a
reader lock, all must be runnable. The unlock implementation relies on
this, choosing to wake a single coroutine when the final read lock
holder exits the critical section, assuming that it will wake a
coroutine attempting to acquire a write lock.

The downgrade implementation violates this assumption by creating a
read lock owning coroutine that is exclusively runnable - any other
coroutines that are waiting to acquire a read lock are *not* made
runnable when the write lock holder converts its ownership to read
only.

More generally, the old implementation had many other fairness bugs.
Their root cause was that CoQueue would wake up readers even if there
were pending writers, and would wake up writers even if there were
readers. In that case, the woken coroutine would go back to sleep *at
the end* of the CoQueue, losing its place at the head of the line.

To fix this, keep the queue of waiters explicitly in the CoRwlock
instead of using CoQueue, and store for each waiter whether it is a
potential reader or a writer. This way, downgrade can look at the
first queued coroutine and wake it only if it is a reader, causing
all other readers in line to be released in turn.
Reported-by: David Edmondson <david.edmondson@oracle.com>
28
Reviewed-by: David Edmondson <david.edmondson@oracle.com>
29
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
30
Message-id: 20210325112941.365238-5-pbonzini@redhat.com
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
31
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
---
32
---
13
include/block/block_int.h | 11 +++++++++--
33
include/qemu/coroutine.h | 17 ++--
14
block/io.c | 48 +++++++++++++++++++++++++++++++++--------------
34
util/qemu-coroutine-lock.c | 164 +++++++++++++++++++++++--------------
15
2 files changed, 43 insertions(+), 16 deletions(-)
35
2 files changed, 114 insertions(+), 67 deletions(-)
16
36
17
diff --git a/include/block/block_int.h b/include/block/block_int.h
37
diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
18
index XXXXXXX..XXXXXXX 100644
38
index XXXXXXX..XXXXXXX 100644
19
--- a/include/block/block_int.h
39
--- a/include/qemu/coroutine.h
20
+++ b/include/block/block_int.h
40
+++ b/include/qemu/coroutine.h
21
@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
41
@@ -XXX,XX +XXX,XX @@ bool qemu_co_enter_next_impl(CoQueue *queue, QemuLockable *lock);
22
int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo);
42
bool qemu_co_queue_empty(CoQueue *queue);
23
43
24
/**
44
25
- * Drain and stop any internal sources of requests in the driver, and
45
+typedef struct CoRwTicket CoRwTicket;
26
- * remain so until next I/O callback (e.g. bdrv_co_writev) is called.
46
typedef struct CoRwlock {
27
+ * bdrv_co_drain is called if implemented in the beginning of a
47
- int pending_writer;
28
+ * drain operation to drain and stop any internal sources of requests in
48
- int reader;
29
+ * the driver.
49
CoMutex mutex;
30
+ * bdrv_co_drain_end is called if implemented at the end of the drain.
50
- CoQueue queue;
31
+ *
51
+
32
+ * They should be used by the driver to e.g. manage scheduled I/O
52
+ /* Number of readers, or -1 if owned for writing. */
33
+ * requests, or toggle an internal state. After the end of the drain new
53
+ int owners;
34
+ * requests will continue normally.
54
+
35
*/
55
+ /* Waiting coroutines. */
36
void coroutine_fn (*bdrv_co_drain)(BlockDriverState *bs);
56
+ QSIMPLEQ_HEAD(, CoRwTicket) tickets;
37
+ void coroutine_fn (*bdrv_co_drain_end)(BlockDriverState *bs);
57
} CoRwlock;
38
58
39
void (*bdrv_add_child)(BlockDriverState *parent, BlockDriverState *child,
59
/**
40
Error **errp);
60
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_rdlock(CoRwlock *lock);
41
diff --git a/block/io.c b/block/io.c
61
/**
62
* Write Locks the CoRwlock from a reader. This is a bit more efficient than
63
* @qemu_co_rwlock_unlock followed by a separate @qemu_co_rwlock_wrlock.
64
- * However, if the lock cannot be upgraded immediately, control is transferred
65
- * to the caller of the current coroutine. Also, @qemu_co_rwlock_upgrade
66
- * only overrides CoRwlock fairness if there are no concurrent readers, so
67
- * another writer might run while @qemu_co_rwlock_upgrade blocks.
68
+ * Note that if the lock cannot be upgraded immediately, control is transferred
69
+ * to the caller of the current coroutine; another writer might run while
70
+ * @qemu_co_rwlock_upgrade blocks.
71
*/
72
void qemu_co_rwlock_upgrade(CoRwlock *lock);
73
74
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
42
index XXXXXXX..XXXXXXX 100644
75
index XXXXXXX..XXXXXXX 100644
43
--- a/block/io.c
76
--- a/util/qemu-coroutine-lock.c
44
+++ b/block/io.c
77
+++ b/util/qemu-coroutine-lock.c
45
@@ -XXX,XX +XXX,XX @@ typedef struct {
78
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
46
Coroutine *co;
79
trace_qemu_co_mutex_unlock_return(mutex, self);
47
BlockDriverState *bs;
80
}
48
bool done;
81
49
+ bool begin;
82
+struct CoRwTicket {
50
} BdrvCoDrainData;
83
+ bool read;
51
84
+ Coroutine *co;
52
static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
85
+ QSIMPLEQ_ENTRY(CoRwTicket) next;
53
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
86
+};
54
BdrvCoDrainData *data = opaque;
87
+
55
BlockDriverState *bs = data->bs;
88
void qemu_co_rwlock_init(CoRwlock *lock)
56
89
{
57
- bs->drv->bdrv_co_drain(bs);
90
- memset(lock, 0, sizeof(*lock));
58
+ if (data->begin) {
91
- qemu_co_queue_init(&lock->queue);
59
+ bs->drv->bdrv_co_drain(bs);
92
qemu_co_mutex_init(&lock->mutex);
60
+ } else {
93
+ lock->owners = 0;
61
+ bs->drv->bdrv_co_drain_end(bs);
94
+ QSIMPLEQ_INIT(&lock->tickets);
95
+}
96
+
97
+/* Releases the internal CoMutex. */
98
+static void qemu_co_rwlock_maybe_wake_one(CoRwlock *lock)
99
+{
100
+ CoRwTicket *tkt = QSIMPLEQ_FIRST(&lock->tickets);
101
+ Coroutine *co = NULL;
102
+
103
+ /*
104
+ * Setting lock->owners here prevents rdlock and wrlock from
105
+ * sneaking in between unlock and wake.
106
+ */
107
+
108
+ if (tkt) {
109
+ if (tkt->read) {
110
+ if (lock->owners >= 0) {
111
+ lock->owners++;
112
+ co = tkt->co;
113
+ }
114
+ } else {
115
+ if (lock->owners == 0) {
116
+ lock->owners = -1;
117
+ co = tkt->co;
118
+ }
119
+ }
62
+ }
120
+ }
63
121
+
64
/* Set data->done before reading bs->wakeup. */
122
+ if (co) {
65
atomic_mb_set(&data->done, true);
123
+ QSIMPLEQ_REMOVE_HEAD(&lock->tickets, next);
66
bdrv_wakeup(bs);
124
+ qemu_co_mutex_unlock(&lock->mutex);
67
}
125
+ aio_co_wake(co);
68
126
+ } else {
69
-static void bdrv_drain_invoke(BlockDriverState *bs)
127
+ qemu_co_mutex_unlock(&lock->mutex);
70
+static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
128
+ }
71
{
129
}
72
- BdrvCoDrainData data = { .bs = bs, .done = false };
130
73
+ BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
131
void qemu_co_rwlock_rdlock(CoRwlock *lock)
74
132
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_rdlock(CoRwlock *lock)
75
- if (!bs->drv || !bs->drv->bdrv_co_drain) {
133
76
+ if (!bs->drv || (begin && !bs->drv->bdrv_co_drain) ||
134
qemu_co_mutex_lock(&lock->mutex);
77
+ (!begin && !bs->drv->bdrv_co_drain_end)) {
135
/* For fairness, wait if a writer is in line. */
78
return;
136
- while (lock->pending_writer) {
137
- qemu_co_queue_wait(&lock->queue, &lock->mutex);
138
- }
139
- lock->reader++;
140
- qemu_co_mutex_unlock(&lock->mutex);
141
-
142
- /* The rest of the read-side critical section is run without the mutex. */
143
- self->locks_held++;
144
-}
145
-
146
-void qemu_co_rwlock_unlock(CoRwlock *lock)
147
-{
148
- Coroutine *self = qemu_coroutine_self();
149
-
150
- assert(qemu_in_coroutine());
151
- if (!lock->reader) {
152
- /* The critical section started in qemu_co_rwlock_wrlock. */
153
- qemu_co_queue_restart_all(&lock->queue);
154
+ if (lock->owners == 0 || (lock->owners > 0 && QSIMPLEQ_EMPTY(&lock->tickets))) {
155
+ lock->owners++;
156
+ qemu_co_mutex_unlock(&lock->mutex);
157
} else {
158
- self->locks_held--;
159
+ CoRwTicket my_ticket = { true, self };
160
161
+ QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next);
162
+ qemu_co_mutex_unlock(&lock->mutex);
163
+ qemu_coroutine_yield();
164
+ assert(lock->owners >= 1);
165
+
166
+ /* Possibly wake another reader, which will wake the next in line. */
167
qemu_co_mutex_lock(&lock->mutex);
168
- lock->reader--;
169
- assert(lock->reader >= 0);
170
- /* Wakeup only one waiting writer */
171
- if (!lock->reader) {
172
- qemu_co_queue_next(&lock->queue);
173
- }
174
+ qemu_co_rwlock_maybe_wake_one(lock);
79
}
175
}
80
176
- qemu_co_mutex_unlock(&lock->mutex);
81
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs)
177
+
82
BDRV_POLL_WHILE(bs, !data.done);
178
+ self->locks_held++;
83
}
179
+}
84
180
+
85
-static bool bdrv_drain_recurse(BlockDriverState *bs)
181
+void qemu_co_rwlock_unlock(CoRwlock *lock)
86
+static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
182
+{
87
{
183
+ Coroutine *self = qemu_coroutine_self();
88
BdrvChild *child, *tmp;
184
+
89
bool waited;
185
+ assert(qemu_in_coroutine());
90
186
+ self->locks_held--;
91
- waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
187
+
92
-
188
+ qemu_co_mutex_lock(&lock->mutex);
93
/* Ensure any pending metadata writes are submitted to bs->file. */
189
+ if (lock->owners > 0) {
94
- bdrv_drain_invoke(bs);
190
+ lock->owners--;
95
+ bdrv_drain_invoke(bs, begin);
191
+ } else {
96
+
192
+ assert(lock->owners == -1);
97
+ /* Wait for drained requests to finish */
193
+ lock->owners = 0;
98
+ waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
99
100
QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
101
BlockDriverState *bs = child->bs;
102
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
103
*/
104
bdrv_ref(bs);
105
}
106
- waited |= bdrv_drain_recurse(bs);
107
+ waited |= bdrv_drain_recurse(bs, begin);
108
if (in_main_loop) {
109
bdrv_unref(bs);
110
}
111
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
112
BlockDriverState *bs = data->bs;
113
114
bdrv_dec_in_flight(bs);
115
- bdrv_drained_begin(bs);
116
+ if (data->begin) {
117
+ bdrv_drained_begin(bs);
118
+ } else {
119
+ bdrv_drained_end(bs);
120
+ }
194
+ }
121
+
195
+
122
data->done = true;
196
+ qemu_co_rwlock_maybe_wake_one(lock);
123
aio_co_wake(co);
197
}
124
}
198
125
199
void qemu_co_rwlock_downgrade(CoRwlock *lock)
126
-static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
200
{
127
+static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
201
- Coroutine *self = qemu_coroutine_self();
128
+ bool begin)
202
+ qemu_co_mutex_lock(&lock->mutex);
129
{
203
+ assert(lock->owners == -1);
130
BdrvCoDrainData data;
204
+ lock->owners = 1;
131
205
132
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
206
- /* lock->mutex critical section started in qemu_co_rwlock_wrlock or
133
.co = qemu_coroutine_self(),
207
- * qemu_co_rwlock_upgrade.
134
.bs = bs,
208
- */
135
.done = false,
209
- assert(lock->reader == 0);
136
+ .begin = begin,
210
- lock->reader++;
137
};
211
- qemu_co_mutex_unlock(&lock->mutex);
138
bdrv_inc_in_flight(bs);
212
-
139
aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
213
- /* The rest of the read-side critical section is run without the mutex. */
140
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
214
- self->locks_held++;
141
void bdrv_drained_begin(BlockDriverState *bs)
215
+ /* Possibly wake another reader, which will wake the next in line. */
142
{
216
+ qemu_co_rwlock_maybe_wake_one(lock);
143
if (qemu_in_coroutine()) {
217
}
144
- bdrv_co_yield_to_drain(bs);
218
145
+ bdrv_co_yield_to_drain(bs, true);
219
void qemu_co_rwlock_wrlock(CoRwlock *lock)
146
return;
220
{
221
+ Coroutine *self = qemu_coroutine_self();
222
+
223
qemu_co_mutex_lock(&lock->mutex);
224
- lock->pending_writer++;
225
- while (lock->reader) {
226
- qemu_co_queue_wait(&lock->queue, &lock->mutex);
227
+ if (lock->owners == 0) {
228
+ lock->owners = -1;
229
+ qemu_co_mutex_unlock(&lock->mutex);
230
+ } else {
231
+ CoRwTicket my_ticket = { false, qemu_coroutine_self() };
232
+
233
+ QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next);
234
+ qemu_co_mutex_unlock(&lock->mutex);
235
+ qemu_coroutine_yield();
236
+ assert(lock->owners == -1);
147
}
237
}
148
238
- lock->pending_writer--;
149
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
239
150
bdrv_parent_drained_begin(bs);
240
- /* The rest of the write-side critical section is run with
241
- * the mutex taken, so that lock->reader remains zero.
242
- * There is no need to update self->locks_held.
243
- */
244
+ self->locks_held++;
245
}
246
247
void qemu_co_rwlock_upgrade(CoRwlock *lock)
248
{
249
- Coroutine *self = qemu_coroutine_self();
250
-
251
qemu_co_mutex_lock(&lock->mutex);
252
- assert(lock->reader > 0);
253
- lock->reader--;
254
- lock->pending_writer++;
255
- while (lock->reader) {
256
- qemu_co_queue_wait(&lock->queue, &lock->mutex);
257
+ assert(lock->owners > 0);
258
+ /* For fairness, wait if a writer is in line. */
259
+ if (lock->owners == 1 && QSIMPLEQ_EMPTY(&lock->tickets)) {
260
+ lock->owners = -1;
261
+ qemu_co_mutex_unlock(&lock->mutex);
262
+ } else {
263
+ CoRwTicket my_ticket = { false, qemu_coroutine_self() };
264
+
265
+ lock->owners--;
266
+ QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next);
267
+ qemu_co_rwlock_maybe_wake_one(lock);
268
+ qemu_coroutine_yield();
269
+ assert(lock->owners == -1);
151
}
270
}
152
271
- lock->pending_writer--;
153
- bdrv_drain_recurse(bs);
272
-
154
+ bdrv_drain_recurse(bs, true);
273
- /* The rest of the write-side critical section is run with
155
}
274
- * the mutex taken, similar to qemu_co_rwlock_wrlock. Do
156
275
- * not account for the lock twice in self->locks_held.
157
void bdrv_drained_end(BlockDriverState *bs)
276
- */
158
{
277
- self->locks_held--;
159
+ if (qemu_in_coroutine()) {
278
}
160
+ bdrv_co_yield_to_drain(bs, false);
161
+ return;
162
+ }
163
assert(bs->quiesce_counter > 0);
164
if (atomic_fetch_dec(&bs->quiesce_counter) > 1) {
165
return;
166
}
167
168
bdrv_parent_drained_end(bs);
169
+ bdrv_drain_recurse(bs, false);
170
aio_enable_external(bdrv_get_aio_context(bs));
171
}
172
173
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
174
aio_context_acquire(aio_context);
175
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
176
if (aio_context == bdrv_get_aio_context(bs)) {
177
- waited |= bdrv_drain_recurse(bs);
178
+ waited |= bdrv_drain_recurse(bs, true);
179
}
180
}
181
aio_context_release(aio_context);
182
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
183
aio_context_acquire(aio_context);
184
aio_enable_external(aio_context);
185
bdrv_parent_drained_end(bs);
186
+ bdrv_drain_recurse(bs, false);
187
aio_context_release(aio_context);
188
}
189
190
--
279
--
191
2.13.6
280
2.30.2
192
281
193

From: Paolo Bonzini <pbonzini@redhat.com>

Test that rwlock upgrade is fair, and that readers go back to sleep if
a writer is in line.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 20210325112941.365238-6-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/unit/test-coroutine.c | 62 +++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/tests/unit/test-coroutine.c b/tests/unit/test-coroutine.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/unit/test-coroutine.c
+++ b/tests/unit/test-coroutine.c
@@ -XXX,XX +XXX,XX @@ static void test_co_mutex_lockable(void)
     g_assert(QEMU_MAKE_LOCKABLE(null_pointer) == NULL);
 }
 
+static CoRwlock rwlock;
+
+/* Test that readers are properly sent back to the queue when upgrading,
+ * even if they are the sole readers.  The test scenario is as follows:
+ *
+ *
+ * | c1           | c2         |
+ * |--------------+------------+
+ * | rdlock       |            |
+ * | yield        |            |
+ * |              | wrlock     |
+ * |              | <queued>   |
+ * | upgrade      |            |
+ * | <queued>     | <dequeued> |
+ * |              | unlock     |
+ * | <dequeued>   |            |
+ * | unlock       |            |
+ */
+
+static void coroutine_fn rwlock_yield_upgrade(void *opaque)
+{
+    qemu_co_rwlock_rdlock(&rwlock);
+    qemu_coroutine_yield();
+
+    qemu_co_rwlock_upgrade(&rwlock);
+    qemu_co_rwlock_unlock(&rwlock);
+
+    *(bool *)opaque = true;
+}
+
+static void coroutine_fn rwlock_wrlock_yield(void *opaque)
+{
+    qemu_co_rwlock_wrlock(&rwlock);
+    qemu_coroutine_yield();
+
+    qemu_co_rwlock_unlock(&rwlock);
+    *(bool *)opaque = true;
+}
+
+static void test_co_rwlock_upgrade(void)
+{
+    bool c1_done = false;
+    bool c2_done = false;
+    Coroutine *c1, *c2;
+
+    qemu_co_rwlock_init(&rwlock);
+    c1 = qemu_coroutine_create(rwlock_yield_upgrade, &c1_done);
+    c2 = qemu_coroutine_create(rwlock_wrlock_yield, &c2_done);
+
+    qemu_coroutine_enter(c1);
+    qemu_coroutine_enter(c2);
+
+    /* c1 now should go to sleep.  */
+    qemu_coroutine_enter(c1);
+    g_assert(!c1_done);
+
+    qemu_coroutine_enter(c2);
+    g_assert(c1_done);
+    g_assert(c2_done);
+}
+
 /*
  * Check that creation, enter, and return work
  */
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/basic/order", test_order);
     g_test_add_func("/locking/co-mutex", test_co_mutex);
     g_test_add_func("/locking/co-mutex/lockable", test_co_mutex_lockable);
+    g_test_add_func("/locking/co-rwlock/upgrade", test_co_rwlock_upgrade);
     if (g_test_perf()) {
         g_test_add_func("/perf/lifecycle", perf_lifecycle);
         g_test_add_func("/perf/nesting", perf_nesting);
--
2.30.2

From: David Edmondson <david.edmondson@oracle.com>

Test that downgrading an rwlock does not result in a failure to
schedule coroutines queued on the rwlock.

The diagram associated with test_co_rwlock_downgrade() describes the
intended behaviour, but what was observed previously corresponds to:

| c1     | c2         | c3         | c4       |
|--------+------------+------------+----------|
| rdlock |            |            |          |
| yield  |            |            |          |
|        | wrlock     |            |          |
|        | <queued>   |            |          |
|        |            | rdlock     |          |
|        |            | <queued>   |          |
|        |            |            | wrlock   |
|        |            |            | <queued> |
| unlock |            |            |          |
| yield  |            |            |          |
|        | <dequeued> |            |          |
|        | downgrade  |            |          |
|        | ...        |            |          |
|        | unlock     |            |          |
|        |            | <dequeued> |          |
|        |            | <queued>   |          |

This results in a failure...

ERROR:../tests/test-coroutine.c:369:test_co_rwlock_downgrade: assertion failed: (c3_done)
Bail out! ERROR:../tests/test-coroutine.c:369:test_co_rwlock_downgrade: assertion failed: (c3_done)

...as a result of the c3 coroutine failing to run to completion.

Signed-off-by: David Edmondson <david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 20210325112941.365238-7-pbonzini@redhat.com
Message-Id: <20210309144015.557477-5-david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/unit/test-coroutine.c | 99 +++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)

diff --git a/tests/unit/test-coroutine.c b/tests/unit/test-coroutine.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/unit/test-coroutine.c
+++ b/tests/unit/test-coroutine.c
@@ -XXX,XX +XXX,XX @@ static void test_co_rwlock_upgrade(void)
     g_assert(c2_done);
 }
 
+static void coroutine_fn rwlock_rdlock_yield(void *opaque)
+{
+    qemu_co_rwlock_rdlock(&rwlock);
+    qemu_coroutine_yield();
+
+    qemu_co_rwlock_unlock(&rwlock);
+    qemu_coroutine_yield();
+
+    *(bool *)opaque = true;
+}
+
+static void coroutine_fn rwlock_wrlock_downgrade(void *opaque)
+{
+    qemu_co_rwlock_wrlock(&rwlock);
+
+    qemu_co_rwlock_downgrade(&rwlock);
+    qemu_co_rwlock_unlock(&rwlock);
+    *(bool *)opaque = true;
+}
+
+static void coroutine_fn rwlock_rdlock(void *opaque)
+{
+    qemu_co_rwlock_rdlock(&rwlock);
+
+    qemu_co_rwlock_unlock(&rwlock);
+    *(bool *)opaque = true;
+}
+
+static void coroutine_fn rwlock_wrlock(void *opaque)
+{
+    qemu_co_rwlock_wrlock(&rwlock);
+
+    qemu_co_rwlock_unlock(&rwlock);
+    *(bool *)opaque = true;
+}
+
+/*
+ * Check that downgrading a reader-writer lock does not cause a hang.
+ *
+ * Four coroutines are used to produce a situation where there are
+ * both reader and writer hopefuls waiting to acquire an rwlock that
+ * is held by a reader.
+ *
+ * The correct sequence of operations we aim to provoke can be
+ * represented as:
+ *
+ * | c1     | c2         | c3         | c4         |
+ * |--------+------------+------------+------------|
+ * | rdlock |            |            |            |
+ * | yield  |            |            |            |
+ * |        | wrlock     |            |            |
+ * |        | <queued>   |            |            |
+ * |        |            | rdlock     |            |
+ * |        |            | <queued>   |            |
+ * |        |            |            | wrlock     |
+ * |        |            |            | <queued>   |
+ * | unlock |            |            |            |
+ * | yield  |            |            |            |
+ * |        | <dequeued> |            |            |
+ * |        | downgrade  |            |            |
+ * |        |            | <dequeued> |            |
+ * |        |            | unlock     |            |
+ * |        | ...        |            |            |
+ * |        | unlock     |            |            |
+ * |        |            |            | <dequeued> |
+ * |        |            |            | unlock     |
+ */
+static void test_co_rwlock_downgrade(void)
+{
+    bool c1_done = false;
+    bool c2_done = false;
+    bool c3_done = false;
+    bool c4_done = false;
+    Coroutine *c1, *c2, *c3, *c4;
+
+    qemu_co_rwlock_init(&rwlock);
+
+    c1 = qemu_coroutine_create(rwlock_rdlock_yield, &c1_done);
+    c2 = qemu_coroutine_create(rwlock_wrlock_downgrade, &c2_done);
+    c3 = qemu_coroutine_create(rwlock_rdlock, &c3_done);
+    c4 = qemu_coroutine_create(rwlock_wrlock, &c4_done);
+
+    qemu_coroutine_enter(c1);
+    qemu_coroutine_enter(c2);
+    qemu_coroutine_enter(c3);
+    qemu_coroutine_enter(c4);
+
+    qemu_coroutine_enter(c1);
+
+    g_assert(c2_done);
+    g_assert(c3_done);
+    g_assert(c4_done);
+
+    qemu_coroutine_enter(c1);
+
+    g_assert(c1_done);
+}
+
 /*
  * Check that creation, enter, and return work
  */
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/locking/co-mutex", test_co_mutex);
     g_test_add_func("/locking/co-mutex/lockable", test_co_mutex_lockable);
     g_test_add_func("/locking/co-rwlock/upgrade", test_co_rwlock_upgrade);
+    g_test_add_func("/locking/co-rwlock/downgrade", test_co_rwlock_downgrade);
     if (g_test_perf()) {
         g_test_add_func("/perf/lifecycle", perf_lifecycle);
         g_test_add_func("/perf/nesting", perf_nesting);
--
2.30.2