The following changes since commit 281f327487c9c9b1599f93c589a408bbf4a651b8:

  Merge remote-tracking branch 'remotes/vivier/tags/m68k-for-2.12-pull-request' into staging (2017-12-22 00:11:36 +0000)

are available in the git repository at:

  git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 1a63a907507fbbcfaee3f622907ec244b7eabda8:

  block: Keep nodes drained between reopen_queue/multiple (2017-12-22 15:05:32 +0100)

----------------------------------------------------------------
Block layer patches

----------------------------------------------------------------
Doug Gale (1):
      nvme: Add tracing

Edgar Kaziakhmedov (1):
      qcow2: get rid of qcow2_backing_read1 routine

Fam Zheng (2):
      block: Open backing image in force share mode for size probe
      block: Remove unused bdrv_requests_pending

John Snow (1):
      iotests: fix 197 for vpc

Kevin Wolf (27):
      block: Formats don't need CONSISTENT_READ with NO_IO
      block: Make bdrv_drain_invoke() recursive
      block: Call .drain_begin only once in bdrv_drain_all_begin()
      test-bdrv-drain: Test BlockDriver callbacks for drain
      block: bdrv_drain_recurse(): Remove unused begin parameter
      block: Don't wait for requests in bdrv_drain*_end()
      block: Unify order in drain functions
      block: Don't acquire AioContext in hmp_qemu_io()
      block: Document that x-blockdev-change breaks quorum children list
      block: Assert drain_all is only called from main AioContext
      block: Make bdrv_drain() driver callbacks non-recursive
      test-bdrv-drain: Test callback for bdrv_drain
      test-bdrv-drain: Test bs->quiesce_counter
      blockjob: Pause job on draining any job BDS
      test-bdrv-drain: Test drain vs. block jobs
      block: Don't block_job_pause_all() in bdrv_drain_all()
      block: Nested drain_end must still call callbacks
      test-bdrv-drain: Test nested drain sections
      block: Don't notify parents in drain call chain
      block: Add bdrv_subtree_drained_begin/end()
      test-bdrv-drain: Tests for bdrv_subtree_drain
      test-bdrv-drain: Test behaviour in coroutine context
      test-bdrv-drain: Recursive draining with multiple parents
      block: Allow graph changes in subtree drained section
      test-bdrv-drain: Test graph changes in drained section
      commit: Simplify reopen of base
      block: Keep nodes drained between reopen_queue/multiple

Thomas Huth (3):
      block: Remove the obsolete -drive boot=on|off parameter
      block: Remove the deprecated -hdachs option
      block: Mention -drive cyls/heads/secs/trans/serial/addr in deprecation chapter

 qapi/block-core.json             |   4 +
 block/qcow2.h                    |   3 -
 include/block/block.h            |  15 +-
 include/block/block_int.h        |   6 +-
 block.c                          |  75 ++++-
 block/commit.c                   |   8 +-
 block/io.c                       | 164 +++++++---
 block/qcow2.c                    |  51 +--
 block/replication.c              |   6 +
 blockdev.c                       |  11 -
 blockjob.c                       |  22 +-
 hmp.c                            |   6 -
 hw/block/nvme.c                  | 349 +++++++++++++++++----
 qemu-io-cmds.c                   |   3 +
 tests/test-bdrv-drain.c          | 651 +++++++++++++++++++++++++++++++++++++++
 vl.c                             |  86 +-----
 hw/block/trace-events            |  93 ++++++
 qemu-doc.texi                    |  29 +-
 qemu-options.hx                  |  19 +-
 tests/Makefile.include           |   2 +
 tests/qemu-iotests/197           |   4 +
 tests/qemu-iotests/common.filter |   3 +-
 22 files changed, 1294 insertions(+), 316 deletions(-)

 create mode 100644 tests/test-bdrv-drain.c

Commit 1f4ad7d fixed 'qemu-img info' for raw images that are currently
in use as a mirror target. It is not enough for image formats, though,
as these still unconditionally request BLK_PERM_CONSISTENT_READ.

As this permission is geared towards whether the guest-visible data is
consistent, and has no impact on whether the metadata is sane, and
'qemu-img info' does not read guest-visible data (except for the raw
format), it makes sense to not require BLK_PERM_CONSISTENT_READ if there
is not going to be any guest I/O performed, regardless of image format.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
     assert(role == &child_backing || role == &child_file);

     if (!backing) {
+        int flags = bdrv_reopen_get_flags(reopen_queue, bs);
+
         /* Apart from the modifications below, the same permissions are
          * forwarded and left alone as for filters */
         bdrv_filter_default_perms(bs, c, role, reopen_queue, perm, shared,
@@ -XXX,XX +XXX,XX @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,

         /* bs->file always needs to be consistent because of the metadata. We
          * can never allow other users to resize or write to it. */
-        perm |= BLK_PERM_CONSISTENT_READ;
+        if (!(flags & BDRV_O_NO_IO)) {
+            perm |= BLK_PERM_CONSISTENT_READ;
+        }
         shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
     } else {
         /* We want consistent read from backing files if the parent needs it.
--
2.13.6

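The permission logic in this hunk is easy to model outside QEMU. Below is a
minimal standalone sketch of the idea — request consistent guest data only
when guest I/O is possible. All identifiers are local stand-ins, not QEMU's
real block-layer API, and the flag values are chosen arbitrarily:

  #include <stdio.h>

  /* Local stand-ins for QEMU's permission and open-flag bits */
  enum { BLK_PERM_CONSISTENT_READ = 1 << 0 };
  enum { BDRV_O_NO_IO = 1 << 1 };     /* "metadata only, no guest I/O" */

  /* Mirror of the hunk above: bs->file stays consistent for metadata,
   * but consistent *guest* data is only demanded for guest I/O. */
  static unsigned file_child_perms(unsigned perm, int open_flags)
  {
      if (!(open_flags & BDRV_O_NO_IO)) {
          perm |= BLK_PERM_CONSISTENT_READ;
      }
      return perm;
  }

  int main(void)
  {
      /* 'qemu-img info'-style open: no CONSISTENT_READ requested */
      printf("no-io open: %#x\n", file_child_perms(0, BDRV_O_NO_IO));
      /* guest-facing open: CONSISTENT_READ requested as before */
      printf("guest open: %#x\n", file_child_perms(0, 0));
      return 0;
  }
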
From: John Snow <jsnow@redhat.com>

VPC has some difficulty creating geometries of particular size.
However, we can indeed force it to use a literal one, so let's
do that for the sake of test 197, which is testing some specific
offsets.

Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Lukáš Doktor <ldoktor@redhat.com>
---
 tests/qemu-iotests/197           | 4 ++++
 tests/qemu-iotests/common.filter | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/197
+++ b/tests/qemu-iotests/197
@@ -XXX,XX +XXX,XX @@ echo '=== Copy-on-read ==='
 echo

 # Prep the images
+# VPC rounds image sizes to a specific geometry, force a specific size.
+if [ "$IMGFMT" = "vpc" ]; then
+    IMGOPTS=$(_optstr_add "$IMGOPTS" "force_size")
+fi
 _make_test_img 4G
 $QEMU_IO -c "write -P 55 3G 1k" "$TEST_IMG" | _filter_qemu_io
 IMGPROTO=file IMGFMT=qcow2 IMGOPTS= TEST_IMG_FILE="$TEST_WRAP" \
diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/common.filter
+++ b/tests/qemu-iotests/common.filter
@@ -XXX,XX +XXX,XX @@ _filter_img_create()
        -e "s# log_size=[0-9]\+##g" \
        -e "s# refcount_bits=[0-9]\+##g" \
        -e "s# key-secret=[a-zA-Z0-9]\+##g" \
-        -e "s# iter-time=[0-9]\+##g"
+        -e "s# iter-time=[0-9]\+##g" \
+        -e "s# force_size=\(on\|off\)##g"

 _filter_img_info()
--
2.13.6

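Why the test needs this: vpc normally rounds a requested size to a whole
CHS geometry, so the test's byte-exact offsets would shift. A toy model of
the effect follows; the cylinder size below is made up for illustration and
is not the actual VHD geometry algorithm:

  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
      uint64_t requested = 4ULL << 30;      /* the 4G image test 197 makes */
      uint64_t cylinder  = 16 * 255 * 512;  /* made-up cylinder size */
      uint64_t rounded   = (requested + cylinder - 1) / cylinder * cylinder;

      /* Without force_size, the effective size is geometry-rounded and
       * hard-coded offsets in the test no longer line up. */
      printf("requested %llu, geometry-rounded %llu\n",
             (unsigned long long)requested, (unsigned long long)rounded);
      return 0;
  }
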
This change separates bdrv_drain_invoke(), which calls the BlockDriver
drain callbacks, from bdrv_drain_recurse(). Instead, the function
performs its own recursion now.

One reason for this is that bdrv_drain_recurse() can be called multiple
times by bdrv_drain_all_begin(), but the callbacks may only be called
once. The separation is necessary to fix this bug.

The other reason is that we intend to go to a model where we call all
driver callbacks first, and only then start polling. This is not fully
achieved yet with this patch, as bdrv_drain_invoke() contains a
BDRV_POLL_WHILE() loop for the block driver callbacks, which can still
call callbacks for any unrelated event. It's a step in this direction
anyway.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
     bdrv_wakeup(bs);
 }

+/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
 static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
 {
+    BdrvChild *child, *tmp;
     BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};

     if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
     data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
     bdrv_coroutine_enter(bs, data.co);
     BDRV_POLL_WHILE(bs, !data.done);
+
+    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
+        bdrv_drain_invoke(child->bs, begin);
+    }
 }

 static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
     BdrvChild *child, *tmp;
     bool waited;

-    /* Ensure any pending metadata writes are submitted to bs->file. */
-    bdrv_drain_invoke(bs, begin);
-
     /* Wait for drained requests to finish */
     waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);

@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
         bdrv_parent_drained_begin(bs);
     }

+    bdrv_drain_invoke(bs, true);
     bdrv_drain_recurse(bs, true);
 }

@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
     }

     bdrv_parent_drained_end(bs);
+    bdrv_drain_invoke(bs, false);
     bdrv_drain_recurse(bs, false);
     aio_enable_external(bdrv_get_aio_context(bs));
 }
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         aio_context_acquire(aio_context);
         for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
             if (aio_context == bdrv_get_aio_context(bs)) {
+                /* FIXME Calling this multiple times is wrong */
+                bdrv_drain_invoke(bs, true);
                 waited |= bdrv_drain_recurse(bs, true);
             }
         }
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         aio_context_acquire(aio_context);
         aio_enable_external(aio_context);
         bdrv_parent_drained_end(bs);
+        bdrv_drain_invoke(bs, false);
         bdrv_drain_recurse(bs, false);
         aio_context_release(aio_context);
     }
--
2.13.6

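The shape of the fix can be checked in isolation: once the callback
recursion lives in drain_invoke() itself, repeated polling passes can no
longer re-run the driver callbacks. A standalone model with simplified
stand-ins for BlockDriverState and BdrvChild (all names are local to the
sketch):

  #include <assert.h>
  #include <stdio.h>

  typedef struct Node {
      struct Node *child;   /* a single child is enough for the model */
      int drain_count;      /* times the "driver callback" ran */
  } Node;

  /* Model of bdrv_drain_invoke(): recurses over children itself */
  static void drain_invoke(Node *n, int begin)
  {
      n->drain_count += begin ? 1 : -1;
      if (n->child) {
          drain_invoke(n->child, begin);
      }
  }

  /* Model of bdrv_drain_recurse(): may run once per polling pass,
   * but no longer touches the driver callbacks */
  static void drain_recurse(Node *n)
  {
      (void)n;   /* the real function polls in-flight requests here */
  }

  int main(void)
  {
      Node child = { NULL, 0 }, root = { &child, 0 };

      drain_invoke(&root, 1);   /* begin: exactly once per node */
      drain_recurse(&root);     /* polling pass #1 */
      drain_recurse(&root);     /* polling pass #2: no callback re-run */
      drain_invoke(&root, 0);   /* end: exactly once per node */

      assert(root.drain_count == 0 && child.drain_count == 0);
      printf("begin/end callbacks balanced\n");
      return 0;
  }
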
bdrv_drain_all_begin() used to call the .bdrv_co_drain_begin() driver
callback inside its polling loop. This means that how many times it got
called for each node depended on how long it had to poll the event loop.

This is obviously not right and results in nodes that stay drained even
after bdrv_drain_all_end(), which calls .bdrv_co_drain_end() once per
node.

Fix bdrv_drain_all_begin() to call the callback only once, too.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         aio_context_acquire(aio_context);
         bdrv_parent_drained_begin(bs);
         aio_disable_external(aio_context);
+        bdrv_drain_invoke(bs, true);
         aio_context_release(aio_context);

         if (!g_slist_find(aio_ctxs, aio_context)) {
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         aio_context_acquire(aio_context);
         for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
             if (aio_context == bdrv_get_aio_context(bs)) {
-                /* FIXME Calling this multiple times is wrong */
-                bdrv_drain_invoke(bs, true);
                 waited |= bdrv_drain_recurse(bs, true);
             }
         }
--
2.13.6

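The bug is easiest to see with a counter: if the begin callback fires once
per polling iteration but the end callback fires once, the pair no longer
balances and the node looks permanently drained. A minimal model (local
names only, not QEMU code):

  #include <stdio.h>

  static int quiesce;   /* stands in for the driver's drain state */

  static void drain_begin(void) { quiesce++; }
  static void drain_end(void)   { quiesce--; }

  int main(void)
  {
      /* Buggy pattern: begin invoked from inside the poll loop */
      for (int polls = 0; polls < 3; polls++) {
          drain_begin();
      }
      drain_end();
      printf("buggy: quiesce=%d (node stays drained)\n", quiesce);

      /* Fixed pattern: one begin before polling, one end after */
      quiesce = 0;
      drain_begin();
      /* ...poll as often as needed without re-invoking the callback... */
      drain_end();
      printf("fixed: quiesce=%d\n", quiesce);
      return 0;
  }
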
This adds a test case that the BlockDriver callbacks for drain are
called in bdrv_drain_all_begin/end(), and that both of them are called
exactly once.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
---
 tests/test-bdrv-drain.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++
 tests/Makefile.include  |   2 +
 2 files changed, 139 insertions(+)
 create mode 100644 tests/test-bdrv-drain.c

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Block node draining tests
+ *
+ * Copyright (c) 2017 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "block/block.h"
+#include "sysemu/block-backend.h"
+#include "qapi/error.h"
+
+typedef struct BDRVTestState {
+    int drain_count;
+} BDRVTestState;
+
+static void coroutine_fn bdrv_test_co_drain_begin(BlockDriverState *bs)
+{
+    BDRVTestState *s = bs->opaque;
+    s->drain_count++;
+}
+
+static void coroutine_fn bdrv_test_co_drain_end(BlockDriverState *bs)
+{
+    BDRVTestState *s = bs->opaque;
+    s->drain_count--;
+}
+
+static void bdrv_test_close(BlockDriverState *bs)
+{
+    BDRVTestState *s = bs->opaque;
+    g_assert_cmpint(s->drain_count, >, 0);
+}
+
+static int coroutine_fn bdrv_test_co_preadv(BlockDriverState *bs,
+                                            uint64_t offset, uint64_t bytes,
+                                            QEMUIOVector *qiov, int flags)
+{
+    /* We want this request to stay until the polling loop in drain waits for
+     * it to complete. We need to sleep a while as bdrv_drain_invoke() comes
+     * first and polls its result, too, but it shouldn't accidentally complete
+     * this request yet. */
+    qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
+
+    return 0;
+}
+
+static BlockDriver bdrv_test = {
+    .format_name            = "test",
+    .instance_size          = sizeof(BDRVTestState),
+
+    .bdrv_close             = bdrv_test_close,
+    .bdrv_co_preadv         = bdrv_test_co_preadv,
+
+    .bdrv_co_drain_begin    = bdrv_test_co_drain_begin,
+    .bdrv_co_drain_end      = bdrv_test_co_drain_end,
+};
+
+static void aio_ret_cb(void *opaque, int ret)
+{
+    int *aio_ret = opaque;
+    *aio_ret = ret;
+}
+
+static void test_drv_cb_drain_all(void)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs;
+    BDRVTestState *s;
+    BlockAIOCB *acb;
+    int aio_ret;
+
+    QEMUIOVector qiov;
+    struct iovec iov = {
+        .iov_base = NULL,
+        .iov_len = 0,
+    };
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
+                              &error_abort);
+    s = bs->opaque;
+    blk_insert_bs(blk, bs, &error_abort);
+
+    /* Simple bdrv_drain_all_begin/end pair, check that CBs are called */
+    g_assert_cmpint(s->drain_count, ==, 0);
+    bdrv_drain_all_begin();
+    g_assert_cmpint(s->drain_count, ==, 1);
+    bdrv_drain_all_end();
+    g_assert_cmpint(s->drain_count, ==, 0);
+
+    /* Now do the same while a request is pending */
+    aio_ret = -EINPROGRESS;
+    acb = blk_aio_preadv(blk, 0, &qiov, 0, aio_ret_cb, &aio_ret);
+    g_assert(acb != NULL);
+    g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
+
+    g_assert_cmpint(s->drain_count, ==, 0);
+    bdrv_drain_all_begin();
+    g_assert_cmpint(aio_ret, ==, 0);
+    g_assert_cmpint(s->drain_count, ==, 1);
+    bdrv_drain_all_end();
+    g_assert_cmpint(s->drain_count, ==, 0);
+
+    bdrv_unref(bs);
+    blk_unref(blk);
+}
+
+int main(int argc, char **argv)
+{
+    bdrv_init();
+    qemu_init_main_loop(&error_abort);
+
+    g_test_init(&argc, &argv, NULL);
+
+    g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
+
+    return g_test_run();
+}
diff --git a/tests/Makefile.include b/tests/Makefile.include
index XXXXXXX..XXXXXXX 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -XXX,XX +XXX,XX @@ gcov-files-test-thread-pool-y = thread-pool.c
 gcov-files-test-hbitmap-y = util/hbitmap.c
 check-unit-y += tests/test-hbitmap$(EXESUF)
 gcov-files-test-hbitmap-y = blockjob.c
+check-unit-y += tests/test-bdrv-drain$(EXESUF)
 check-unit-y += tests/test-blockjob$(EXESUF)
 check-unit-y += tests/test-blockjob-txn$(EXESUF)
 check-unit-y += tests/test-x86-cpuid$(EXESUF)
@@ -XXX,XX +XXX,XX @@ tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
 tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
 tests/test-aio-multithread$(EXESUF): tests/test-aio-multithread.o $(test-block-obj-y)
 tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
+tests/test-bdrv-drain$(EXESUF): tests/test-bdrv-drain.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y)
--
2.13.6

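The test above uses the stock GLib unit-test harness. For readers who have
not seen it, an equivalently shaped standalone harness looks like this,
with the drain calls replaced by a plain counter (compile with
gcc demo.c $(pkg-config --cflags --libs glib-2.0)):

  #include <glib.h>

  static int drain_count;

  static void test_counting(void)
  {
      drain_count++;                        /* bdrv_drain_all_begin() stand-in */
      g_assert_cmpint(drain_count, ==, 1);
      drain_count--;                        /* bdrv_drain_all_end() stand-in */
      g_assert_cmpint(drain_count, ==, 0);
  }

  int main(int argc, char **argv)
  {
      g_test_init(&argc, &argv, NULL);
      g_test_add_func("/demo/counting", test_counting);
      return g_test_run();
  }
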
Now that the bdrv_drain_invoke() calls are pulled up to the callers of
bdrv_drain_recurse(), the 'begin' parameter isn't needed any more.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
     }
 }

-static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
+static bool bdrv_drain_recurse(BlockDriverState *bs)
 {
     BdrvChild *child, *tmp;
     bool waited;

@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
          */
         bdrv_ref(bs);
         }
-        waited |= bdrv_drain_recurse(bs, begin);
+        waited |= bdrv_drain_recurse(bs);
         if (in_main_loop) {
             bdrv_unref(bs);
         }
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
     bdrv_drain_invoke(bs, true);
-    bdrv_drain_recurse(bs, true);
+    bdrv_drain_recurse(bs);
 }

 void bdrv_drained_end(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
     bdrv_parent_drained_end(bs);
     bdrv_drain_invoke(bs, false);
-    bdrv_drain_recurse(bs, false);
+    bdrv_drain_recurse(bs);
     aio_enable_external(bdrv_get_aio_context(bs));
 }

@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         aio_context_acquire(aio_context);
         for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
             if (aio_context == bdrv_get_aio_context(bs)) {
-                waited |= bdrv_drain_recurse(bs, true);
+                waited |= bdrv_drain_recurse(bs);
             }
         }
         aio_context_release(aio_context);
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         aio_enable_external(aio_context);
         bdrv_parent_drained_end(bs);
         bdrv_drain_invoke(bs, false);
-        bdrv_drain_recurse(bs, false);
+        bdrv_drain_recurse(bs);
         aio_context_release(aio_context);
     }
--
2.13.6

1
All involved nodes are already idle, we called bdrv_do_drain_begin() on
1
The device is drained, so there is no point in waiting for requests at
2
them.
2
the end of the drained section. Remove the bdrv_drain_recurse() calls
3
there.
3
4
4
The comment in the code suggested that this was not correct because the
5
The bdrv_drain_recurse() calls were introduced in commit 481cad48e5e
5
completion of a request on one node could spawn a new request on a
6
in order to call the .bdrv_co_drain_end() driver callback. This is now
6
different node (which might have been drained before, so we wouldn't
7
done by a separate bdrv_drain_invoke() call.
7
drain the new request). In reality, new requests to different nodes
8
aren't spawned out of nothing, but only in the context of a parent
9
request, and they aren't submitted to random nodes, but only to child
10
nodes. As long as we still poll for the completion of the parent request
11
(which we do), draining each root node separately is good enough.
12
13
Remove the additional polling code from bdrv_drain_all_begin() and
14
replace it with an assertion that all nodes are already idle after we
15
drained them separately.
16
8
17
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
10
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
18
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
11
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
19
---
12
---
20
block/io.c | 41 ++++++++++++-----------------------------
13
block/io.c | 2 --
21
1 file changed, 12 insertions(+), 29 deletions(-)
14
1 file changed, 2 deletions(-)
22
15
23
diff --git a/block/io.c b/block/io.c
16
diff --git a/block/io.c b/block/io.c
24
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
25
--- a/block/io.c
18
--- a/block/io.c
26
+++ b/block/io.c
19
+++ b/block/io.c
27
@@ -XXX,XX +XXX,XX @@ void bdrv_drain(BlockDriverState *bs)
20
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
28
bdrv_drained_end(bs);
21
22
bdrv_parent_drained_end(bs);
23
bdrv_drain_invoke(bs, false);
24
- bdrv_drain_recurse(bs);
25
aio_enable_external(bdrv_get_aio_context(bs));
29
}
26
}
30
27
31
+static void bdrv_drain_assert_idle(BlockDriverState *bs)
28
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
32
+{
29
aio_enable_external(aio_context);
33
+ BdrvChild *child, *next;
30
bdrv_parent_drained_end(bs);
34
+
31
bdrv_drain_invoke(bs, false);
35
+ assert(atomic_read(&bs->in_flight) == 0);
32
- bdrv_drain_recurse(bs);
36
+ QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
37
+ bdrv_drain_assert_idle(child->bs);
38
+ }
39
+}
40
+
41
/*
42
* Wait for pending requests to complete across all BlockDriverStates
43
*
44
@@ -XXX,XX +XXX,XX @@ void bdrv_drain(BlockDriverState *bs)
45
*/
46
void bdrv_drain_all_begin(void)
47
{
48
- /* Always run first iteration so any pending completion BHs run */
49
- bool waited = true;
50
BlockDriverState *bs;
51
BdrvNextIterator it;
52
- GSList *aio_ctxs = NULL, *ctx;
53
54
/* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
55
* or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
56
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
57
aio_context_acquire(aio_context);
58
bdrv_do_drained_begin(bs, true, NULL);
59
aio_context_release(aio_context);
33
aio_context_release(aio_context);
60
-
61
- if (!g_slist_find(aio_ctxs, aio_context)) {
62
- aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
63
- }
64
}
34
}
65
35
66
- /* Note that completion of an asynchronous I/O operation can trigger any
67
- * number of other I/O operations on other devices---for example a
68
- * coroutine can submit an I/O request to another device in response to
69
- * request completion. Therefore we must keep looping until there was no
70
- * more activity rather than simply draining each device independently.
71
- */
72
- while (waited) {
73
- waited = false;
74
-
75
- for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
76
- AioContext *aio_context = ctx->data;
77
-
78
- aio_context_acquire(aio_context);
79
- for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
80
- if (aio_context == bdrv_get_aio_context(bs)) {
81
- waited |= bdrv_drain_recurse(bs);
82
- }
83
- }
84
- aio_context_release(aio_context);
85
- }
86
+ for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
87
+ bdrv_drain_assert_idle(bs);
88
}
89
-
90
- g_slist_free(aio_ctxs);
91
}
92
93
void bdrv_drain_all_end(void)
94
--
2.13.6
1
bdrv_do_drain_begin/end() already implement everything that
1
Drain requests are propagated to child nodes, parent nodes and directly
2
bdrv_drain_all_begin/end() need and currently still do manually: Disable
2
to the AioContext. The order in which this happened was different
3
external events, call parent drain callbacks, call block driver
3
between all combinations of drain/drain_all and begin/end.
4
callbacks.
5
4
6
It also does two more things:
5
The correct order is to keep children only drained when their parents
6
are also drained. This means that at the start of a drained section, the
7
AioContext needs to be drained first, the parents second and only then
8
the children. The correct order for the end of a drained section is the
9
opposite.
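
To make that ordering concrete, here is a minimal self-contained C sketch.
It is a toy model, not QEMU code: the Node type and function names are
invented, and a single child stands in for the whole graph. It shows the
invariant that a child is only ever drained while its parent is drained,
i.e. begin works parent-to-child and end works child-to-parent:

#include <assert.h>
#include <stddef.h>

typedef struct Node {
    int quiesce_counter;
    struct Node *child;                /* one child keeps the model small */
} Node;

static void toy_drained_begin(Node *n)
{
    if (n) {
        n->quiesce_counter++;          /* stop the parent first... */
        toy_drained_begin(n->child);   /* ...then its children */
    }
}

static void toy_drained_end(Node *n)
{
    if (n) {
        toy_drained_end(n->child);     /* resume the children first... */
        n->quiesce_counter--;          /* ...then the parent */
    }
}

int main(void)
{
    Node backing = { 0, NULL };
    Node overlay = { 0, &backing };

    toy_drained_begin(&overlay);
    assert(overlay.quiesce_counter == 1 && backing.quiesce_counter == 1);
    toy_drained_end(&overlay);
    assert(overlay.quiesce_counter == 0 && backing.quiesce_counter == 0);
    return 0;
}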
7
10
8
The first is incrementing bs->quiesce_counter. bdrv_drain_all() already
11
This patch changes the three other functions to follow the example of
9
stood out in the test case by behaving differently from the other drain
12
bdrv_drained_begin(), which is the only one that got it right.
10
variants. Adding this is not only safe, but in fact a bug fix.
11
12
The second is calling bdrv_drain_recurse(). We already do that later in
13
the same function in a loop, so basically doing an early first iteration
14
doesn't hurt.
15
13
16
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
14
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
17
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
15
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
18
---
16
---
19
block/io.c | 10 ++--------
17
block/io.c | 12 ++++++++----
20
tests/test-bdrv-drain.c | 14 ++++----------
18
1 file changed, 8 insertions(+), 4 deletions(-)
21
2 files changed, 6 insertions(+), 18 deletions(-)
22
19
23
diff --git a/block/io.c b/block/io.c
20
diff --git a/block/io.c b/block/io.c
24
index XXXXXXX..XXXXXXX 100644
21
index XXXXXXX..XXXXXXX 100644
25
--- a/block/io.c
22
--- a/block/io.c
26
+++ b/block/io.c
23
+++ b/block/io.c
24
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
25
return;
26
}
27
28
+ /* Stop things in parent-to-child order */
29
if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
30
aio_disable_external(bdrv_get_aio_context(bs));
31
bdrv_parent_drained_begin(bs);
32
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
33
return;
34
}
35
36
- bdrv_parent_drained_end(bs);
37
+ /* Re-enable things in child-to-parent order */
38
bdrv_drain_invoke(bs, false);
39
+ bdrv_parent_drained_end(bs);
40
aio_enable_external(bdrv_get_aio_context(bs));
41
}
42
27
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
43
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
28
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
44
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
29
AioContext *aio_context = bdrv_get_aio_context(bs);
45
AioContext *aio_context = bdrv_get_aio_context(bs);
30
46
31
- /* Stop things in parent-to-child order */
47
+ /* Stop things in parent-to-child order */
32
aio_context_acquire(aio_context);
48
aio_context_acquire(aio_context);
33
- aio_disable_external(aio_context);
49
- bdrv_parent_drained_begin(bs);
34
- bdrv_parent_drained_begin(bs, NULL);
50
aio_disable_external(aio_context);
35
- bdrv_drain_invoke(bs, true, true);
51
+ bdrv_parent_drained_begin(bs);
36
+ bdrv_do_drained_begin(bs, true, NULL);
52
bdrv_drain_invoke(bs, true);
37
aio_context_release(aio_context);
53
aio_context_release(aio_context);
38
54
39
if (!g_slist_find(aio_ctxs, aio_context)) {
40
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
55
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
41
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
56
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
42
AioContext *aio_context = bdrv_get_aio_context(bs);
57
AioContext *aio_context = bdrv_get_aio_context(bs);
43
58
44
- /* Re-enable things in child-to-parent order */
59
+ /* Re-enable things in child-to-parent order */
45
aio_context_acquire(aio_context);
60
aio_context_acquire(aio_context);
46
- bdrv_drain_invoke(bs, false, true);
47
- bdrv_parent_drained_end(bs, NULL);
48
- aio_enable_external(aio_context);
61
- aio_enable_external(aio_context);
49
+ bdrv_do_drained_end(bs, true, NULL);
62
- bdrv_parent_drained_end(bs);
63
bdrv_drain_invoke(bs, false);
64
+ bdrv_parent_drained_end(bs);
65
+ aio_enable_external(aio_context);
50
aio_context_release(aio_context);
66
aio_context_release(aio_context);
51
}
67
}
52
}
68
53
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
54
index XXXXXXX..XXXXXXX 100644
55
--- a/tests/test-bdrv-drain.c
56
+++ b/tests/test-bdrv-drain.c
57
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_common(enum drain_type drain_type, bool recursive)
58
59
static void test_quiesce_drain_all(void)
60
{
61
- // XXX drain_all doesn't quiesce
62
- //test_quiesce_common(BDRV_DRAIN_ALL, true);
63
+ test_quiesce_common(BDRV_DRAIN_ALL, true);
64
}
65
66
static void test_quiesce_drain(void)
67
@@ -XXX,XX +XXX,XX @@ static void test_nested(void)
68
69
for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) {
70
for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) {
71
- /* XXX bdrv_drain_all() doesn't increase the quiesce_counter */
72
- int bs_quiesce = (outer != BDRV_DRAIN_ALL) +
73
- (inner != BDRV_DRAIN_ALL);
74
- int backing_quiesce = (outer == BDRV_SUBTREE_DRAIN) +
75
- (inner == BDRV_SUBTREE_DRAIN);
76
- int backing_cb_cnt = (outer != BDRV_DRAIN) +
77
+ int backing_quiesce = (outer != BDRV_DRAIN) +
78
(inner != BDRV_DRAIN);
79
80
g_assert_cmpint(bs->quiesce_counter, ==, 0);
81
@@ -XXX,XX +XXX,XX @@ static void test_nested(void)
82
do_drain_begin(outer, bs);
83
do_drain_begin(inner, bs);
84
85
- g_assert_cmpint(bs->quiesce_counter, ==, bs_quiesce);
86
+ g_assert_cmpint(bs->quiesce_counter, ==, 2);
87
g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce);
88
g_assert_cmpint(s->drain_count, ==, 2);
89
- g_assert_cmpint(backing_s->drain_count, ==, backing_cb_cnt);
90
+ g_assert_cmpint(backing_s->drain_count, ==, backing_quiesce);
91
92
do_drain_end(inner, bs);
93
do_drain_end(outer, bs);
94
--
2.13.6
1
From: Max Reitz <mreitz@redhat.com>
1
Commit 15afd94a047 added code to acquire and release the AioContext in
2
qemuio_command(). This means that the lock is taken twice now in the
3
call path from hmp_qemu_io(). This causes BDRV_POLL_WHILE() to hang for
4
any requests issued to nodes in a non-mainloop AioContext.
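
A rough model of why this hangs (invented names, not the real QEMU locking
code): the AioContext lock is recursive, and the polling loop drops exactly
one level of it before waiting, so the redundant second acquisition keeps
the lock held and the I/O thread can never take it to complete the request.

#include <assert.h>
#include <stdbool.h>

static int lock_depth;                 /* recursion depth of the ctx lock */

static void ctx_acquire(void) { lock_depth++; }
static void ctx_release(void) { assert(lock_depth > 0); lock_depth--; }

/* The request only completes once the I/O thread can take the lock. */
static bool iothread_can_progress(void)
{
    return lock_depth == 0;
}

int main(void)
{
    ctx_acquire();                     /* hmp_qemu_io(): the redundant lock */
    ctx_acquire();                     /* qemuio_command() */
    ctx_release();                     /* the poll loop drops one level... */
    assert(!iothread_can_progress());  /* ...but the lock stays held: hang */
    ctx_release();

    ctx_acquire();                     /* with the fix: taken only once */
    ctx_release();                     /* now the poll loop really drops it */
    assert(iothread_can_progress());
    return 0;
}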
2
5
3
This new parameter allows the caller to just query the next dirty
6
Dropping the first locking from hmp_qemu_io() fixes the problem.
4
position without moving the iterator.
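
A toy model of the new flag, with invented names and a single 64-bit word
standing in for the multi-level bitmap (this is a sketch, not the HBitmap
code itself): with advance=false the same position is reported again on
the next call.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct { uint64_t cur; } ToyIter;

static int64_t toy_iter_next(ToyIter *it, bool advance)
{
    if (it->cur == 0) {
        return -1;                          /* no set bits left */
    }
    uint64_t lowest = it->cur & -it->cur;   /* isolate the lowest set bit */
    int64_t pos = 0;
    while (!(lowest & 1)) {
        lowest >>= 1;
        pos++;
    }
    if (advance) {
        it->cur &= it->cur - 1;        /* consume the bit, as hbitmap does */
    }
    return pos;
}

int main(void)
{
    ToyIter it = { .cur = 0x28 };      /* bits 3 and 5 set */

    assert(toy_iter_next(&it, false) == 3);   /* peek: iterator unmoved */
    assert(toy_iter_next(&it, true) == 3);    /* same result, now consumed */
    assert(toy_iter_next(&it, true) == 5);
    assert(toy_iter_next(&it, true) == -1);
    return 0;
}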
5
7
6
Signed-off-by: Max Reitz <mreitz@redhat.com>
8
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
7
Reviewed-by: Fam Zheng <famz@redhat.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
8
Reviewed-by: John Snow <jsnow@redhat.com>
9
Message-id: 20180613181823.13618-8-mreitz@redhat.com
10
Signed-off-by: Max Reitz <mreitz@redhat.com>
11
---
10
---
12
include/qemu/hbitmap.h | 5 ++++-
11
hmp.c | 6 ------
13
block/backup.c | 2 +-
12
1 file changed, 6 deletions(-)
14
block/dirty-bitmap.c | 2 +-
15
tests/test-hbitmap.c | 26 +++++++++++++-------------
16
util/hbitmap.c | 10 +++++++---
17
5 files changed, 26 insertions(+), 19 deletions(-)
18
13
19
diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h
14
diff --git a/hmp.c b/hmp.c
20
index XXXXXXX..XXXXXXX 100644
15
index XXXXXXX..XXXXXXX 100644
21
--- a/include/qemu/hbitmap.h
16
--- a/hmp.c
22
+++ b/include/qemu/hbitmap.h
17
+++ b/hmp.c
23
@@ -XXX,XX +XXX,XX @@ void hbitmap_free_meta(HBitmap *hb);
18
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
24
/**
25
* hbitmap_iter_next:
26
* @hbi: HBitmapIter to operate on.
27
+ * @advance: If true, advance the iterator. Otherwise, the next call
28
+ * of this function will return the same result (if that
29
+ * position is still dirty).
30
*
31
* Return the next bit that is set in @hbi's associated HBitmap,
32
* or -1 if all remaining bits are zero.
33
*/
34
-int64_t hbitmap_iter_next(HBitmapIter *hbi);
35
+int64_t hbitmap_iter_next(HBitmapIter *hbi, bool advance);
36
37
/**
38
* hbitmap_iter_next_word:
39
diff --git a/block/backup.c b/block/backup.c
40
index XXXXXXX..XXXXXXX 100644
41
--- a/block/backup.c
42
+++ b/block/backup.c
43
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
44
HBitmapIter hbi;
45
46
hbitmap_iter_init(&hbi, job->copy_bitmap, 0);
47
- while ((cluster = hbitmap_iter_next(&hbi)) != -1) {
48
+ while ((cluster = hbitmap_iter_next(&hbi, true)) != -1) {
49
do {
50
if (yield_and_check(job)) {
51
return 0;
52
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
53
index XXXXXXX..XXXXXXX 100644
54
--- a/block/dirty-bitmap.c
55
+++ b/block/dirty-bitmap.c
56
@@ -XXX,XX +XXX,XX @@ void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter)
57
58
int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter)
59
{
19
{
60
- return hbitmap_iter_next(&iter->hbi);
20
BlockBackend *blk;
61
+ return hbitmap_iter_next(&iter->hbi, true);
21
BlockBackend *local_blk = NULL;
62
}
22
- AioContext *aio_context;
63
23
const char* device = qdict_get_str(qdict, "device");
64
/* Called within bdrv_dirty_bitmap_lock..unlock */
24
const char* command = qdict_get_str(qdict, "command");
65
diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c
25
Error *err = NULL;
66
index XXXXXXX..XXXXXXX 100644
26
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
67
--- a/tests/test-hbitmap.c
68
+++ b/tests/test-hbitmap.c
69
@@ -XXX,XX +XXX,XX @@ static void hbitmap_test_check(TestHBitmapData *data,
70
71
i = first;
72
for (;;) {
73
- next = hbitmap_iter_next(&hbi);
74
+ next = hbitmap_iter_next(&hbi, true);
75
if (next < 0) {
76
next = data->size;
77
}
78
@@ -XXX,XX +XXX,XX @@ static void test_hbitmap_iter_granularity(TestHBitmapData *data,
79
/* Note that hbitmap_test_check has to be invoked manually in this test. */
80
hbitmap_test_init(data, 131072 << 7, 7);
81
hbitmap_iter_init(&hbi, data->hb, 0);
82
- g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
83
+ g_assert_cmpint(hbitmap_iter_next(&hbi, true), <, 0);
84
85
hbitmap_test_set(data, ((L2 + L1 + 1) << 7) + 8, 8);
86
hbitmap_iter_init(&hbi, data->hb, 0);
87
- g_assert_cmpint(hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7);
88
- g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
89
+ g_assert_cmpint(hbitmap_iter_next(&hbi, true), ==, (L2 + L1 + 1) << 7);
90
+ g_assert_cmpint(hbitmap_iter_next(&hbi, true), <, 0);
91
92
hbitmap_iter_init(&hbi, data->hb, (L2 + L1 + 2) << 7);
93
- g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
94
+ g_assert_cmpint(hbitmap_iter_next(&hbi, true), <, 0);
95
96
hbitmap_test_set(data, (131072 << 7) - 8, 8);
97
hbitmap_iter_init(&hbi, data->hb, 0);
98
- g_assert_cmpint(hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7);
99
- g_assert_cmpint(hbitmap_iter_next(&hbi), ==, 131071 << 7);
100
- g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
101
+ g_assert_cmpint(hbitmap_iter_next(&hbi, true), ==, (L2 + L1 + 1) << 7);
102
+ g_assert_cmpint(hbitmap_iter_next(&hbi, true), ==, 131071 << 7);
103
+ g_assert_cmpint(hbitmap_iter_next(&hbi, true), <, 0);
104
105
hbitmap_iter_init(&hbi, data->hb, (L2 + L1 + 2) << 7);
106
- g_assert_cmpint(hbitmap_iter_next(&hbi), ==, 131071 << 7);
107
- g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
108
+ g_assert_cmpint(hbitmap_iter_next(&hbi, true), ==, 131071 << 7);
109
+ g_assert_cmpint(hbitmap_iter_next(&hbi, true), <, 0);
110
}
111
112
static void hbitmap_test_set_boundary_bits(TestHBitmapData *data, ssize_t diff)
113
@@ -XXX,XX +XXX,XX @@ static void test_hbitmap_serialize_zeroes(TestHBitmapData *data,
114
for (i = 0; i < num_positions; i++) {
115
hbitmap_deserialize_zeroes(data->hb, positions[i], min_l1, true);
116
hbitmap_iter_init(&iter, data->hb, 0);
117
- next = hbitmap_iter_next(&iter);
118
+ next = hbitmap_iter_next(&iter, true);
119
if (i == num_positions - 1) {
120
g_assert_cmpint(next, ==, -1);
121
} else {
122
@@ -XXX,XX +XXX,XX @@ static void test_hbitmap_iter_and_reset(TestHBitmapData *data,
123
124
hbitmap_iter_init(&hbi, data->hb, BITS_PER_LONG - 1);
125
126
- hbitmap_iter_next(&hbi);
127
+ hbitmap_iter_next(&hbi, true);
128
129
hbitmap_reset_all(data->hb);
130
- hbitmap_iter_next(&hbi);
131
+ hbitmap_iter_next(&hbi, true);
132
}
133
134
static void test_hbitmap_next_zero_check(TestHBitmapData *data, int64_t start)
135
diff --git a/util/hbitmap.c b/util/hbitmap.c
136
index XXXXXXX..XXXXXXX 100644
137
--- a/util/hbitmap.c
138
+++ b/util/hbitmap.c
139
@@ -XXX,XX +XXX,XX @@ unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi)
140
return cur;
141
}
142
143
-int64_t hbitmap_iter_next(HBitmapIter *hbi)
144
+int64_t hbitmap_iter_next(HBitmapIter *hbi, bool advance)
145
{
146
unsigned long cur = hbi->cur[HBITMAP_LEVELS - 1] &
147
hbi->hb->levels[HBITMAP_LEVELS - 1][hbi->pos];
148
@@ -XXX,XX +XXX,XX @@ int64_t hbitmap_iter_next(HBitmapIter *hbi)
149
}
27
}
150
}
28
}
151
29
152
- /* The next call will resume work from the next bit. */
30
- aio_context = blk_get_aio_context(blk);
153
- hbi->cur[HBITMAP_LEVELS - 1] = cur & (cur - 1);
31
- aio_context_acquire(aio_context);
154
+ if (advance) {
32
-
155
+ /* The next call will resume work from the next bit. */
33
/*
156
+ hbi->cur[HBITMAP_LEVELS - 1] = cur & (cur - 1);
34
* Notably absent: Proper permission management. This is sad, but it seems
157
+ } else {
35
* almost impossible to achieve without changing the semantics and thereby
158
+ hbi->cur[HBITMAP_LEVELS - 1] = cur;
36
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
159
+ }
37
*/
160
item = ((uint64_t)hbi->pos << BITS_PER_LEVEL) + ctzl(cur);
38
qemuio_command(blk, command);
161
39
162
return item << hbi->granularity;
40
- aio_context_release(aio_context);
41
-
42
fail:
43
blk_unref(local_blk);
44
hmp_handle_error(mon, &err);
163
--
2.13.6
1
From: Max Reitz <mreitz@redhat.com>
1
From: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>
2
2
3
Add a function that wraps hbitmap_iter_next() and always calls it in
3
Since bdrv_co_preadv does all necessary checks including
4
non-advancing mode first, and in advancing mode next. The result should
4
reading after the end of the backing file, avoid duplication
5
always be the same.
5
of this verification before the bdrv_co_preadv() call.
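
For reference, a toy model of the semantics that no longer need to be
open-coded (invented names, not the qcow2 implementation): a read that
crosses the backing file's EOF returns the stored bytes up to EOF and
zeroes beyond it, which bdrv_co_preadv now guarantees by itself.

#include <assert.h>
#include <stddef.h>
#include <string.h>

/* Read @len bytes at @offset from a @size-byte "backing file" @data,
 * zero-filling everything past EOF. */
static void backing_read(const char *data, size_t size,
                         size_t offset, char *buf, size_t len)
{
    size_t stored = 0;

    if (offset < size) {
        stored = size - offset;
        if (stored > len) {
            stored = len;
        }
        memcpy(buf, data + offset, stored);
    }
    memset(buf + stored, 0, len - stored);   /* tail past EOF reads as zero */
}

int main(void)
{
    const char backing[4] = "abcd";          /* 4-byte backing file */
    char buf[8];

    backing_read(backing, sizeof(backing), 2, buf, 8);
    assert(memcmp(buf, "cd\0\0\0\0\0\0", 8) == 0);
    return 0;
}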
6
6
7
By using this function everywhere we called hbitmap_iter_next() before,
7
Signed-off-by: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>
8
we should get good test coverage for non-advancing hbitmap_iter_next().
8
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
9
Reviewed-by: Eric Blake <eblake@redhat.com>
10
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
11
---
12
block/qcow2.h | 3 ---
13
block/qcow2.c | 51 ++++++++-------------------------------------------
14
2 files changed, 8 insertions(+), 46 deletions(-)
9
15
10
Signed-off-by: Max Reitz <mreitz@redhat.com>
16
diff --git a/block/qcow2.h b/block/qcow2.h
11
Reviewed-by: Fam Zheng <famz@redhat.com>
12
Reviewed-by: John Snow <jsnow@redhat.com>
13
Message-id: 20180613181823.13618-9-mreitz@redhat.com
14
Signed-off-by: Max Reitz <mreitz@redhat.com>
15
---
16
tests/test-hbitmap.c | 36 ++++++++++++++++++++++++------------
17
1 file changed, 24 insertions(+), 12 deletions(-)
18
19
diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c
20
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
21
--- a/tests/test-hbitmap.c
18
--- a/block/qcow2.h
22
+++ b/tests/test-hbitmap.c
19
+++ b/block/qcow2.h
23
@@ -XXX,XX +XXX,XX @@ typedef struct TestHBitmapData {
20
@@ -XXX,XX +XXX,XX @@ uint32_t offset_to_reftable_index(BDRVQcow2State *s, uint64_t offset)
24
} TestHBitmapData;
25
26
27
+static int64_t check_hbitmap_iter_next(HBitmapIter *hbi)
28
+{
29
+ int next0, next1;
30
+
31
+ next0 = hbitmap_iter_next(hbi, false);
32
+ next1 = hbitmap_iter_next(hbi, true);
33
+
34
+ g_assert_cmpint(next0, ==, next1);
35
+
36
+ return next0;
37
+}
38
+
39
/* Check that the HBitmap and the shadow bitmap contain the same data,
40
* ignoring the same "first" bits.
41
*/
42
@@ -XXX,XX +XXX,XX @@ static void hbitmap_test_check(TestHBitmapData *data,
43
44
i = first;
45
for (;;) {
46
- next = hbitmap_iter_next(&hbi, true);
47
+ next = check_hbitmap_iter_next(&hbi);
48
if (next < 0) {
49
next = data->size;
50
}
51
@@ -XXX,XX +XXX,XX @@ static void test_hbitmap_iter_granularity(TestHBitmapData *data,
52
/* Note that hbitmap_test_check has to be invoked manually in this test. */
53
hbitmap_test_init(data, 131072 << 7, 7);
54
hbitmap_iter_init(&hbi, data->hb, 0);
55
- g_assert_cmpint(hbitmap_iter_next(&hbi, true), <, 0);
56
+ g_assert_cmpint(check_hbitmap_iter_next(&hbi), <, 0);
57
58
hbitmap_test_set(data, ((L2 + L1 + 1) << 7) + 8, 8);
59
hbitmap_iter_init(&hbi, data->hb, 0);
60
- g_assert_cmpint(hbitmap_iter_next(&hbi, true), ==, (L2 + L1 + 1) << 7);
61
- g_assert_cmpint(hbitmap_iter_next(&hbi, true), <, 0);
62
+ g_assert_cmpint(check_hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7);
63
+ g_assert_cmpint(check_hbitmap_iter_next(&hbi), <, 0);
64
65
hbitmap_iter_init(&hbi, data->hb, (L2 + L1 + 2) << 7);
66
g_assert_cmpint(hbitmap_iter_next(&hbi, true), <, 0);
67
68
hbitmap_test_set(data, (131072 << 7) - 8, 8);
69
hbitmap_iter_init(&hbi, data->hb, 0);
70
- g_assert_cmpint(hbitmap_iter_next(&hbi, true), ==, (L2 + L1 + 1) << 7);
71
- g_assert_cmpint(hbitmap_iter_next(&hbi, true), ==, 131071 << 7);
72
- g_assert_cmpint(hbitmap_iter_next(&hbi, true), <, 0);
73
+ g_assert_cmpint(check_hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7);
74
+ g_assert_cmpint(check_hbitmap_iter_next(&hbi), ==, 131071 << 7);
75
+ g_assert_cmpint(check_hbitmap_iter_next(&hbi), <, 0);
76
77
hbitmap_iter_init(&hbi, data->hb, (L2 + L1 + 2) << 7);
78
- g_assert_cmpint(hbitmap_iter_next(&hbi, true), ==, 131071 << 7);
79
- g_assert_cmpint(hbitmap_iter_next(&hbi, true), <, 0);
80
+ g_assert_cmpint(check_hbitmap_iter_next(&hbi), ==, 131071 << 7);
81
+ g_assert_cmpint(check_hbitmap_iter_next(&hbi), <, 0);
82
}
21
}
83
22
84
static void hbitmap_test_set_boundary_bits(TestHBitmapData *data, ssize_t diff)
23
/* qcow2.c functions */
85
@@ -XXX,XX +XXX,XX @@ static void test_hbitmap_serialize_zeroes(TestHBitmapData *data,
24
-int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
86
for (i = 0; i < num_positions; i++) {
25
- int64_t sector_num, int nb_sectors);
87
hbitmap_deserialize_zeroes(data->hb, positions[i], min_l1, true);
26
-
88
hbitmap_iter_init(&iter, data->hb, 0);
27
int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
89
- next = hbitmap_iter_next(&iter, true);
28
int refcount_order, bool generous_increase,
90
+ next = check_hbitmap_iter_next(&iter);
29
uint64_t *refblock_count);
91
if (i == num_positions - 1) {
30
diff --git a/block/qcow2.c b/block/qcow2.c
92
g_assert_cmpint(next, ==, -1);
31
index XXXXXXX..XXXXXXX 100644
93
} else {
32
--- a/block/qcow2.c
94
@@ -XXX,XX +XXX,XX @@ static void test_hbitmap_iter_and_reset(TestHBitmapData *data,
33
+++ b/block/qcow2.c
95
34
@@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
96
hbitmap_iter_init(&hbi, data->hb, BITS_PER_LONG - 1);
35
return status;
97
98
- hbitmap_iter_next(&hbi, true);
99
+ check_hbitmap_iter_next(&hbi);
100
101
hbitmap_reset_all(data->hb);
102
- hbitmap_iter_next(&hbi, true);
103
+ check_hbitmap_iter_next(&hbi);
104
}
36
}
105
37
106
static void test_hbitmap_next_zero_check(TestHBitmapData *data, int64_t start)
38
-/* handle reading after the end of the backing file */
39
-int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
40
- int64_t offset, int bytes)
41
-{
42
- uint64_t bs_size = bs->total_sectors * BDRV_SECTOR_SIZE;
43
- int n1;
44
-
45
- if ((offset + bytes) <= bs_size) {
46
- return bytes;
47
- }
48
-
49
- if (offset >= bs_size) {
50
- n1 = 0;
51
- } else {
52
- n1 = bs_size - offset;
53
- }
54
-
55
- qemu_iovec_memset(qiov, n1, 0, bytes - n1);
56
-
57
- return n1;
58
-}
59
-
60
static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
61
uint64_t bytes, QEMUIOVector *qiov,
62
int flags)
63
{
64
BDRVQcow2State *s = bs->opaque;
65
- int offset_in_cluster, n1;
66
+ int offset_in_cluster;
67
int ret;
68
unsigned int cur_bytes; /* number of bytes in current iteration */
69
uint64_t cluster_offset = 0;
70
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
71
case QCOW2_CLUSTER_UNALLOCATED:
72
73
if (bs->backing) {
74
- /* read from the base image */
75
- n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov,
76
- offset, cur_bytes);
77
- if (n1 > 0) {
78
- QEMUIOVector local_qiov;
79
-
80
- qemu_iovec_init(&local_qiov, hd_qiov.niov);
81
- qemu_iovec_concat(&local_qiov, &hd_qiov, 0, n1);
82
-
83
- BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
84
- qemu_co_mutex_unlock(&s->lock);
85
- ret = bdrv_co_preadv(bs->backing, offset, n1,
86
- &local_qiov, 0);
87
- qemu_co_mutex_lock(&s->lock);
88
-
89
- qemu_iovec_destroy(&local_qiov);
90
-
91
- if (ret < 0) {
92
- goto fail;
93
- }
94
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
95
+ qemu_co_mutex_unlock(&s->lock);
96
+ ret = bdrv_co_preadv(bs->backing, offset, cur_bytes,
97
+ &hd_qiov, 0);
98
+ qemu_co_mutex_lock(&s->lock);
99
+ if (ret < 0) {
100
+ goto fail;
101
}
102
} else {
103
/* Note: in this case, no need to wait */
107
--
2.13.6
1
From: Max Reitz <mreitz@redhat.com>
1
Removing a quorum child node with x-blockdev-change results in a quorum
2
driver state that cannot be recreated with create options because it
3
would require a list with gaps. This causes trouble in at least
4
.bdrv_refresh_filename().
2
5
3
This patch implements active synchronous mirroring. In active mode, the
6
Document this problem so that we won't accidentally mark the command
4
passive mechanism will still be in place and is used to copy all
7
stable without having addressed it.
5
initially dirty clusters off the source disk; but every write request
6
will write data both to the source and the target disk, so the source
7
cannot be dirtied faster than data is mirrored to the target. Also,
8
once the block job has converged (BLOCK_JOB_READY sent), source and
9
target are guaranteed to stay in sync (unless an error occurs).
10
8
11
Active mode is completely optional and currently disabled at runtime. A
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
12
later patch will add a way for users to enable it.
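
A minimal sketch of the difference between the two modes, as a toy model
with invented names rather than the mirror job itself: in write-blocking
mode a guest write returns only after the same bytes have reached the
target, while background mode merely records the range in the dirty
bitmap for a later copy.

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

enum { DISK_SIZE = 64 };

static char source[DISK_SIZE];
static char target[DISK_SIZE];
static bool dirty[DISK_SIZE];           /* stands in for the dirty bitmap */

static void guest_write(size_t off, const char *buf, size_t len,
                        bool write_blocking)
{
    memcpy(source + off, buf, len);
    if (write_blocking) {
        memcpy(target + off, buf, len); /* synchronous copy to the target */
    } else {
        memset(dirty + off, true, len); /* left for the background loop */
    }
}

int main(void)
{
    guest_write(0, "abc", 3, true);
    assert(memcmp(source, target, 3) == 0);  /* target can never lag */

    guest_write(8, "xyz", 3, false);
    assert(dirty[8]);                        /* target lags until copied */
    return 0;
}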
10
Reviewed-by: Alberto Garcia <berto@igalia.com>
13
14
Signed-off-by: Max Reitz <mreitz@redhat.com>
15
Reviewed-by: Fam Zheng <famz@redhat.com>
16
Message-id: 20180613181823.13618-13-mreitz@redhat.com
17
Signed-off-by: Max Reitz <mreitz@redhat.com>
18
---
11
---
19
qapi/block-core.json | 18 ++++
12
qapi/block-core.json | 4 ++++
20
block/mirror.c | 252 ++++++++++++++++++++++++++++++++++++++++++++++++++-
13
1 file changed, 4 insertions(+)
21
2 files changed, 265 insertions(+), 5 deletions(-)
22
14
23
diff --git a/qapi/block-core.json b/qapi/block-core.json
15
diff --git a/qapi/block-core.json b/qapi/block-core.json
24
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
25
--- a/qapi/block-core.json
17
--- a/qapi/block-core.json
26
+++ b/qapi/block-core.json
18
+++ b/qapi/block-core.json
27
@@ -XXX,XX +XXX,XX @@
19
@@ -XXX,XX +XXX,XX @@
28
'data': ['top', 'full', 'none', 'incremental'] }
20
# does not support all kinds of operations, all kinds of children, nor
29
21
# all block drivers.
30
##
22
#
31
+# @MirrorCopyMode:
23
+# FIXME Removing children from a quorum node means introducing gaps in the
24
+# child indices. This cannot be represented in the 'children' list of
25
+# BlockdevOptionsQuorum, as returned by .bdrv_refresh_filename().
32
+#
26
+#
33
+# An enumeration whose values tell the mirror block job when to
27
# Warning: The data in a new quorum child MUST be consistent with that of
34
+# trigger writes to the target.
28
# the rest of the array.
35
+#
36
+# @background: copy data in background only.
37
+#
38
+# @write-blocking: when data is written to the source, write it
39
+# (synchronously) to the target as well. In
40
+# addition, data is copied in background just like in
41
+# @background mode.
42
+#
43
+# Since: 3.0
44
+##
45
+{ 'enum': 'MirrorCopyMode',
46
+ 'data': ['background', 'write-blocking'] }
47
+
48
+##
49
# @BlockJobInfo:
50
#
29
#
51
# Information about a long-running block device operation.
52
diff --git a/block/mirror.c b/block/mirror.c
53
index XXXXXXX..XXXXXXX 100644
54
--- a/block/mirror.c
55
+++ b/block/mirror.c
56
@@ -XXX,XX +XXX,XX @@ typedef struct MirrorBlockJob {
57
Error *replace_blocker;
58
bool is_none_mode;
59
BlockMirrorBackingMode backing_mode;
60
+ MirrorCopyMode copy_mode;
61
BlockdevOnError on_source_error, on_target_error;
62
bool synced;
63
+ /* Set when the target is synced (dirty bitmap is clean, nothing
64
+ * in flight) and the job is running in active mode */
65
+ bool actively_synced;
66
bool should_complete;
67
int64_t granularity;
68
size_t buf_size;
69
@@ -XXX,XX +XXX,XX @@ typedef struct MirrorBlockJob {
70
int target_cluster_size;
71
int max_iov;
72
bool initial_zeroing_ongoing;
73
+ int in_active_write_counter;
74
} MirrorBlockJob;
75
76
typedef struct MirrorBDSOpaque {
77
@@ -XXX,XX +XXX,XX @@ struct MirrorOp {
78
int64_t *bytes_handled;
79
80
bool is_pseudo_op;
81
+ bool is_active_write;
82
CoQueue waiting_requests;
83
84
QTAILQ_ENTRY(MirrorOp) next;
85
@@ -XXX,XX +XXX,XX @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
86
int error)
87
{
88
s->synced = false;
89
+ s->actively_synced = false;
90
if (read) {
91
return block_job_error_action(&s->common, s->on_source_error,
92
true, error);
93
@@ -XXX,XX +XXX,XX @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset,
94
return ret;
95
}
96
97
-static inline void mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s)
98
+static inline void mirror_wait_for_any_operation(MirrorBlockJob *s, bool active)
99
{
100
MirrorOp *op;
101
102
@@ -XXX,XX +XXX,XX @@ static inline void mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s)
103
* caller of this function. Since there is only one pseudo op
104
* at any given time, we will always find some real operation
105
* to wait on. */
106
- if (!op->is_pseudo_op) {
107
+ if (!op->is_pseudo_op && op->is_active_write == active) {
108
qemu_co_queue_wait(&op->waiting_requests, NULL);
109
return;
110
}
111
@@ -XXX,XX +XXX,XX @@ static inline void mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s)
112
abort();
113
}
114
115
+static inline void mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s)
116
+{
117
+ /* Only non-active operations use up in-flight slots */
118
+ mirror_wait_for_any_operation(s, false);
119
+}
120
+
121
/* Perform a mirror copy operation.
122
*
123
* *op->bytes_handled is set to the number of bytes copied after and
124
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_run(void *opaque)
125
/* Transition to the READY state and wait for complete. */
126
job_transition_to_ready(&s->common.job);
127
s->synced = true;
128
+ s->actively_synced = true;
129
while (!job_is_cancelled(&s->common.job) && !s->should_complete) {
130
job_yield(&s->common.job);
131
}
132
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_run(void *opaque)
133
int64_t cnt, delta;
134
bool should_complete;
135
136
+ /* Do not start passive operations while there are active
137
+ * writes in progress */
138
+ while (s->in_active_write_counter) {
139
+ mirror_wait_for_any_operation(s, true);
140
+ }
141
+
142
if (s->ret < 0) {
143
ret = s->ret;
144
goto immediate_exit;
145
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_run(void *opaque)
146
*/
147
job_transition_to_ready(&s->common.job);
148
s->synced = true;
149
+ if (s->copy_mode != MIRROR_COPY_MODE_BACKGROUND) {
150
+ s->actively_synced = true;
151
+ }
152
}
153
154
should_complete = s->should_complete ||
155
@@ -XXX,XX +XXX,XX @@ static const BlockJobDriver commit_active_job_driver = {
156
.drain = mirror_drain,
157
};
158
159
+static void do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
160
+ uint64_t offset, uint64_t bytes,
161
+ QEMUIOVector *qiov, int flags)
162
+{
163
+ BdrvDirtyBitmapIter *iter;
164
+ QEMUIOVector target_qiov;
165
+ uint64_t dirty_offset;
166
+ int dirty_bytes;
167
+
168
+ if (qiov) {
169
+ qemu_iovec_init(&target_qiov, qiov->niov);
170
+ }
171
+
172
+ iter = bdrv_dirty_iter_new(job->dirty_bitmap);
173
+ bdrv_set_dirty_iter(iter, offset);
174
+
175
+ while (true) {
176
+ bool valid_area;
177
+ int ret;
178
+
179
+ bdrv_dirty_bitmap_lock(job->dirty_bitmap);
180
+ valid_area = bdrv_dirty_iter_next_area(iter, offset + bytes,
181
+ &dirty_offset, &dirty_bytes);
182
+ if (!valid_area) {
183
+ bdrv_dirty_bitmap_unlock(job->dirty_bitmap);
184
+ break;
185
+ }
186
+
187
+ bdrv_reset_dirty_bitmap_locked(job->dirty_bitmap,
188
+ dirty_offset, dirty_bytes);
189
+ bdrv_dirty_bitmap_unlock(job->dirty_bitmap);
190
+
191
+ job_progress_increase_remaining(&job->common.job, dirty_bytes);
192
+
193
+ assert(dirty_offset - offset <= SIZE_MAX);
194
+ if (qiov) {
195
+ qemu_iovec_reset(&target_qiov);
196
+ qemu_iovec_concat(&target_qiov, qiov,
197
+ dirty_offset - offset, dirty_bytes);
198
+ }
199
+
200
+ switch (method) {
201
+ case MIRROR_METHOD_COPY:
202
+ ret = blk_co_pwritev(job->target, dirty_offset, dirty_bytes,
203
+ qiov ? &target_qiov : NULL, flags);
204
+ break;
205
+
206
+ case MIRROR_METHOD_ZERO:
207
+ assert(!qiov);
208
+ ret = blk_co_pwrite_zeroes(job->target, dirty_offset, dirty_bytes,
209
+ flags);
210
+ break;
211
+
212
+ case MIRROR_METHOD_DISCARD:
213
+ assert(!qiov);
214
+ ret = blk_co_pdiscard(job->target, dirty_offset, dirty_bytes);
215
+ break;
216
+
217
+ default:
218
+ abort();
219
+ }
220
+
221
+ if (ret >= 0) {
222
+ job_progress_update(&job->common.job, dirty_bytes);
223
+ } else {
224
+ BlockErrorAction action;
225
+
226
+ bdrv_set_dirty_bitmap(job->dirty_bitmap, dirty_offset, dirty_bytes);
227
+ job->actively_synced = false;
228
+
229
+ action = mirror_error_action(job, false, -ret);
230
+ if (action == BLOCK_ERROR_ACTION_REPORT) {
231
+ if (!job->ret) {
232
+ job->ret = ret;
233
+ }
234
+ break;
235
+ }
236
+ }
237
+ }
238
+
239
+ bdrv_dirty_iter_free(iter);
240
+ if (qiov) {
241
+ qemu_iovec_destroy(&target_qiov);
242
+ }
243
+}
244
+
245
+static MirrorOp *coroutine_fn active_write_prepare(MirrorBlockJob *s,
246
+ uint64_t offset,
247
+ uint64_t bytes)
248
+{
249
+ MirrorOp *op;
250
+ uint64_t start_chunk = offset / s->granularity;
251
+ uint64_t end_chunk = DIV_ROUND_UP(offset + bytes, s->granularity);
252
+
253
+ op = g_new(MirrorOp, 1);
254
+ *op = (MirrorOp){
255
+ .s = s,
256
+ .offset = offset,
257
+ .bytes = bytes,
258
+ .is_active_write = true,
259
+ };
260
+ qemu_co_queue_init(&op->waiting_requests);
261
+ QTAILQ_INSERT_TAIL(&s->ops_in_flight, op, next);
262
+
263
+ s->in_active_write_counter++;
264
+
265
+ mirror_wait_on_conflicts(op, s, offset, bytes);
266
+
267
+ bitmap_set(s->in_flight_bitmap, start_chunk, end_chunk - start_chunk);
268
+
269
+ return op;
270
+}
271
+
272
+static void coroutine_fn active_write_settle(MirrorOp *op)
273
+{
274
+ uint64_t start_chunk = op->offset / op->s->granularity;
275
+ uint64_t end_chunk = DIV_ROUND_UP(op->offset + op->bytes,
276
+ op->s->granularity);
277
+
278
+ if (!--op->s->in_active_write_counter && op->s->actively_synced) {
279
+ BdrvChild *source = op->s->mirror_top_bs->backing;
280
+
281
+ if (QLIST_FIRST(&source->bs->parents) == source &&
282
+ QLIST_NEXT(source, next_parent) == NULL)
283
+ {
284
+ /* Assert that we are back in sync once all active write
285
+ * operations are settled.
286
+ * Note that we can only assert this if the mirror node
287
+ * is the source node's only parent. */
288
+ assert(!bdrv_get_dirty_count(op->s->dirty_bitmap));
289
+ }
290
+ }
291
+ bitmap_clear(op->s->in_flight_bitmap, start_chunk, end_chunk - start_chunk);
292
+ QTAILQ_REMOVE(&op->s->ops_in_flight, op, next);
293
+ qemu_co_queue_restart_all(&op->waiting_requests);
294
+ g_free(op);
295
+}
296
+
297
static int coroutine_fn bdrv_mirror_top_preadv(BlockDriverState *bs,
298
uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
299
{
300
return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
301
}
302
303
+static int coroutine_fn bdrv_mirror_top_do_write(BlockDriverState *bs,
304
+ MirrorMethod method, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov,
305
+ int flags)
306
+{
307
+ MirrorOp *op = NULL;
308
+ MirrorBDSOpaque *s = bs->opaque;
309
+ int ret = 0;
310
+ bool copy_to_target;
311
+
312
+ copy_to_target = s->job->ret >= 0 &&
313
+ s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING;
314
+
315
+ if (copy_to_target) {
316
+ op = active_write_prepare(s->job, offset, bytes);
317
+ }
318
+
319
+ switch (method) {
320
+ case MIRROR_METHOD_COPY:
321
+ ret = bdrv_co_pwritev(bs->backing, offset, bytes, qiov, flags);
322
+ break;
323
+
324
+ case MIRROR_METHOD_ZERO:
325
+ ret = bdrv_co_pwrite_zeroes(bs->backing, offset, bytes, flags);
326
+ break;
327
+
328
+ case MIRROR_METHOD_DISCARD:
329
+ ret = bdrv_co_pdiscard(bs->backing->bs, offset, bytes);
330
+ break;
331
+
332
+ default:
333
+ abort();
334
+ }
335
+
336
+ if (ret < 0) {
337
+ goto out;
338
+ }
339
+
340
+ if (copy_to_target) {
341
+ do_sync_target_write(s->job, method, offset, bytes, qiov, flags);
342
+ }
343
+
344
+out:
345
+ if (copy_to_target) {
346
+ active_write_settle(op);
347
+ }
348
+ return ret;
349
+}
350
+
351
static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs,
352
uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
353
{
354
- return bdrv_co_pwritev(bs->backing, offset, bytes, qiov, flags);
355
+ MirrorBDSOpaque *s = bs->opaque;
356
+ QEMUIOVector bounce_qiov;
357
+ void *bounce_buf;
358
+ int ret = 0;
359
+ bool copy_to_target;
360
+
361
+ copy_to_target = s->job->ret >= 0 &&
362
+ s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING;
363
+
364
+ if (copy_to_target) {
365
+ /* The guest might concurrently modify the data to write; but
366
+ * the data on source and destination must match, so we have
367
+ * to use a bounce buffer if we are going to write to the
368
+ * target now. */
369
+ bounce_buf = qemu_blockalign(bs, bytes);
370
+ iov_to_buf_full(qiov->iov, qiov->niov, 0, bounce_buf, bytes);
371
+
372
+ qemu_iovec_init(&bounce_qiov, 1);
373
+ qemu_iovec_add(&bounce_qiov, bounce_buf, bytes);
374
+ qiov = &bounce_qiov;
375
+ }
376
+
377
+ ret = bdrv_mirror_top_do_write(bs, MIRROR_METHOD_COPY, offset, bytes, qiov,
378
+ flags);
379
+
380
+ if (copy_to_target) {
381
+ qemu_iovec_destroy(&bounce_qiov);
382
+ qemu_vfree(bounce_buf);
383
+ }
384
+
385
+ return ret;
386
}
387
388
static int coroutine_fn bdrv_mirror_top_flush(BlockDriverState *bs)
389
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_mirror_top_flush(BlockDriverState *bs)
390
static int coroutine_fn bdrv_mirror_top_pwrite_zeroes(BlockDriverState *bs,
391
int64_t offset, int bytes, BdrvRequestFlags flags)
392
{
393
- return bdrv_co_pwrite_zeroes(bs->backing, offset, bytes, flags);
394
+ return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_ZERO, offset, bytes, NULL,
395
+ flags);
396
}
397
398
static int coroutine_fn bdrv_mirror_top_pdiscard(BlockDriverState *bs,
399
int64_t offset, int bytes)
400
{
401
- return bdrv_co_pdiscard(bs->backing->bs, offset, bytes);
402
+ return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_DISCARD, offset, bytes,
403
+ NULL, 0);
404
}
405
406
static void bdrv_mirror_top_refresh_filename(BlockDriverState *bs, QDict *opts)
407
@@ -XXX,XX +XXX,XX @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
408
s->on_target_error = on_target_error;
409
s->is_none_mode = is_none_mode;
410
s->backing_mode = backing_mode;
411
+ s->copy_mode = MIRROR_COPY_MODE_BACKGROUND;
412
s->base = base;
413
s->granularity = granularity;
414
s->buf_size = ROUND_UP(buf_size, granularity);
415
--
2.13.6
1
From: Max Reitz <mreitz@redhat.com>
1
From: Doug Gale <doug16k@gmail.com>
2
2
3
Signed-off-by: Max Reitz <mreitz@redhat.com>
3
Add trace output for commands, errors, and undefined behavior.
4
Reviewed-by: Fam Zheng <famz@redhat.com>
4
Add guest error log output for undefined behavior.
5
Reviewed-by: Alberto Garcia <berto@igalia.com>
5
Report invalid undefined accesses to MMIO.
6
Message-id: 20180613181823.13618-15-mreitz@redhat.com
6
Annotate unlikely error checks with unlikely.
7
Signed-off-by: Max Reitz <mreitz@redhat.com>
7
8
Signed-off-by: Doug Gale <doug16k@gmail.com>
9
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
10
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
11
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
8
---
12
---
9
tests/qemu-iotests/151 | 120 +++++++++++++++++++++++++++++++++++++++++++++
13
hw/block/nvme.c | 349 ++++++++++++++++++++++++++++++++++++++++++--------
10
tests/qemu-iotests/151.out | 5 ++
14
hw/block/trace-events | 93 ++++++++++++++
11
tests/qemu-iotests/group | 1 +
15
2 files changed, 390 insertions(+), 52 deletions(-)
12
3 files changed, 126 insertions(+)
13
create mode 100755 tests/qemu-iotests/151
14
create mode 100644 tests/qemu-iotests/151.out
15
16
16
diff --git a/tests/qemu-iotests/151 b/tests/qemu-iotests/151
17
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
17
new file mode 100755
18
index XXXXXXX..XXXXXXX 100644
18
index XXXXXXX..XXXXXXX
19
--- a/hw/block/nvme.c
19
--- /dev/null
20
+++ b/hw/block/nvme.c
20
+++ b/tests/qemu-iotests/151
21
@@ -XXX,XX +XXX,XX @@
21
@@ -XXX,XX +XXX,XX @@
22
+#!/usr/bin/env python
22
#include "qapi/visitor.h"
23
+#
23
#include "sysemu/block-backend.h"
24
+# Tests for active mirroring
24
25
+#
25
+#include "qemu/log.h"
26
+# Copyright (C) 2018 Red Hat, Inc.
26
+#include "trace.h"
27
+#
27
#include "nvme.h"
28
+# This program is free software; you can redistribute it and/or modify
28
29
+# it under the terms of the GNU General Public License as published by
29
+#define NVME_GUEST_ERR(trace, fmt, ...) \
30
+# the Free Software Foundation; either version 2 of the License, or
30
+ do { \
31
+# (at your option) any later version.
31
+ (trace_##trace)(__VA_ARGS__); \
32
+#
32
+ qemu_log_mask(LOG_GUEST_ERROR, #trace \
33
+# This program is distributed in the hope that it will be useful,
33
+ " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
34
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
34
+ } while (0)
35
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
35
+
36
+# GNU General Public License for more details.
36
static void nvme_process_sq(void *opaque);
37
+#
37
38
+# You should have received a copy of the GNU General Public License
38
static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
39
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
39
@@ -XXX,XX +XXX,XX @@ static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq)
40
+#
40
{
41
+
41
if (cq->irq_enabled) {
42
+import os
42
if (msix_enabled(&(n->parent_obj))) {
43
+import iotests
43
+ trace_nvme_irq_msix(cq->vector);
44
+from iotests import qemu_img
44
msix_notify(&(n->parent_obj), cq->vector);
45
+
45
} else {
46
+source_img = os.path.join(iotests.test_dir, 'source.' + iotests.imgfmt)
46
+ trace_nvme_irq_pin();
47
+target_img = os.path.join(iotests.test_dir, 'target.' + iotests.imgfmt)
47
pci_irq_pulse(&n->parent_obj);
48
+
48
}
49
+class TestActiveMirror(iotests.QMPTestCase):
49
+ } else {
50
+ image_len = 128 * 1024 * 1024 # MB
50
+ trace_nvme_irq_masked();
51
+ potential_writes_in_flight = True
51
}
52
+
52
}
53
+ def setUp(self):
53
54
+ qemu_img('create', '-f', iotests.imgfmt, source_img, '128M')
54
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
55
+ qemu_img('create', '-f', iotests.imgfmt, target_img, '128M')
55
trans_len = MIN(len, trans_len);
56
+
56
int num_prps = (len >> n->page_bits) + 1;
57
+ blk_source = {'id': 'source',
57
58
+ 'if': 'none',
58
- if (!prp1) {
59
+ 'node-name': 'source-node',
59
+ if (unlikely(!prp1)) {
60
+ 'driver': iotests.imgfmt,
60
+ trace_nvme_err_invalid_prp();
61
+ 'file': {'driver': 'file',
61
return NVME_INVALID_FIELD | NVME_DNR;
62
+ 'filename': source_img}}
62
} else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
63
+
63
prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
64
+ blk_target = {'node-name': 'target-node',
64
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
65
+ 'driver': iotests.imgfmt,
65
}
66
+ 'file': {'driver': 'file',
66
len -= trans_len;
67
+ 'filename': target_img}}
67
if (len) {
68
+
68
- if (!prp2) {
69
+ self.vm = iotests.VM()
69
+ if (unlikely(!prp2)) {
70
+ self.vm.add_drive_raw(self.vm.qmp_to_opts(blk_source))
70
+ trace_nvme_err_invalid_prp2_missing();
71
+ self.vm.add_blockdev(self.vm.qmp_to_opts(blk_target))
71
goto unmap;
72
+ self.vm.add_device('virtio-blk,drive=source')
72
}
73
+ self.vm.launch()
73
if (len > n->page_size) {
74
+
74
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
75
+ def tearDown(self):
75
uint64_t prp_ent = le64_to_cpu(prp_list[i]);
76
+ self.vm.shutdown()
76
77
+
77
if (i == n->max_prp_ents - 1 && len > n->page_size) {
78
+ if not self.potential_writes_in_flight:
78
- if (!prp_ent || prp_ent & (n->page_size - 1)) {
79
+ self.assertTrue(iotests.compare_images(source_img, target_img),
79
+ if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
80
+ 'mirror target does not match source')
80
+ trace_nvme_err_invalid_prplist_ent(prp_ent);
81
+
81
goto unmap;
82
+ os.remove(source_img)
82
}
83
+ os.remove(target_img)
83
84
+
84
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
85
+ def doActiveIO(self, sync_source_and_target):
85
prp_ent = le64_to_cpu(prp_list[i]);
86
+ # Fill the source image
86
}
87
+ self.vm.hmp_qemu_io('source',
87
88
+ 'write -P 1 0 %i' % self.image_len);
88
- if (!prp_ent || prp_ent & (n->page_size - 1)) {
89
+
89
+ if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
90
+ # Start some background requests
90
+ trace_nvme_err_invalid_prplist_ent(prp_ent);
91
+ for offset in range(1 * self.image_len / 8, 3 * self.image_len / 8, 1024 * 1024):
91
goto unmap;
92
+ self.vm.hmp_qemu_io('source', 'aio_write -P 2 %i 1M' % offset)
92
}
93
+ for offset in range(2 * self.image_len / 8, 3 * self.image_len / 8, 1024 * 1024):
93
94
+ self.vm.hmp_qemu_io('source', 'aio_write -z %i 1M' % offset)
94
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
95
+
95
i++;
96
+ # Start the block job
96
}
97
+ result = self.vm.qmp('blockdev-mirror',
97
} else {
98
+ job_id='mirror',
98
- if (prp2 & (n->page_size - 1)) {
99
+ filter_node_name='mirror-node',
99
+ if (unlikely(prp2 & (n->page_size - 1))) {
100
+ device='source-node',
100
+ trace_nvme_err_invalid_prp2_align(prp2);
101
+ target='target-node',
101
goto unmap;
102
+ sync='full',
102
}
103
+ copy_mode='write-blocking')
103
if (qsg->nsg) {
104
+ self.assert_qmp(result, 'return', {})
104
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
105
+
105
QEMUIOVector iov;
106
+ # Start some more requests
106
uint16_t status = NVME_SUCCESS;
107
+ for offset in range(3 * self.image_len / 8, 5 * self.image_len / 8, 1024 * 1024):
107
108
+ self.vm.hmp_qemu_io('source', 'aio_write -P 3 %i 1M' % offset)
108
+ trace_nvme_dma_read(prp1, prp2);
109
+ for offset in range(4 * self.image_len / 8, 5 * self.image_len / 8, 1024 * 1024):
109
+
110
+ self.vm.hmp_qemu_io('source', 'aio_write -z %i 1M' % offset)
110
if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
111
+
111
return NVME_INVALID_FIELD | NVME_DNR;
112
+ # Wait for the READY event
112
}
113
+ self.wait_ready(drive='mirror')
113
if (qsg.nsg > 0) {
114
+
114
- if (dma_buf_read(ptr, len, &qsg)) {
115
+ # Now start some final requests; all of these (which land on
115
+ if (unlikely(dma_buf_read(ptr, len, &qsg))) {
116
+ # the source) should be settled using the active mechanism.
116
+ trace_nvme_err_invalid_dma();
117
+ # The mirror code itself asserts that the source BDS's dirty
117
status = NVME_INVALID_FIELD | NVME_DNR;
118
+ # bitmap will stay clean between READY and COMPLETED.
118
}
119
+ for offset in range(5 * self.image_len / 8, 7 * self.image_len / 8, 1024 * 1024):
119
qemu_sglist_destroy(&qsg);
120
+ self.vm.hmp_qemu_io('source', 'aio_write -P 3 %i 1M' % offset)
120
} else {
121
+ for offset in range(6 * self.image_len / 8, 7 * self.image_len / 8, 1024 * 1024):
121
- if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) {
122
+ self.vm.hmp_qemu_io('source', 'aio_write -z %i 1M' % offset)
122
+ if (unlikely(qemu_iovec_to_buf(&iov, 0, ptr, len) != len)) {
123
+
123
+ trace_nvme_err_invalid_dma();
124
+ if sync_source_and_target:
124
status = NVME_INVALID_FIELD | NVME_DNR;
125
+ # If source and target should be in sync after the mirror,
125
}
126
+ # we have to flush before completion
126
qemu_iovec_destroy(&iov);
127
+ self.vm.hmp_qemu_io('source', 'aio_flush')
127
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
128
+ self.potential_writes_in_flight = False
128
uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS);
129
+
129
uint32_t aio_nlb = nlb << (data_shift - BDRV_SECTOR_BITS);
130
+ self.complete_and_wait(drive='mirror', wait_ready=False)
130
131
+
131
- if (slba + nlb > ns->id_ns.nsze) {
132
+ def testActiveIO(self):
132
+ if (unlikely(slba + nlb > ns->id_ns.nsze)) {
133
+ self.doActiveIO(False)
133
+ trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
134
+
134
return NVME_LBA_RANGE | NVME_DNR;
135
+ def testActiveIOFlushed(self):
135
}
136
+ self.doActiveIO(True)
136
137
+
137
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
138
+
138
int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
139
+
139
enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
140
+if __name__ == '__main__':
140
141
+ iotests.main(supported_fmts=['qcow2', 'raw'])
141
- if ((slba + nlb) > ns->id_ns.nsze) {
142
diff --git a/tests/qemu-iotests/151.out b/tests/qemu-iotests/151.out
142
+ trace_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba);
143
new file mode 100644
143
+
144
index XXXXXXX..XXXXXXX
144
+ if (unlikely((slba + nlb) > ns->id_ns.nsze)) {
145
--- /dev/null
145
block_acct_invalid(blk_get_stats(n->conf.blk), acct);
146
+++ b/tests/qemu-iotests/151.out
146
+ trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
147
@@ -XXX,XX +XXX,XX @@
147
return NVME_LBA_RANGE | NVME_DNR;
148
+..
148
}
149
+----------------------------------------------------------------------
149
150
+Ran 2 tests
150
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
151
+
151
NvmeNamespace *ns;
152
+OK
152
uint32_t nsid = le32_to_cpu(cmd->nsid);
153
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
153
154
- if (nsid == 0 || nsid > n->num_namespaces) {
155
+ if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
156
+ trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
157
return NVME_INVALID_NSID | NVME_DNR;
158
}
159
160
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
161
case NVME_CMD_READ:
162
return nvme_rw(n, ns, cmd, req);
163
default:
164
+ trace_nvme_err_invalid_opc(cmd->opcode);
165
return NVME_INVALID_OPCODE | NVME_DNR;
166
}
167
}
168
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
169
NvmeCQueue *cq;
170
uint16_t qid = le16_to_cpu(c->qid);
171
172
- if (!qid || nvme_check_sqid(n, qid)) {
173
+ if (unlikely(!qid || nvme_check_sqid(n, qid))) {
174
+ trace_nvme_err_invalid_del_sq(qid);
175
return NVME_INVALID_QID | NVME_DNR;
176
}
177
178
+ trace_nvme_del_sq(qid);
179
+
180
sq = n->sq[qid];
181
while (!QTAILQ_EMPTY(&sq->out_req_list)) {
182
req = QTAILQ_FIRST(&sq->out_req_list);
183
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
184
uint16_t qflags = le16_to_cpu(c->sq_flags);
185
uint64_t prp1 = le64_to_cpu(c->prp1);
186
187
- if (!cqid || nvme_check_cqid(n, cqid)) {
188
+ trace_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
189
+
190
+ if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
191
+ trace_nvme_err_invalid_create_sq_cqid(cqid);
192
return NVME_INVALID_CQID | NVME_DNR;
193
}
194
- if (!sqid || !nvme_check_sqid(n, sqid)) {
195
+ if (unlikely(!sqid || !nvme_check_sqid(n, sqid))) {
196
+ trace_nvme_err_invalid_create_sq_sqid(sqid);
197
return NVME_INVALID_QID | NVME_DNR;
198
}
199
- if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
200
+ if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
201
+ trace_nvme_err_invalid_create_sq_size(qsize);
202
return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
203
}
204
- if (!prp1 || prp1 & (n->page_size - 1)) {
205
+ if (unlikely(!prp1 || prp1 & (n->page_size - 1))) {
206
+ trace_nvme_err_invalid_create_sq_addr(prp1);
207
return NVME_INVALID_FIELD | NVME_DNR;
208
}
209
- if (!(NVME_SQ_FLAGS_PC(qflags))) {
210
+ if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
211
+ trace_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
212
return NVME_INVALID_FIELD | NVME_DNR;
213
}
214
sq = g_malloc0(sizeof(*sq));
215
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
216
NvmeCQueue *cq;
217
uint16_t qid = le16_to_cpu(c->qid);
218
219
- if (!qid || nvme_check_cqid(n, qid)) {
220
+ if (unlikely(!qid || nvme_check_cqid(n, qid))) {
221
+ trace_nvme_err_invalid_del_cq_cqid(qid);
222
return NVME_INVALID_CQID | NVME_DNR;
223
}
224
225
cq = n->cq[qid];
226
- if (!QTAILQ_EMPTY(&cq->sq_list)) {
227
+ if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
228
+ trace_nvme_err_invalid_del_cq_notempty(qid);
229
return NVME_INVALID_QUEUE_DEL;
230
}
231
+ trace_nvme_del_cq(qid);
232
nvme_free_cq(cq, n);
233
return NVME_SUCCESS;
234
}
235
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
236
uint16_t qflags = le16_to_cpu(c->cq_flags);
237
uint64_t prp1 = le64_to_cpu(c->prp1);
238
239
- if (!cqid || !nvme_check_cqid(n, cqid)) {
240
+ trace_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
241
+ NVME_CQ_FLAGS_IEN(qflags) != 0);
242
+
243
+ if (unlikely(!cqid || !nvme_check_cqid(n, cqid))) {
244
+ trace_nvme_err_invalid_create_cq_cqid(cqid);
245
return NVME_INVALID_CQID | NVME_DNR;
246
}
247
- if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
248
+ if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
249
+ trace_nvme_err_invalid_create_cq_size(qsize);
250
return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
251
}
252
- if (!prp1) {
253
+ if (unlikely(!prp1)) {
254
+ trace_nvme_err_invalid_create_cq_addr(prp1);
255
return NVME_INVALID_FIELD | NVME_DNR;
256
}
257
- if (vector > n->num_queues) {
258
+ if (unlikely(vector > n->num_queues)) {
259
+ trace_nvme_err_invalid_create_cq_vector(vector);
260
return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
261
}
262
- if (!(NVME_CQ_FLAGS_PC(qflags))) {
263
+ if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
264
+ trace_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
265
return NVME_INVALID_FIELD | NVME_DNR;
266
}
267
268
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c)
     uint64_t prp1 = le64_to_cpu(c->prp1);
     uint64_t prp2 = le64_to_cpu(c->prp2);
 
+    trace_nvme_identify_ctrl();
+
     return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
         prp1, prp2);
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c)
     uint64_t prp1 = le64_to_cpu(c->prp1);
     uint64_t prp2 = le64_to_cpu(c->prp2);
 
-    if (nsid == 0 || nsid > n->num_namespaces) {
+    trace_nvme_identify_ns(nsid);
+
+    if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
+        trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
         return NVME_INVALID_NSID | NVME_DNR;
     }
 
     ns = &n->namespaces[nsid - 1];
+
     return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
         prp1, prp2);
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
     uint16_t ret;
     int i, j = 0;
 
+    trace_nvme_identify_nslist(min_nsid);
+
     list = g_malloc0(data_len);
     for (i = 0; i < n->num_namespaces; i++) {
         if (i < min_nsid) {
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
     case 0x02:
         return nvme_identify_nslist(n, c);
     default:
+        trace_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     switch (dw10) {
     case NVME_VOLATILE_WRITE_CACHE:
         result = blk_enable_write_cache(n->conf.blk);
+        trace_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
         break;
     case NVME_NUMBER_OF_QUEUES:
         result = cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
+        trace_nvme_getfeat_numq(result);
         break;
     default:
+        trace_nvme_err_invalid_getfeat(dw10);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
         blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
         break;
     case NVME_NUMBER_OF_QUEUES:
+        trace_nvme_setfeat_numq((dw11 & 0xFFFF) + 1,
+                                ((dw11 >> 16) & 0xFFFF) + 1,
+                                n->num_queues - 1, n->num_queues - 1);
         req->cqe.result =
             cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
         break;
     default:
+        trace_nvme_err_invalid_setfeat(dw10);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
     return NVME_SUCCESS;
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     case NVME_ADM_CMD_GET_FEATURES:
         return nvme_get_feature(n, cmd, req);
     default:
+        trace_nvme_err_invalid_admin_opc(cmd->opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
     }
 }
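[Editor's note: the Number of Queues feature above packs two zero-based counts into one dword, which is why the new trace call prints (dw11 & 0xFFFF) + 1. A small standalone sketch of that packing arithmetic; encode_numq() and the queue count are illustrative, not QEMU code.]

    #include <assert.h>
    #include <stdint.h>

    /* Low/high 16 bits hold the SQ/CQ counts minus one (0's based);
     * num_queues - 2 leaves room for the admin queue pair. */
    static uint32_t encode_numq(uint16_t num_queues)
    {
        uint16_t zeroes_based = num_queues - 2;
        return (uint32_t)zeroes_based | ((uint32_t)zeroes_based << 16);
    }

    int main(void)
    {
        uint32_t dw11 = encode_numq(64);
        assert((dw11 & 0xFFFF) + 1 == 63);          /* requested SQs */
        assert(((dw11 >> 16) & 0xFFFF) + 1 == 63);  /* requested CQs */
        return 0;
    }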
@@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n)
     uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
     uint32_t page_size = 1 << page_bits;
 
-    if (n->cq[0] || n->sq[0] || !n->bar.asq || !n->bar.acq ||
-        n->bar.asq & (page_size - 1) || n->bar.acq & (page_size - 1) ||
-        NVME_CC_MPS(n->bar.cc) < NVME_CAP_MPSMIN(n->bar.cap) ||
-        NVME_CC_MPS(n->bar.cc) > NVME_CAP_MPSMAX(n->bar.cap) ||
-        NVME_CC_IOCQES(n->bar.cc) < NVME_CTRL_CQES_MIN(n->id_ctrl.cqes) ||
-        NVME_CC_IOCQES(n->bar.cc) > NVME_CTRL_CQES_MAX(n->id_ctrl.cqes) ||
-        NVME_CC_IOSQES(n->bar.cc) < NVME_CTRL_SQES_MIN(n->id_ctrl.sqes) ||
-        NVME_CC_IOSQES(n->bar.cc) > NVME_CTRL_SQES_MAX(n->id_ctrl.sqes) ||
-        !NVME_AQA_ASQS(n->bar.aqa) || !NVME_AQA_ACQS(n->bar.aqa)) {
+    if (unlikely(n->cq[0])) {
+        trace_nvme_err_startfail_cq();
+        return -1;
+    }
+    if (unlikely(n->sq[0])) {
+        trace_nvme_err_startfail_sq();
+        return -1;
+    }
+    if (unlikely(!n->bar.asq)) {
+        trace_nvme_err_startfail_nbarasq();
+        return -1;
+    }
+    if (unlikely(!n->bar.acq)) {
+        trace_nvme_err_startfail_nbaracq();
+        return -1;
+    }
+    if (unlikely(n->bar.asq & (page_size - 1))) {
+        trace_nvme_err_startfail_asq_misaligned(n->bar.asq);
+        return -1;
+    }
+    if (unlikely(n->bar.acq & (page_size - 1))) {
+        trace_nvme_err_startfail_acq_misaligned(n->bar.acq);
+        return -1;
+    }
+    if (unlikely(NVME_CC_MPS(n->bar.cc) <
+                 NVME_CAP_MPSMIN(n->bar.cap))) {
+        trace_nvme_err_startfail_page_too_small(
+                    NVME_CC_MPS(n->bar.cc),
+                    NVME_CAP_MPSMIN(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_MPS(n->bar.cc) >
+                 NVME_CAP_MPSMAX(n->bar.cap))) {
+        trace_nvme_err_startfail_page_too_large(
+                    NVME_CC_MPS(n->bar.cc),
+                    NVME_CAP_MPSMAX(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOCQES(n->bar.cc) <
+                 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
+        trace_nvme_err_startfail_cqent_too_small(
+                    NVME_CC_IOCQES(n->bar.cc),
+                    NVME_CTRL_CQES_MIN(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOCQES(n->bar.cc) >
+                 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
+        trace_nvme_err_startfail_cqent_too_large(
+                    NVME_CC_IOCQES(n->bar.cc),
+                    NVME_CTRL_CQES_MAX(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOSQES(n->bar.cc) <
+                 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
+        trace_nvme_err_startfail_sqent_too_small(
+                    NVME_CC_IOSQES(n->bar.cc),
+                    NVME_CTRL_SQES_MIN(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOSQES(n->bar.cc) >
+                 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
+        trace_nvme_err_startfail_sqent_too_large(
+                    NVME_CC_IOSQES(n->bar.cc),
+                    NVME_CTRL_SQES_MAX(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) {
+        trace_nvme_err_startfail_asqent_sz_zero();
+        return -1;
+    }
+    if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) {
+        trace_nvme_err_startfail_acqent_sz_zero();
         return -1;
     }
 
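[Editor's note: the start-up checks above validate CC.MPS against the CAP.MPSMIN/MPSMAX window; all three fields are log2 encodings of the page size minus 12. A standalone sketch of that comparison, with the bit-field extraction simplified away; the CAP window values are assumptions for illustration.]

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* MPS = 0 means 4 KiB pages; each increment doubles the size. */
    static bool mps_valid(uint8_t cc_mps, uint8_t cap_mpsmin, uint8_t cap_mpsmax)
    {
        return cc_mps >= cap_mpsmin && cc_mps <= cap_mpsmax;
    }

    int main(void)
    {
        uint8_t mps = 0;                        /* 4 KiB */
        uint32_t page_size = 1u << (mps + 12);
        printf("page size: %u, valid: %d\n", page_size,
               mps_valid(mps, 0, 4));           /* window: 4 KiB .. 64 KiB */
        return 0;
    }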
@@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n)
 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
                            unsigned size)
 {
+    if (unlikely(offset & (sizeof(uint32_t) - 1))) {
+        NVME_GUEST_ERR(nvme_ub_mmiowr_misaligned32,
+                       "MMIO write not 32-bit aligned,"
+                       " offset=0x%"PRIx64"", offset);
+        /* should be ignored, fall through for now */
+    }
+
+    if (unlikely(size < sizeof(uint32_t))) {
+        NVME_GUEST_ERR(nvme_ub_mmiowr_toosmall,
+                       "MMIO write smaller than 32-bits,"
+                       " offset=0x%"PRIx64", size=%u",
+                       offset, size);
+        /* should be ignored, fall through for now */
+    }
+
     switch (offset) {
-    case 0xc:
+    case 0xc:   /* INTMS */
+        if (unlikely(msix_enabled(&(n->parent_obj)))) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
+                           "undefined access to interrupt mask set"
+                           " when MSI-X is enabled");
+            /* should be ignored, fall through for now */
+        }
         n->bar.intms |= data & 0xffffffff;
         n->bar.intmc = n->bar.intms;
+        trace_nvme_mmio_intm_set(data & 0xffffffff,
+                                 n->bar.intmc);
         break;
-    case 0x10:
+    case 0x10:  /* INTMC */
+        if (unlikely(msix_enabled(&(n->parent_obj)))) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
+                           "undefined access to interrupt mask clr"
+                           " when MSI-X is enabled");
+            /* should be ignored, fall through for now */
+        }
         n->bar.intms &= ~(data & 0xffffffff);
         n->bar.intmc = n->bar.intms;
+        trace_nvme_mmio_intm_clr(data & 0xffffffff,
+                                 n->bar.intmc);
         break;
-    case 0x14:
+    case 0x14:  /* CC */
+        trace_nvme_mmio_cfg(data & 0xffffffff);
         /* Windows first sends data, then sends enable bit */
         if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
             !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
@@ -XXX,XX +XXX,XX @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
 
         if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
             n->bar.cc = data;
-            if (nvme_start_ctrl(n)) {
+            if (unlikely(nvme_start_ctrl(n))) {
+                trace_nvme_err_startfail();
                 n->bar.csts = NVME_CSTS_FAILED;
             } else {
+                trace_nvme_mmio_start_success();
                 n->bar.csts = NVME_CSTS_READY;
             }
         } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
+            trace_nvme_mmio_stopped();
             nvme_clear_ctrl(n);
             n->bar.csts &= ~NVME_CSTS_READY;
         }
         if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
-            nvme_clear_ctrl(n);
-            n->bar.cc = data;
-            n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
+            trace_nvme_mmio_shutdown_set();
+            nvme_clear_ctrl(n);
+            n->bar.cc = data;
+            n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
         } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
-            n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
-            n->bar.cc = data;
+            trace_nvme_mmio_shutdown_cleared();
+            n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
+            n->bar.cc = data;
+        }
+        break;
+    case 0x1C:  /* CSTS */
+        if (data & (1 << 4)) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_ssreset_w1c_unsupported,
+                           "attempted to W1C CSTS.NSSRO"
+                           " but CAP.NSSRS is zero (not supported)");
+        } else if (data != 0) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_ro_csts,
+                           "attempted to set a read only bit"
+                           " of controller status");
+        }
+        break;
+    case 0x20:  /* NSSR */
+        if (data == 0x4E564D65) {
+            trace_nvme_ub_mmiowr_ssreset_unsupported();
+        } else {
+            /* The spec says that writes of other values have no effect */
+            return;
         }
         break;
-    case 0x24:
+    case 0x24:  /* AQA */
         n->bar.aqa = data & 0xffffffff;
+        trace_nvme_mmio_aqattr(data & 0xffffffff);
         break;
-    case 0x28:
+    case 0x28:  /* ASQ */
         n->bar.asq = data;
+        trace_nvme_mmio_asqaddr(data);
         break;
-    case 0x2c:
+    case 0x2c:  /* ASQ hi */
         n->bar.asq |= data << 32;
+        trace_nvme_mmio_asqaddr_hi(data, n->bar.asq);
         break;
-    case 0x30:
+    case 0x30:  /* ACQ */
+        trace_nvme_mmio_acqaddr(data);
         n->bar.acq = data;
         break;
-    case 0x34:
+    case 0x34:  /* ACQ hi */
         n->bar.acq |= data << 32;
+        trace_nvme_mmio_acqaddr_hi(data, n->bar.acq);
         break;
+    case 0x38:  /* CMBLOC */
+        NVME_GUEST_ERR(nvme_ub_mmiowr_cmbloc_reserved,
+                       "invalid write to reserved CMBLOC"
+                       " when CMBSZ is zero, ignored");
+        return;
+    case 0x3C:  /* CMBSZ */
+        NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
+                       "invalid write to read only CMBSZ, ignored");
+        return;
     default:
+        NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
+                       "invalid MMIO write,"
+                       " offset=0x%"PRIx64", data=%"PRIx64"",
+                       offset, data);
         break;
     }
 }
@@ -XXX,XX +XXX,XX @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
     uint8_t *ptr = (uint8_t *)&n->bar;
     uint64_t val = 0;
 
+    if (unlikely(addr & (sizeof(uint32_t) - 1))) {
+        NVME_GUEST_ERR(nvme_ub_mmiord_misaligned32,
+                       "MMIO read not 32-bit aligned,"
+                       " offset=0x%"PRIx64"", addr);
+        /* should RAZ, fall through for now */
+    } else if (unlikely(size < sizeof(uint32_t))) {
+        NVME_GUEST_ERR(nvme_ub_mmiord_toosmall,
+                       "MMIO read smaller than 32-bits,"
+                       " offset=0x%"PRIx64"", addr);
+        /* should RAZ, fall through for now */
+    }
+
     if (addr < sizeof(n->bar)) {
         memcpy(&val, ptr + addr, size);
+    } else {
+        NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
+                       "MMIO read beyond last register,"
+                       " offset=0x%"PRIx64", returning 0", addr);
     }
+
     return val;
 }
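[Editor's note: both the write and read paths above reject, or at least report, accesses that are not 32-bit sized and aligned. The core test is a power-of-two mask; here is a minimal standalone version of just that predicate, with illustrative offsets.]

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* For a power-of-two width, (offset & (width - 1)) != 0
     * iff the offset is misaligned. */
    static bool mmio_access_ok(uint64_t offset, unsigned size)
    {
        if (offset & (sizeof(uint32_t) - 1)) {
            return false;               /* not 32-bit aligned */
        }
        if (size < sizeof(uint32_t)) {
            return false;               /* narrower than one register */
        }
        return true;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               mmio_access_ok(0x14, 4),   /* aligned dword -> ok */
               mmio_access_ok(0x15, 4),   /* misaligned -> rejected */
               mmio_access_ok(0x14, 2));  /* too narrow -> rejected */
        return 0;
    }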
@@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
 {
     uint32_t qid;
 
-    if (addr & ((1 << 2) - 1)) {
+    if (unlikely(addr & ((1 << 2) - 1))) {
+        NVME_GUEST_ERR(nvme_ub_db_wr_misaligned,
+                       "doorbell write not 32-bit aligned,"
+                       " offset=0x%"PRIx64", ignoring", addr);
         return;
     }
 
     if (((addr - 0x1000) >> 2) & 1) {
+        /* Completion queue doorbell write */
+
         uint16_t new_head = val & 0xffff;
         int start_sqs;
         NvmeCQueue *cq;
 
         qid = (addr - (0x1000 + (1 << 2))) >> 3;
-        if (nvme_check_cqid(n, qid)) {
+        if (unlikely(nvme_check_cqid(n, qid))) {
+            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cq,
+                           "completion queue doorbell write"
+                           " for nonexistent queue,"
+                           " sqid=%"PRIu32", ignoring", qid);
             return;
         }
 
         cq = n->cq[qid];
-        if (new_head >= cq->size) {
+        if (unlikely(new_head >= cq->size)) {
+            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cqhead,
+                           "completion queue doorbell write value"
+                           " beyond queue size, sqid=%"PRIu32","
+                           " new_head=%"PRIu16", ignoring",
+                           qid, new_head);
             return;
         }
 
@@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
             nvme_isr_notify(n, cq);
         }
     } else {
+        /* Submission queue doorbell write */
+
         uint16_t new_tail = val & 0xffff;
         NvmeSQueue *sq;
 
         qid = (addr - 0x1000) >> 3;
-        if (nvme_check_sqid(n, qid)) {
+        if (unlikely(nvme_check_sqid(n, qid))) {
+            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sq,
+                           "submission queue doorbell write"
+                           " for nonexistent queue,"
+                           " sqid=%"PRIu32", ignoring", qid);
             return;
         }
 
         sq = n->sq[qid];
-        if (new_tail >= sq->size) {
+        if (unlikely(new_tail >= sq->size)) {
+            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sqtail,
+                           "submission queue doorbell write value"
+                           " beyond queue size, sqid=%"PRIu32","
+                           " new_tail=%"PRIu16", ignoring",
+                           qid, new_tail);
             return;
         }
 
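[Editor's note: the doorbell dispatch above relies on the register layout: doorbells start at offset 0x1000, each is 4 bytes, SQ tail and CQ head doorbells for a queue are adjacent, so bit 2 of the offset selects CQ vs SQ. A standalone sketch of that decode, assuming a doorbell stride of 4 bytes.]

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* 0x1000 + 8*qid     -> submission queue qid, tail doorbell
     * 0x1000 + 8*qid + 4 -> completion queue qid, head doorbell */
    static void decode_doorbell(uint64_t addr)
    {
        uint64_t off = addr - 0x1000;

        if (((off >> 2) & 1) == 0) {
            printf("0x%" PRIx64 ": SQ %u tail\n", addr,
                   (unsigned)(off >> 3));
        } else {
            printf("0x%" PRIx64 ": CQ %u head\n", addr,
                   (unsigned)((off - 4) >> 3));
        }
    }

    int main(void)
    {
        decode_doorbell(0x1000);    /* SQ 0 (admin) tail */
        decode_doorbell(0x1004);    /* CQ 0 (admin) head */
        decode_doorbell(0x1008);    /* SQ 1 tail */
        return 0;
    }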
diff --git a/hw/block/trace-events b/hw/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -XXX,XX +XXX,XX @@ virtio_blk_submit_multireq(void *vdev, void *mrb, int start, int num_reqs, uint6
 hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d"
 hd_geometry_guess(void *blk, uint32_t cyls, uint32_t heads, uint32_t secs, int trans) "blk %p CHS %u %u %u trans %d"
 
+# hw/block/nvme.c
+# nvme traces for successful events
+nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u"
+nvme_irq_pin(void) "pulsing IRQ pin"
+nvme_irq_masked(void) "IRQ is masked"
+nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
+nvme_rw(char const *verb, uint32_t blk_count, uint64_t byte_count, uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64""
+nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
+nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
+nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
+nvme_del_cq(uint16_t cqid) "deleted completion queue, sqid=%"PRIu16""
+nvme_identify_ctrl(void) "identify controller"
+nvme_identify_ns(uint16_t ns) "identify namespace, nsid=%"PRIu16""
+nvme_identify_nslist(uint16_t ns) "identify namespace list, nsid=%"PRIu16""
+nvme_getfeat_vwcache(char const* result) "get feature volatile write cache, result=%s"
+nvme_getfeat_numq(int result) "get feature number of queues, result=%d"
+nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d"
+nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64""
+nvme_mmio_intm_clr(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask clr, data=0x%"PRIx64", new_mask=0x%"PRIx64""
+nvme_mmio_cfg(uint64_t data) "wrote MMIO, config controller config=0x%"PRIx64""
+nvme_mmio_aqattr(uint64_t data) "wrote MMIO, admin queue attributes=0x%"PRIx64""
+nvme_mmio_asqaddr(uint64_t data) "wrote MMIO, admin submission queue address=0x%"PRIx64""
+nvme_mmio_acqaddr(uint64_t data) "wrote MMIO, admin completion queue address=0x%"PRIx64""
+nvme_mmio_asqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin submission queue high half=0x%"PRIx64", new_address=0x%"PRIx64""
+nvme_mmio_acqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin completion queue high half=0x%"PRIx64", new_address=0x%"PRIx64""
+nvme_mmio_start_success(void) "setting controller enable bit succeeded"
+nvme_mmio_stopped(void) "cleared controller enable bit"
+nvme_mmio_shutdown_set(void) "shutdown bit set"
+nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
+
+# nvme traces for error conditions
+nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
+nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
+nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""
+nvme_err_invalid_prp2_missing(void) "PRP2 is null and more data to be transferred"
+nvme_err_invalid_field(void) "invalid field"
+nvme_err_invalid_prp(void) "invalid PRP"
+nvme_err_invalid_sgl(void) "invalid SGL"
+nvme_err_invalid_ns(uint32_t ns, uint32_t limit) "invalid namespace %u not within 1-%u"
+nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8""
+nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
+nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
+nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16""
+nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16""
+nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission queue, invalid sqid=%"PRIu16""
+nvme_err_invalid_create_sq_size(uint16_t qsize) "failed creating submission queue, invalid qsize=%"PRIu16""
+nvme_err_invalid_create_sq_addr(uint64_t addr) "failed creating submission queue, addr=0x%"PRIx64""
+nvme_err_invalid_create_sq_qflags(uint16_t qflags) "failed creating submission queue, qflags=%"PRIu16""
+nvme_err_invalid_del_cq_cqid(uint16_t cqid) "failed deleting completion queue, cqid=%"PRIu16""
+nvme_err_invalid_del_cq_notempty(uint16_t cqid) "failed deleting completion queue, it is not empty, cqid=%"PRIu16""
+nvme_err_invalid_create_cq_cqid(uint16_t cqid) "failed creating completion queue, cqid=%"PRIu16""
+nvme_err_invalid_create_cq_size(uint16_t size) "failed creating completion queue, size=%"PRIu16""
+nvme_err_invalid_create_cq_addr(uint64_t addr) "failed creating completion queue, addr=0x%"PRIx64""
+nvme_err_invalid_create_cq_vector(uint16_t vector) "failed creating completion queue, vector=%"PRIu16""
+nvme_err_invalid_create_cq_qflags(uint16_t qflags) "failed creating completion queue, qflags=%"PRIu16""
+nvme_err_invalid_identify_cns(uint16_t cns) "identify, invalid cns=0x%"PRIx16""
+nvme_err_invalid_getfeat(int dw10) "invalid get features, dw10=0x%"PRIx32""
+nvme_err_invalid_setfeat(uint32_t dw10) "invalid set features, dw10=0x%"PRIx32""
+nvme_err_startfail_cq(void) "nvme_start_ctrl failed because there are non-admin completion queues"
+nvme_err_startfail_sq(void) "nvme_start_ctrl failed because there are non-admin submission queues"
+nvme_err_startfail_nbarasq(void) "nvme_start_ctrl failed because the admin submission queue address is null"
+nvme_err_startfail_nbaracq(void) "nvme_start_ctrl failed because the admin completion queue address is null"
+nvme_err_startfail_asq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin submission queue address is misaligned: 0x%"PRIx64""
+nvme_err_startfail_acq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin completion queue address is misaligned: 0x%"PRIx64""
+nvme_err_startfail_page_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the page size is too small: log2size=%u, min=%u"
+nvme_err_startfail_page_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the page size is too large: log2size=%u, max=%u"
+nvme_err_startfail_cqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too small: log2size=%u, min=%u"
+nvme_err_startfail_cqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too large: log2size=%u, max=%u"
+nvme_err_startfail_sqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too small: log2size=%u, min=%u"
+nvme_err_startfail_sqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too large: log2size=%u, max=%u"
+nvme_err_startfail_asqent_sz_zero(void) "nvme_start_ctrl failed because the admin submission queue size is zero"
+nvme_err_startfail_acqent_sz_zero(void) "nvme_start_ctrl failed because the admin completion queue size is zero"
+nvme_err_startfail(void) "setting controller enable bit failed"
+
+# Traces for undefined behavior
+nvme_ub_mmiowr_misaligned32(uint64_t offset) "MMIO write not 32-bit aligned, offset=0x%"PRIx64""
+nvme_ub_mmiowr_toosmall(uint64_t offset, unsigned size) "MMIO write smaller than 32 bits, offset=0x%"PRIx64", size=%u"
+nvme_ub_mmiowr_intmask_with_msix(void) "undefined access to interrupt mask set when MSI-X is enabled"
+nvme_ub_mmiowr_ro_csts(void) "attempted to set a read only bit of controller status"
+nvme_ub_mmiowr_ssreset_w1c_unsupported(void) "attempted to W1C CSTS.NSSRO but CAP.NSSRS is zero (not supported)"
+nvme_ub_mmiowr_ssreset_unsupported(void) "attempted NVM subsystem reset but CAP.NSSRS is zero (not supported)"
+nvme_ub_mmiowr_cmbloc_reserved(void) "invalid write to reserved CMBLOC when CMBSZ is zero, ignored"
+nvme_ub_mmiowr_cmbsz_readonly(void) "invalid write to read only CMBSZ, ignored"
+nvme_ub_mmiowr_invalid(uint64_t offset, uint64_t data) "invalid MMIO write, offset=0x%"PRIx64", data=0x%"PRIx64""
+nvme_ub_mmiord_misaligned32(uint64_t offset) "MMIO read not 32-bit aligned, offset=0x%"PRIx64""
+nvme_ub_mmiord_toosmall(uint64_t offset) "MMIO read smaller than 32-bits, offset=0x%"PRIx64""
+nvme_ub_mmiord_invalid_ofs(uint64_t offset) "MMIO read beyond last register, offset=0x%"PRIx64", returning 0"
+nvme_ub_db_wr_misaligned(uint64_t offset) "doorbell write not 32-bit aligned, offset=0x%"PRIx64", ignoring"
+nvme_ub_db_wr_invalid_cq(uint32_t qid) "completion queue doorbell write for nonexistent queue, cqid=%"PRIu32", ignoring"
+nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion queue doorbell write value beyond queue size, cqid=%"PRIu32", new_head=%"PRIu16", ignoring"
+nvme_ub_db_wr_invalid_sq(uint32_t qid) "submission queue doorbell write for nonexistent queue, sqid=%"PRIu32", ignoring"
+nvme_ub_db_wr_invalid_sqtail(uint32_t qid, uint16_t new_tail) "submission queue doorbell write value beyond queue size, sqid=%"PRIu32", new_head=%"PRIu16", ignoring"
+
 # hw/block/xen_disk.c
 xen_disk_alloc(char *name) "%s"
 xen_disk_init(char *name) "%s"
--
2.13.6

index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -XXX,XX +XXX,XX @@
 148 rw auto quick
 149 rw auto sudo
 150 rw auto quick
+151 rw auto
 152 rw auto quick
 153 rw auto quick
 154 rw auto backing quick
--
2.13.6
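[Editor's note: each line added to trace-events in the nvme patch above declares an event name, typed arguments, and a printf-style format string; the build then generates a trace_<name>() helper per event. A rough, assumption-labeled illustration of the kind of helper that results for nvme_irq_msix, as if a stderr-style backend were configured; the real generated code depends on the trace backend and also checks whether the event is enabled.]

    #include <stdint.h>
    #include <stdio.h>

    /* Loose approximation only -- not tracetool output. Declared event:
     *   nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u" */
    static inline void trace_nvme_irq_msix(uint32_t vector)
    {
        fprintf(stderr, "nvme_irq_msix raising MSI-X IRQ vector %u\n",
                vector);
    }

    int main(void)
    {
        trace_nvme_irq_msix(3);
        return 0;
    }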
From: Max Reitz <mreitz@redhat.com>

Currently, bdrv_replace_node() refuses to create loops from one BDS to
itself if the BDS to be replaced is the backing node of the BDS to
replace it: Say there is a node A and a node B. Replacing B by A means
making all references to B point to A. If B is a child of A (i.e. A has
a reference to B), that would mean we would have to make this reference
point to A itself -- so we'd create a loop.

bdrv_replace_node() (through should_update_child()) refuses to do so if
B is the backing node of A. There is no reason why we should create
loops if B is not the backing node of A, though. The BDS graph should
never contain loops, so we should always refuse to create them.

If B is a child of A and B is to be replaced by A, we should simply
leave B in place there because it is the most sensible choice.

A more specific argument would be: Putting filter drivers into the BDS
graph is basically the same as appending an overlay to a backing chain.
But the main child BDS of a filter driver is not "backing" but "file",
so restricting the no-loop rule to backing nodes would fail here.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 20180613181823.13618-7-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block.c | 44 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 10 deletions(-)

diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ static bool should_update_child(BdrvChild *c, BlockDriverState *to)
         return false;
     }
 
-    if (c->role == &child_backing) {
-        /* If @from is a backing file of @to, ignore the child to avoid
-         * creating a loop. We only want to change the pointer of other
-         * parents. */
-        QLIST_FOREACH(to_c, &to->children, next) {
-            if (to_c == c) {
-                break;
-            }
-        }
-        if (to_c) {
+    /* If the child @c belongs to the BDS @to, replacing the current
+     * c->bs by @to would mean to create a loop.
+     *
+     * Such a case occurs when appending a BDS to a backing chain.
+     * For instance, imagine the following chain:
+     *
+     *   guest device -> node A -> further backing chain...
+     *
+     * Now we create a new BDS B which we want to put on top of this
+     * chain, so we first attach A as its backing node:
+     *
+     *                   node B
+     *                     |
+     *                     v
+     *   guest device -> node A -> further backing chain...
+     *
+     * Finally we want to replace A by B. When doing that, we want to
+     * replace all pointers to A by pointers to B -- except for the
+     * pointer from B because (1) that would create a loop, and (2)
+     * that pointer should simply stay intact:
+     *
+     *   guest device -> node B
+     *                     |
+     *                     v
+     *                   node A -> further backing chain...
+     *
+     * In general, when replacing a node A (c->bs) by a node B (@to),
+     * if A is a child of B, that means we cannot replace A by B there
+     * because that would create a loop. Silently detaching A from B
+     * is also not really an option. So overall just leaving A in
+     * place there is the most sensible choice. */
+    QLIST_FOREACH(to_c, &to->children, next) {
+        if (to_c == c) {
             return false;
         }
     }
@@ -XXX,XX +XXX,XX @@ void bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
 
     /* Put all parents into @list and calculate their cumulative permissions */
     QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
+        assert(c->bs == from);
         if (!should_update_child(c, to)) {
             continue;
         }
--
2.13.6

From: Fam Zheng <famz@redhat.com>

Management tools create overlays of running guests with qemu-img:

  $ qemu-img create -b /image/in/use.qcow2 -f qcow2 /overlay/image.qcow2

but this doesn't work anymore due to image locking:

    qemu-img: /overlay/image.qcow2: Failed to get shared "write" lock
    Is another process using the image?
    Could not open backing image to determine size.

Use the force share option to allow this use case again.

Cc: qemu-stable@nongnu.org
Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ void bdrv_img_create(const char *filename, const char *fmt,
         back_flags = flags;
         back_flags &= ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
 
+        backing_options = qdict_new();
         if (backing_fmt) {
-            backing_options = qdict_new();
             qdict_put_str(backing_options, "driver", backing_fmt);
         }
+        qdict_put_bool(backing_options, BDRV_OPT_FORCE_SHARE, true);
 
         bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
                        &local_err);
--
2.13.6
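[Editor's note: the should_update_child() change above boils down to one rule: when replacing node A by node B, skip any parent edge that originates from B itself, since redirecting it would make B its own ancestor. A toy standalone model of that rule; the edge struct and node names are illustrative, not QEMU types.]

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* Toy model: an edge from a parent node to a child node. */
    struct edge {
        const char *parent;
        const char *child;
    };

    /* Redirect every edge pointing at "from", except edges owned by
     * "to" itself, which would otherwise become self-loops. */
    static bool should_update_child(const struct edge *e, const char *to)
    {
        return strcmp(e->parent, to) != 0;
    }

    int main(void)
    {
        struct edge guest = { "guest-device", "A" };
        struct edge b_backing = { "B", "A" };   /* B's backing child is A */

        printf("redirect guest edge: %d\n", should_update_child(&guest, "B"));
        printf("redirect B->A edge:  %d\n", should_update_child(&b_backing, "B"));
        return 0;
    }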
From: Max Reitz <mreitz@redhat.com>

This patch allows the user to specify whether to use active or only
background mode for mirror block jobs. Currently, this setting will
remain constant for the duration of the entire block job.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 20180613181823.13618-14-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qapi/block-core.json      | 11 +++++++++--
 include/block/block_int.h |  4 +++-
 block/mirror.c            | 12 +++++++-----
 blockdev.c                |  9 ++++++++-
 4 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
 #          written. Both will result in identical contents.
 #          Default is true. (Since 2.4)
 #
+# @copy-mode: when to copy data to the destination; defaults to 'background'
+#             (Since: 3.0)
+#
 # Since: 1.3
 ##
 { 'struct': 'DriveMirror',
@@ -XXX,XX +XXX,XX @@
             '*speed': 'int', '*granularity': 'uint32',
             '*buf-size': 'int', '*on-source-error': 'BlockdevOnError',
             '*on-target-error': 'BlockdevOnError',
-            '*unmap': 'bool' } }
+            '*unmap': 'bool', '*copy-mode': 'MirrorCopyMode' } }
 
 ##
 # @BlockDirtyBitmap:
@@ -XXX,XX +XXX,XX @@
 #                    above @device. If this option is not given, a node name is
 #                    autogenerated. (Since: 2.9)
 #
+# @copy-mode: when to copy data to the destination; defaults to 'background'
+#             (Since: 3.0)
+#
 # Returns: nothing on success.
 #
 # Since: 2.6
@@ -XXX,XX +XXX,XX @@
             '*speed': 'int', '*granularity': 'uint32',
             '*buf-size': 'int', '*on-source-error': 'BlockdevOnError',
             '*on-target-error': 'BlockdevOnError',
-            '*filter-node-name': 'str' } }
+            '*filter-node-name': 'str',
+            '*copy-mode': 'MirrorCopyMode' } }
 
 ##
 # @block_set_io_throttle:
diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ void commit_active_start(const char *job_id, BlockDriverState *bs,
  * @filter_node_name: The node name that should be assigned to the filter
  * driver that the mirror job inserts into the graph above @bs. NULL means that
  * a node name should be autogenerated.
+ * @copy_mode: When to trigger writes to the target.
  * @errp: Error object.
  *
  * Start a mirroring operation on @bs. Clusters that are allocated
@@ -XXX,XX +XXX,XX @@ void mirror_start(const char *job_id, BlockDriverState *bs,
                   MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
-                  bool unmap, const char *filter_node_name, Error **errp);
+                  bool unmap, const char *filter_node_name,
+                  MirrorCopyMode copy_mode, Error **errp);
 
 /*
  * backup_job_create:
diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
                              const BlockJobDriver *driver,
                              bool is_none_mode, BlockDriverState *base,
                              bool auto_complete, const char *filter_node_name,
-                             bool is_mirror,
+                             bool is_mirror, MirrorCopyMode copy_mode,
                              Error **errp)
 {
     MirrorBlockJob *s;
@@ -XXX,XX +XXX,XX @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
     s->on_target_error = on_target_error;
     s->is_none_mode = is_none_mode;
     s->backing_mode = backing_mode;
-    s->copy_mode = MIRROR_COPY_MODE_BACKGROUND;
+    s->copy_mode = copy_mode;
     s->base = base;
     s->granularity = granularity;
     s->buf_size = ROUND_UP(buf_size, granularity);
@@ -XXX,XX +XXX,XX @@ void mirror_start(const char *job_id, BlockDriverState *bs,
                   MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
-                  bool unmap, const char *filter_node_name, Error **errp)
+                  bool unmap, const char *filter_node_name,
+                  MirrorCopyMode copy_mode, Error **errp)
 {
     bool is_none_mode;
     BlockDriverState *base;
@@ -XXX,XX +XXX,XX @@ void mirror_start(const char *job_id, BlockDriverState *bs,
                      speed, granularity, buf_size, backing_mode,
                      on_source_error, on_target_error, unmap, NULL, NULL,
                      &mirror_job_driver, is_none_mode, base, false,
-                     filter_node_name, true, errp);
+                     filter_node_name, true, copy_mode, errp);
 }
 
 void commit_active_start(const char *job_id, BlockDriverState *bs,
@@ -XXX,XX +XXX,XX @@ void commit_active_start(const char *job_id, BlockDriverState *bs,
                      MIRROR_LEAVE_BACKING_CHAIN,
                      on_error, on_error, true, cb, opaque,
                      &commit_active_job_driver, false, base, auto_complete,
-                     filter_node_name, false, &local_err);
+                     filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND,
+                     &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         goto error_restore_flags;
diff --git a/blockdev.c b/blockdev.c
index XXXXXXX..XXXXXXX 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -XXX,XX +XXX,XX @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
                                    bool has_unmap, bool unmap,
                                    bool has_filter_node_name,
                                    const char *filter_node_name,
+                                   bool has_copy_mode, MirrorCopyMode copy_mode,
                                    Error **errp)
 {
 
@@ -XXX,XX +XXX,XX @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
     if (!has_filter_node_name) {
         filter_node_name = NULL;
     }
+    if (!has_copy_mode) {
+        copy_mode = MIRROR_COPY_MODE_BACKGROUND;
+    }
 
     if (granularity != 0 && (granularity < 512 || granularity > 1048576 * 64)) {
         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "granularity",
@@ -XXX,XX +XXX,XX @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
                  has_replaces ? replaces : NULL,
                  speed, granularity, buf_size, sync, backing_mode,
                  on_source_error, on_target_error, unmap, filter_node_name,
-                 errp);
+                 copy_mode, errp);
 }
 
 void qmp_drive_mirror(DriveMirror *arg, Error **errp)
@@ -XXX,XX +XXX,XX @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
                            arg->has_on_target_error, arg->on_target_error,
                            arg->has_unmap, arg->unmap,
                            false, NULL,
+                           arg->has_copy_mode, arg->copy_mode,
                            &local_err);
     bdrv_unref(target_bs);
     error_propagate(errp, local_err);
@@ -XXX,XX +XXX,XX @@ void qmp_blockdev_mirror(bool has_job_id, const char *job_id,
                          BlockdevOnError on_target_error,
                          bool has_filter_node_name,
                          const char *filter_node_name,
+                         bool has_copy_mode, MirrorCopyMode copy_mode,
                          Error **errp)
 {
     BlockDriverState *bs;
@@ -XXX,XX +XXX,XX @@ void qmp_blockdev_mirror(bool has_job_id, const char *job_id,
                             has_on_target_error, on_target_error,
                             true, true,
                             has_filter_node_name, filter_node_name,
+                            has_copy_mode, copy_mode,
                             &local_err);
     error_propagate(errp, local_err);
 
--
2.13.6

From: Thomas Huth <thuth@redhat.com>

It's not working anymore since QEMU v1.3.0 - time to remove it now.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c    | 11 -----------
 qemu-doc.texi |  6 ------
 2 files changed, 17 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index XXXXXXX..XXXXXXX 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -XXX,XX +XXX,XX @@ QemuOptsList qemu_legacy_drive_opts = {
             .type = QEMU_OPT_STRING,
             .help = "chs translation (auto, lba, none)",
         },{
-            .name = "boot",
-            .type = QEMU_OPT_BOOL,
-            .help = "(deprecated, ignored)",
-        },{
             .name = "addr",
             .type = QEMU_OPT_STRING,
             .help = "pci address (virtio only)",
@@ -XXX,XX +XXX,XX @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type)
         goto fail;
     }
 
-    /* Deprecated option boot=[on|off] */
-    if (qemu_opt_get(legacy_opts, "boot") != NULL) {
-        fprintf(stderr, "qemu-kvm: boot=on|off is deprecated and will be "
-                "ignored. Future versions will reject this parameter. Please "
-                "update your scripts.\n");
-    }
-
     /* Other deprecated options */
     if (!qtest_enabled()) {
         for (i = 0; i < ARRAY_SIZE(deprecated); i++) {
diff --git a/qemu-doc.texi b/qemu-doc.texi
index XXXXXXX..XXXXXXX 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -XXX,XX +XXX,XX @@ deprecated.
 
 @section System emulator command line arguments
 
-@subsection -drive boot=on|off (since 1.3.0)
-
-The ``boot=on|off'' option to the ``-drive'' argument is
-ignored. Applications should use the ``bootindex=N'' parameter
-to set an absolute ordering between devices instead.
-
 @subsection -tdf (since 1.3.0)
 
 The ``-tdf'' argument is ignored. The behaviour implemented
--
2.13.6
From: Max Reitz <mreitz@redhat.com>

This will allow us to access the block job data when the mirror block
driver becomes more complex.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20180613181823.13618-11-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/mirror.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ typedef struct MirrorBlockJob {
     bool initial_zeroing_ongoing;
 } MirrorBlockJob;
 
+typedef struct MirrorBDSOpaque {
+    MirrorBlockJob *job;
+} MirrorBDSOpaque;
+
 struct MirrorOp {
     MirrorBlockJob *s;
     QEMUIOVector qiov;
@@ -XXX,XX +XXX,XX @@ static void mirror_exit(Job *job, void *opaque)
     MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job);
     BlockJob *bjob = &s->common;
     MirrorExitData *data = opaque;
+    MirrorBDSOpaque *bs_opaque = s->mirror_top_bs->opaque;
     AioContext *replace_aio_context = NULL;
     BlockDriverState *src = s->mirror_top_bs->backing->bs;
     BlockDriverState *target_bs = blk_bs(s->target);
@@ -XXX,XX +XXX,XX @@ static void mirror_exit(Job *job, void *opaque)
     blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, &error_abort);
     blk_insert_bs(bjob->blk, mirror_top_bs, &error_abort);
 
+    bs_opaque->job = NULL;
     job_completed(job, data->ret, NULL);
 
     g_free(data);
@@ -XXX,XX +XXX,XX @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
                              Error **errp)
 {
     MirrorBlockJob *s;
+    MirrorBDSOpaque *bs_opaque;
     BlockDriverState *mirror_top_bs;
     bool target_graph_mod;
     bool target_is_backing;
@@ -XXX,XX +XXX,XX @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
     mirror_top_bs->total_sectors = bs->total_sectors;
     mirror_top_bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED;
     mirror_top_bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED;
+    bs_opaque = g_new0(MirrorBDSOpaque, 1);
+    mirror_top_bs->opaque = bs_opaque;
     bdrv_set_aio_context(mirror_top_bs, bdrv_get_aio_context(bs));
 
     /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep
@@ -XXX,XX +XXX,XX @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
     if (!s) {
         goto fail;
     }
+    bs_opaque->job = s;
+
     /* The block job now has a reference to this node */
     bdrv_unref(mirror_top_bs);
 
@@ -XXX,XX +XXX,XX @@ fail:
 
         g_free(s->replaces);
         blk_unref(s->target);
+        bs_opaque->job = NULL;
         job_early_fail(&s->common.job);
     }
 
--
2.13.6

From: Thomas Huth <thuth@redhat.com>

It's been marked as deprecated since QEMU v2.10.0, and so far nobody
complained that we should keep it, so let's remove this legacy option
now to simplify the code quite a bit.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 vl.c            | 86 ++-------------------------------------------------------
 qemu-doc.texi   |  8 ------
 qemu-options.hx | 19 ++-----------
 3 files changed, 4 insertions(+), 109 deletions(-)

diff --git a/vl.c b/vl.c
index XXXXXXX..XXXXXXX 100644
--- a/vl.c
+++ b/vl.c
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
     const char *boot_order = NULL;
     const char *boot_once = NULL;
     DisplayState *ds;
-    int cyls, heads, secs, translation;
     QemuOpts *opts, *machine_opts;
-    QemuOpts *hda_opts = NULL, *icount_opts = NULL, *accel_opts = NULL;
+    QemuOpts *icount_opts = NULL, *accel_opts = NULL;
     QemuOptsList *olist;
     int optind;
     const char *optarg;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
 
     cpu_model = NULL;
     snapshot = 0;
-    cyls = heads = secs = 0;
-    translation = BIOS_ATA_TRANSLATION_AUTO;
 
     nb_nics = 0;
 
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
             if (optind >= argc)
                 break;
             if (argv[optind][0] != '-') {
-                hda_opts = drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
+                drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
             } else {
                 const QEMUOption *popt;
 
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
                 cpu_model = optarg;
                 break;
             case QEMU_OPTION_hda:
-                {
-                    char buf[256];
-                    if (cyls == 0)
-                        snprintf(buf, sizeof(buf), "%s", HD_OPTS);
-                    else
-                        snprintf(buf, sizeof(buf),
-                                 "%s,cyls=%d,heads=%d,secs=%d%s",
-                                 HD_OPTS , cyls, heads, secs,
-                                 translation == BIOS_ATA_TRANSLATION_LBA ?
-                                 ",trans=lba" :
-                                 translation == BIOS_ATA_TRANSLATION_NONE ?
-                                 ",trans=none" : "");
-                    drive_add(IF_DEFAULT, 0, optarg, buf);
-                    break;
-                }
             case QEMU_OPTION_hdb:
             case QEMU_OPTION_hdc:
             case QEMU_OPTION_hdd:
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
             case QEMU_OPTION_snapshot:
                 snapshot = 1;
                 break;
-            case QEMU_OPTION_hdachs:
-                {
-                    const char *p;
-                    p = optarg;
-                    cyls = strtol(p, (char **)&p, 0);
-                    if (cyls < 1 || cyls > 16383)
-                        goto chs_fail;
-                    if (*p != ',')
-                        goto chs_fail;
-                    p++;
-                    heads = strtol(p, (char **)&p, 0);
-                    if (heads < 1 || heads > 16)
-                        goto chs_fail;
-                    if (*p != ',')
-                        goto chs_fail;
-                    p++;
-                    secs = strtol(p, (char **)&p, 0);
-                    if (secs < 1 || secs > 63)
-                        goto chs_fail;
-                    if (*p == ',') {
-                        p++;
-                        if (!strcmp(p, "large")) {
-                            translation = BIOS_ATA_TRANSLATION_LARGE;
-                        } else if (!strcmp(p, "rechs")) {
-                            translation = BIOS_ATA_TRANSLATION_RECHS;
-                        } else if (!strcmp(p, "none")) {
-                            translation = BIOS_ATA_TRANSLATION_NONE;
-                        } else if (!strcmp(p, "lba")) {
-                            translation = BIOS_ATA_TRANSLATION_LBA;
-                        } else if (!strcmp(p, "auto")) {
-                            translation = BIOS_ATA_TRANSLATION_AUTO;
-                        } else {
-                            goto chs_fail;
-                        }
-                    } else if (*p != '\0') {
-                    chs_fail:
-                        error_report("invalid physical CHS format");
-                        exit(1);
-                    }
-                    if (hda_opts != NULL) {
-                        qemu_opt_set_number(hda_opts, "cyls", cyls,
-                                            &error_abort);
-                        qemu_opt_set_number(hda_opts, "heads", heads,
-                                            &error_abort);
-                        qemu_opt_set_number(hda_opts, "secs", secs,
-                                            &error_abort);
-                        if (translation == BIOS_ATA_TRANSLATION_LARGE) {
-                            qemu_opt_set(hda_opts, "trans", "large",
-                                         &error_abort);
-                        } else if (translation == BIOS_ATA_TRANSLATION_RECHS) {
-                            qemu_opt_set(hda_opts, "trans", "rechs",
-                                         &error_abort);
-                        } else if (translation == BIOS_ATA_TRANSLATION_LBA) {
-                            qemu_opt_set(hda_opts, "trans", "lba",
-                                         &error_abort);
-                        } else if (translation == BIOS_ATA_TRANSLATION_NONE) {
-                            qemu_opt_set(hda_opts, "trans", "none",
-                                         &error_abort);
-                        }
-                    }
-                }
-                error_report("'-hdachs' is deprecated, please use '-device"
-                             " ide-hd,cyls=c,heads=h,secs=s,...' instead");
-                break;
             case QEMU_OPTION_numa:
                 opts = qemu_opts_parse_noisily(qemu_find_opts("numa"),
                                                optarg, true);
diff --git a/qemu-doc.texi b/qemu-doc.texi
index XXXXXXX..XXXXXXX 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -XXX,XX +XXX,XX @@ The ``--net dump'' argument is now replaced with the
 ``-object filter-dump'' argument which works in combination
 with the modern ``-netdev`` backends instead.
 
-@subsection -hdachs (since 2.10.0)
-
-The ``-hdachs'' argument is now a synonym for setting
-the ``cyls'', ``heads'', ``secs'', and ``trans'' properties
-on the ``ide-hd'' device using the ``-device'' argument.
-The new syntax allows different settings to be provided
-per disk.
-
 @subsection -usbdevice (since 2.10.0)
 
 The ``-usbdevice DEV'' argument is now a synonym for setting
diff --git a/qemu-options.hx b/qemu-options.hx
index XXXXXXX..XXXXXXX 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -XXX,XX +XXX,XX @@ of available connectors of a given interface type.
 @item media=@var{media}
 This option defines the type of the media: disk or cdrom.
 @item cyls=@var{c},heads=@var{h},secs=@var{s}[,trans=@var{t}]
-These options have the same definition as they have in @option{-hdachs}.
-These parameters are deprecated, use the corresponding parameters
+Force disk physical geometry and the optional BIOS translation (trans=none or
+lba). These parameters are deprecated, use the corresponding parameters
 of @code{-device} instead.
 @item snapshot=@var{snapshot}
 @var{snapshot} is "on" or "off" and controls snapshot mode for the given drive
@@ -XXX,XX +XXX,XX @@ the raw disk image you use is not written back. You can however force
 the write back by pressing @key{C-a s} (@pxref{disk_images}).
 ETEXI
 
-DEF("hdachs", HAS_ARG, QEMU_OPTION_hdachs, \
-    "-hdachs c,h,s[,t]\n" \
-    "                force hard disk 0 physical geometry and the optional BIOS\n" \
-    "                translation (t=none or lba) (usually QEMU can guess them)\n",
-    QEMU_ARCH_ALL)
-STEXI
-@item -hdachs @var{c},@var{h},@var{s},[,@var{t}]
-@findex -hdachs
-Force hard disk 0 physical geometry (1 <= @var{c} <= 16383, 1 <=
-@var{h} <= 16, 1 <= @var{s} <= 63) and optionally force the BIOS
-translation mode (@var{t}=none, lba or auto). Usually QEMU can guess
-all those parameters. This option is deprecated, please use
-@code{-device ide-hd,cyls=c,heads=h,secs=s,...} instead.
-ETEXI
-
 DEF("fsdev", HAS_ARG, QEMU_OPTION_fsdev,
     "-fsdev fsdriver,id=id[,path=path,][security_model={mapped-xattr|mapped-file|passthrough|none}]\n"
     "    [,writeout=immediate][,readonly][,socket=socket|sock_fd=sock_fd][,fmode=fmode][,dmode=dmode]\n"
--
2.13.6
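[Editor's note: the MirrorBDSOpaque indirection in the first patch above is a common way to hang private state off a node while letting the job detach itself (bs_opaque->job = NULL) without freeing the node. A generic standalone sketch of that back-pointer pattern; the struct names here are illustrative, not QEMU's.]

    #include <stdio.h>
    #include <stdlib.h>

    struct job;

    /* The node owns a small opaque struct; the job registers and
     * unregisters itself through it, so node callbacks can safely
     * check job != NULL. */
    struct node_opaque {
        struct job *job;    /* NULL once the job has completed */
    };

    struct node {
        void *opaque;
    };

    struct job {
        struct node *filter_node;
    };

    int main(void)
    {
        struct node n = { .opaque = calloc(1, sizeof(struct node_opaque)) };
        struct job j = { .filter_node = &n };

        ((struct node_opaque *)n.opaque)->job = &j;     /* job starts */
        ((struct node_opaque *)n.opaque)->job = NULL;   /* job completes */

        printf("job attached: %d\n",
               ((struct node_opaque *)n.opaque)->job != NULL);
        free(n.opaque);
        return 0;
    }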
From: Greg Kurz <groug@kaod.org>

Removing a drive with drive_del while it is being used to run an I/O
intensive workload can cause QEMU to crash.

An AIO flush can yield at some point:

    blk_aio_flush_entry()
     blk_co_flush(blk)
      bdrv_co_flush(blk->root->bs)
       ...
        qemu_coroutine_yield()

and let the HMP command run, free blk->root and give control
back to the AIO flush:

    hmp_drive_del()
     blk_remove_bs()
      bdrv_root_unref_child(blk->root)
       child_bs = blk->root->bs
       bdrv_detach_child(blk->root)
        bdrv_replace_child(blk->root, NULL)
         blk->root->bs = NULL
        g_free(blk->root)  <============== blk->root becomes stale
       bdrv_unref(child_bs)
        bdrv_delete(child_bs)
         bdrv_close()
          bdrv_drained_begin()
           bdrv_do_drained_begin()
            bdrv_drain_recurse()
             aio_poll()
              ...
              qemu_coroutine_switch()

and the AIO flush completion ends up dereferencing blk->root:

    blk_aio_complete()
     scsi_aio_complete()
      blk_get_aio_context(blk)
       bs = blk_bs(blk)
       i.e. bs = blk->root ? blk->root->bs : NULL
                 ^^^^^
                 stale

The problem is that we should avoid making block driver graph
changes while we have in-flight requests. Let's drain all I/O
for this BB before calling bdrv_root_unref_child().

Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/block-backend.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ void blk_remove_bs(BlockBackend *blk)
 
     blk_update_root_state(blk);
 
+    /* bdrv_root_unref_child() will cause blk->root to become stale and may
+     * switch to a completion coroutine later on. Let's drain all I/O here
+     * to avoid that and a potential QEMU crash.
+     */
+    blk_drain(blk);
     bdrv_root_unref_child(blk->root);
     blk->root = NULL;
 }
--
2.13.6

From: Thomas Huth <thuth@redhat.com>

Looks like we forgot to announce the deprecation of these options in
the corresponding chapter of the qemu-doc text, so let's do that now.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qemu-doc.texi | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/qemu-doc.texi b/qemu-doc.texi
index XXXXXXX..XXXXXXX 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -XXX,XX +XXX,XX @@ longer be directly supported in QEMU.
 The ``-drive if=scsi'' argument is replaced by the the
 ``-device BUS-TYPE'' argument combined with ``-drive if=none''.
 
+@subsection -drive cyls=...,heads=...,secs=...,trans=... (since 2.10.0)
+
+The drive geometry arguments are replaced by the the geometry arguments
+that can be specified with the ``-device'' parameter.
+
+@subsection -drive serial=... (since 2.10.0)
+
+The drive serial argument is replaced by the the serial argument
+that can be specified with the ``-device'' parameter.
+
+@subsection -drive addr=... (since 2.10.0)
+
+The drive addr argument is replaced by the the addr argument
+that can be specified with the ``-device'' parameter.
+
 @subsection -net dump (since 2.10.0)
 
 The ``--net dump'' argument is now replaced with the
--
2.13.6
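[Editor's note: the crash analysis in the first patch above is a classic use-after-free race between an in-flight async completion and a graph change. A minimal standalone model of why draining first closes the window, with callbacks and draining reduced to plain function calls; all names here are illustrative.]

    #include <stdio.h>
    #include <stdlib.h>

    struct root { int dummy; };

    struct blk {
        struct root *root;
        void (*pending_completion)(struct blk *);
    };

    static void completion(struct blk *b)
    {
        /* Would be a use-after-free if b->root were already freed. */
        printf("completion sees root %p\n", (void *)b->root);
    }

    /* "Draining" here just means running pending completions now,
     * while the root is still valid. */
    static void blk_drain(struct blk *b)
    {
        if (b->pending_completion) {
            b->pending_completion(b);
            b->pending_completion = NULL;
        }
    }

    int main(void)
    {
        struct blk b = { malloc(sizeof(struct root)), completion };

        blk_drain(&b);      /* drain first, as the patch does ... */
        free(b.root);       /* ... then it is safe to drop the root */
        b.root = NULL;
        return 0;
    }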
For bdrv_drain(), recursively waiting for child node requests is
pointless because we didn't quiesce their parents, so new requests could
come in anyway. Letting the function work only on a single node makes it
more consistent.

For subtree drains and drain_all, we already have the recursion in
bdrv_do_drained_begin(), so the extra recursion doesn't add anything
either.

Remove the useless code.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 36 +++---------------------------------
 1 file changed, 3 insertions(+), 33 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_poll_top_level(BlockDriverState *bs,
     return bdrv_drain_poll(bs, ignore_parent);
 }
 
-static bool bdrv_drain_recurse(BlockDriverState *bs, BdrvChild *parent)
-{
-    BdrvChild *child, *tmp;
-    bool waited;
-
-    /* Wait for drained requests to finish */
-    waited = BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, parent));
-
-    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
-        BlockDriverState *bs = child->bs;
-        bool in_main_loop =
-            qemu_get_current_aio_context() == qemu_get_aio_context();
-        assert(bs->refcnt > 0);
-        if (in_main_loop) {
-            /* In case the recursive bdrv_drain_recurse processes a
-             * block_job_defer_to_main_loop BH and modifies the graph,
-             * let's hold a reference to bs until we are done.
-             *
-             * IOThread doesn't have such a BH, and it is not safe to call
-             * bdrv_unref without BQL, so skip doing it there.
-             */
-            bdrv_ref(bs);
-        }
-        waited |= bdrv_drain_recurse(bs, child);
-        if (in_main_loop) {
-            bdrv_unref(bs);
-        }
-    }
-
-    return waited;
-}
-
 static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                   BdrvChild *parent);
 static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
@@ -XXX,XX +XXX,XX @@ void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
 
     bdrv_parent_drained_begin(bs, parent);
     bdrv_drain_invoke(bs, true);
-    bdrv_drain_recurse(bs, parent);
+
+    /* Wait for drained requests to finish */
+    BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, parent));
 
     if (recursive) {
         bs->recursive_quiesce_counter++;
--
2.13.6

From: Fam Zheng <famz@redhat.com>

Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block_int.h |  1 -
 block/io.c                | 18 ------------------
 2 files changed, 19 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ bool blk_dev_is_tray_open(BlockBackend *blk);
 bool blk_dev_is_medium_locked(BlockBackend *blk);
 
 void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes);
-bool bdrv_requests_pending(BlockDriverState *bs);
 
 void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
 void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_disable_copy_on_read(BlockDriverState *bs)
     assert(old >= 1);
 }
 
-/* Check if any requests are in-flight (including throttled requests) */
-bool bdrv_requests_pending(BlockDriverState *bs)
-{
-    BdrvChild *child;
-
-    if (atomic_read(&bs->in_flight)) {
-        return true;
-    }
-
-    QLIST_FOREACH(child, &bs->children, next) {
-        if (bdrv_requests_pending(child->bs)) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
 typedef struct {
     Coroutine *co;
     BlockDriverState *bs;
--
2.13.6
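[Editor's note: after the first change above, bdrv_do_drained_begin() quiesces parents, invokes the driver callback, and then waits in a single BDRV_POLL_WHILE() loop. The control flow reduces to poll-until-condition; a standalone sketch of just that structure, with events simulated by an in-flight counter.]

    #include <stdbool.h>
    #include <stdio.h>

    static int in_flight = 3;

    static bool drain_poll(void)
    {
        return in_flight > 0;       /* still busy? */
    }

    static void aio_poll_once(void)
    {
        in_flight--;                /* pretend one request completed */
    }

    static void drained_begin(void)
    {
        while (drain_poll()) {      /* the BDRV_POLL_WHILE() analogue */
            aio_poll_once();
        }
        printf("node quiescent\n");
    }

    int main(void)
    {
        drained_begin();
        return 0;
    }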
1
From: Max Reitz <mreitz@redhat.com>
1
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2
Reviewed-by: Fam Zheng <famz@redhat.com>
3
---
4
block/io.c | 6 ++++++
5
1 file changed, 6 insertions(+)
2
6
3
With this, the mirror_top_bs is no longer just a technically required
7
diff --git a/block/io.c b/block/io.c
4
node in the BDS graph but actually represents the block job operation.
5
6
Also, drop MirrorBlockJob.source, as we can reach it through
mirror_top_bs->backing.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 20180613181823.13618-6-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/mirror.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ typedef struct MirrorBlockJob {
     BlockJob common;
     BlockBackend *target;
     BlockDriverState *mirror_top_bs;
-    BlockDriverState *source;
     BlockDriverState *base;
 
     /* The name of the graph node to replace */
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_co_read(void *opaque)
 {
     MirrorOp *op = opaque;
     MirrorBlockJob *s = op->s;
-    BlockBackend *source = s->common.blk;
     int nb_chunks;
     uint64_t ret;
     uint64_t max_bytes;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_co_read(void *opaque)
     s->bytes_in_flight += op->bytes;
     trace_mirror_one_iteration(s, op->offset, op->bytes);
 
-    ret = blk_co_preadv(source, op->offset, op->bytes, &op->qiov, 0);
+    ret = bdrv_co_preadv(s->mirror_top_bs->backing, op->offset, op->bytes,
+                         &op->qiov, 0);
     mirror_read_complete(op, ret);
 }
 
@@ -XXX,XX +XXX,XX @@ static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset,
 
 static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
-    BlockDriverState *source = s->source;
+    BlockDriverState *source = s->mirror_top_bs->backing->bs;
     MirrorOp *pseudo_op;
     int64_t offset;
     uint64_t delay_ns = 0, ret = 0;
@@ -XXX,XX +XXX,XX @@ static void mirror_exit(Job *job, void *opaque)
     BlockJob *bjob = &s->common;
     MirrorExitData *data = opaque;
     AioContext *replace_aio_context = NULL;
-    BlockDriverState *src = s->source;
+    BlockDriverState *src = s->mirror_top_bs->backing->bs;
     BlockDriverState *target_bs = blk_bs(s->target);
     BlockDriverState *mirror_top_bs = s->mirror_top_bs;
     Error *local_err = NULL;
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
 {
     int64_t offset;
     BlockDriverState *base = s->base;
-    BlockDriverState *bs = s->source;
+    BlockDriverState *bs = s->mirror_top_bs->backing->bs;
     BlockDriverState *target_bs = blk_bs(s->target);
     int ret;
     int64_t count;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_run(void *opaque)
 {
     MirrorBlockJob *s = opaque;
     MirrorExitData *data;
-    BlockDriverState *bs = s->source;
+    BlockDriverState *bs = s->mirror_top_bs->backing->bs;
     BlockDriverState *target_bs = blk_bs(s->target);
     bool need_drain = true;
     int64_t length;
@@ -XXX,XX +XXX,XX @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
     /* The block job now has a reference to this node */
    bdrv_unref(mirror_top_bs);
 
-    s->source = bs;
     s->mirror_top_bs = mirror_top_bs;
 
     /* No resize for the target either; while the mirror is still running, a
--
2.13.6

index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
     BdrvNextIterator it;
     GSList *aio_ctxs = NULL, *ctx;
 
+    /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
+     * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
+     * nodes in several different AioContexts, so make sure we're in the main
+     * context. */
+    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+
     block_job_pause_all();
 
     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
--
2.13.6
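As an illustration of the access pattern the patch above switches to (this sketch is not part of the series, and the helper name is made up): the mirror filter node sits directly above the source, so its backing BdrvChild always points at the current source node, even across graph changes.

    /* Illustrative sketch only: read from the mirror source through the
     * filter node's backing child instead of through a BlockBackend. */
    static int coroutine_fn sketch_read_source(MirrorBlockJob *s,
                                               int64_t offset, uint64_t bytes,
                                               QEMUIOVector *qiov)
    {
        /* s->mirror_top_bs->backing is a BdrvChild *, not a BlockBackend */
        return bdrv_co_preadv(s->mirror_top_bs->backing, offset, bytes,
                              qiov, 0);
    }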
We cannot allow aio_poll() in bdrv_drain_invoke(begin=true) until we're
done with propagating the drain through the graph and are doing the
single final BDRV_POLL_WHILE().

Just schedule the coroutine with the callback and increase bs->in_flight
to make sure that the polling phase will wait for it.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/io.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
 
     /* Set data->done before reading bs->wakeup. */
     atomic_mb_set(&data->done, true);
-    bdrv_wakeup(bs);
+    bdrv_dec_in_flight(bs);
+
+    if (data->begin) {
+        g_free(data);
+    }
 }
 
 /* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
 static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
 {
-    BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
+    BdrvCoDrainData *data;
 
     if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
             (!begin && !bs->drv->bdrv_co_drain_end)) {
         return;
     }
 
-    data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
-    bdrv_coroutine_enter(bs, data.co);
-    BDRV_POLL_WHILE(bs, !data.done);
+    data = g_new(BdrvCoDrainData, 1);
+    *data = (BdrvCoDrainData) {
+        .bs = bs,
+        .done = false,
+        .begin = begin
+    };
+
+    /* Make sure the driver callback completes during the polling phase for
+     * drain_begin. */
+    bdrv_inc_in_flight(bs);
+    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
+    aio_co_schedule(bdrv_get_aio_context(bs), data->co);
+
+    if (!begin) {
+        BDRV_POLL_WHILE(bs, !data->done);
+        g_free(data);
+    }
 }
 
 /* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
--
2.13.6

bdrv_drained_begin() doesn't increase bs->quiesce_counter recursively
and also doesn't notify other parent nodes of children, which both means
that the child nodes are not actually drained, and bdrv_drained_begin()
is providing useful functionality only on a single node.

To keep things consistent, we also shouldn't call the block driver
callbacks recursively.

A proper recursive drain version that provides an actually working
drained section for child nodes will be introduced later.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
---
 block/io.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
 }
 
 /* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
-static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
+static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive)
 {
     BdrvChild *child, *tmp;
     BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
 
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
     bdrv_coroutine_enter(bs, data.co);
     BDRV_POLL_WHILE(bs, !data.done);
 
-    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
-        bdrv_drain_invoke(child->bs, begin);
+    if (recursive) {
+        QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
+            bdrv_drain_invoke(child->bs, begin, true);
+        }
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
         bdrv_parent_drained_begin(bs);
     }
 
-    bdrv_drain_invoke(bs, true);
+    bdrv_drain_invoke(bs, true, false);
     bdrv_drain_recurse(bs);
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
     }
 
     /* Re-enable things in child-to-parent order */
-    bdrv_drain_invoke(bs, false);
+    bdrv_drain_invoke(bs, false, false);
     bdrv_parent_drained_end(bs);
     aio_enable_external(bdrv_get_aio_context(bs));
 }
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         aio_context_acquire(aio_context);
         aio_disable_external(aio_context);
         bdrv_parent_drained_begin(bs);
-        bdrv_drain_invoke(bs, true);
+        bdrv_drain_invoke(bs, true, true);
         aio_context_release(aio_context);
 
         if (!g_slist_find(aio_ctxs, aio_context)) {
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
 
         /* Re-enable things in child-to-parent order */
         aio_context_acquire(aio_context);
-        bdrv_drain_invoke(bs, false);
+        bdrv_drain_invoke(bs, false, true);
         bdrv_parent_drained_end(bs);
         aio_enable_external(aio_context);
         aio_context_release(aio_context);
--
2.13.6
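A condensed sketch of the defer-to-polling-phase pattern from the first patch above (simplified excerpt; the real code lives in block/io.c):

    /* The in_flight reference keeps the caller's final BDRV_POLL_WHILE()
     * spinning until bdrv_drain_invoke_entry() has run and dropped the
     * reference again with bdrv_dec_in_flight(). */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);
    /* For begin=true there is deliberately no BDRV_POLL_WHILE() here:
     * polling would recurse into aio_poll() while the drain is still
     * being propagated through the graph. */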
This adds a test case that goes wrong if bdrv_drain_invoke() calls
aio_poll().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 102 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 88 insertions(+), 14 deletions(-)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static QemuEvent done_event;
 typedef struct BDRVTestState {
     int drain_count;
     AioContext *bh_indirection_ctx;
+    bool sleep_in_drain_begin;
 } BDRVTestState;
 
 static void coroutine_fn bdrv_test_co_drain_begin(BlockDriverState *bs)
 {
     BDRVTestState *s = bs->opaque;
     s->drain_count++;
+    if (s->sleep_in_drain_begin) {
+        qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
+    }
 }
 
 static void coroutine_fn bdrv_test_co_drain_end(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_test_co_preadv(BlockDriverState *bs,
     return 0;
 }
 
+static void bdrv_test_child_perm(BlockDriverState *bs, BdrvChild *c,
+                                 const BdrvChildRole *role,
+                                 BlockReopenQueue *reopen_queue,
+                                 uint64_t perm, uint64_t shared,
+                                 uint64_t *nperm, uint64_t *nshared)
+{
+    /* bdrv_format_default_perms() accepts only these two, so disguise
+     * detach_by_driver_cb_role as one of them. */
+    if (role != &child_file && role != &child_backing) {
+        role = &child_file;
+    }
+
+    bdrv_format_default_perms(bs, c, role, reopen_queue, perm, shared,
+                              nperm, nshared);
+}
+
 static BlockDriver bdrv_test = {
     .format_name            = "test",
     .instance_size          = sizeof(BDRVTestState),
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_test = {
     .bdrv_co_drain_begin    = bdrv_test_co_drain_begin,
     .bdrv_co_drain_end      = bdrv_test_co_drain_end,
 
-    .bdrv_child_perm        = bdrv_format_default_perms,
+    .bdrv_child_perm        = bdrv_test_child_perm,
 };
 
 static void aio_ret_cb(void *opaque, int ret)
@@ -XXX,XX +XXX,XX @@ struct detach_by_parent_data {
     BdrvChild *child_b;
     BlockDriverState *c;
     BdrvChild *child_c;
+    bool by_parent_cb;
 };
+static struct detach_by_parent_data detach_by_parent_data;
 
-static void detach_by_parent_aio_cb(void *opaque, int ret)
+static void detach_indirect_bh(void *opaque)
 {
     struct detach_by_parent_data *data = opaque;
 
-    g_assert_cmpint(ret, ==, 0);
     bdrv_unref_child(data->parent_b, data->child_b);
 
     bdrv_ref(data->c);
@@ -XXX,XX +XXX,XX @@ static void detach_by_parent_aio_cb(void *opaque, int ret)
                                       &child_file, &error_abort);
 }
 
+static void detach_by_parent_aio_cb(void *opaque, int ret)
+{
+    struct detach_by_parent_data *data = &detach_by_parent_data;
+
+    g_assert_cmpint(ret, ==, 0);
+    if (data->by_parent_cb) {
+        detach_indirect_bh(data);
+    }
+}
+
+static void detach_by_driver_cb_drained_begin(BdrvChild *child)
+{
+    aio_bh_schedule_oneshot(qemu_get_current_aio_context(),
+                            detach_indirect_bh, &detach_by_parent_data);
+    child_file.drained_begin(child);
+}
+
+static BdrvChildRole detach_by_driver_cb_role;
+
 /*
  * Initial graph:
  *
@@ -XXX,XX +XXX,XX @@ static void detach_by_parent_aio_cb(void *opaque, int ret)
  *    \ /   \
  *     A     B     C
  *
- * PA has a pending write request whose callback changes the child nodes of PB:
- * It removes B and adds C instead. The subtree of PB is drained, which will
- * indirectly drain the write request, too.
+ * by_parent_cb == true:  Test that parent callbacks don't poll
+ *
+ * PA has a pending write request whose callback changes the child nodes of
+ * PB: It removes B and adds C instead. The subtree of PB is drained, which
+ * will indirectly drain the write request, too.
+ *
+ * by_parent_cb == false: Test that bdrv_drain_invoke() doesn't poll
+ *
+ * PA's BdrvChildRole has a .drained_begin callback that schedules a BH
+ * that does the same graph change. If bdrv_drain_invoke() calls it, the
+ * state is messed up, but if it is only polled in the single
+ * BDRV_POLL_WHILE() at the end of the drain, this should work fine.
  */
-static void test_detach_by_parent_cb(void)
+static void test_detach_indirect(bool by_parent_cb)
 {
     BlockBackend *blk;
     BlockDriverState *parent_a, *parent_b, *a, *b, *c;
     BdrvChild *child_a, *child_b;
     BlockAIOCB *acb;
-    struct detach_by_parent_data data;
     int aio_ret;
 
     QEMUIOVector qiov;
     struct iovec iov = {
@@ -XXX,XX +XXX,XX @@ static void test_detach_by_parent_cb(void)
     };
     qemu_iovec_init_external(&qiov, &iov, 1);
 
+    if (!by_parent_cb) {
+        detach_by_driver_cb_role = child_file;
+        detach_by_driver_cb_role.drained_begin =
+            detach_by_driver_cb_drained_begin;
+    }
+
     /* Create all involved nodes */
     parent_a = bdrv_new_open_driver(&bdrv_test, "parent-a", BDRV_O_RDWR,
                                     &error_abort);
@@ -XXX,XX +XXX,XX @@ static void test_detach_by_parent_cb(void)
     blk_insert_bs(blk, parent_a, &error_abort);
     bdrv_unref(parent_a);
 
+    /* If we want to get bdrv_drain_invoke() to call aio_poll(), the driver
+     * callback must not return immediately. */
+    if (!by_parent_cb) {
+        BDRVTestState *s = parent_a->opaque;
+        s->sleep_in_drain_begin = true;
+    }
+
     /* Set child relationships */
     bdrv_ref(b);
     bdrv_ref(a);
@@ -XXX,XX +XXX,XX @@ static void test_detach_by_parent_cb(void)
     child_a = bdrv_attach_child(parent_b, a, "PB-A", &child_backing, &error_abort);
 
     bdrv_ref(a);
-    bdrv_attach_child(parent_a, a, "PA-A", &child_file, &error_abort);
+    bdrv_attach_child(parent_a, a, "PA-A",
+                      by_parent_cb ? &child_file : &detach_by_driver_cb_role,
+                      &error_abort);
 
     g_assert_cmpint(parent_a->refcnt, ==, 1);
     g_assert_cmpint(parent_b->refcnt, ==, 1);
@@ -XXX,XX +XXX,XX @@ static void test_detach_by_parent_cb(void)
     g_assert(QLIST_NEXT(child_b, next) == NULL);
 
     /* Start the evil write request */
-    data = (struct detach_by_parent_data) {
+    detach_by_parent_data = (struct detach_by_parent_data) {
         .parent_b = parent_b,
         .child_b = child_b,
         .c = c,
+        .by_parent_cb = by_parent_cb,
     };
-    acb = blk_aio_preadv(blk, 0, &qiov, 0, detach_by_parent_aio_cb, &data);
+    acb = blk_aio_preadv(blk, 0, &qiov, 0, detach_by_parent_aio_cb, NULL);
     g_assert(acb != NULL);
 
     /* Drain and check the expected result */
     bdrv_subtree_drained_begin(parent_b);
 
-    g_assert(data.child_c != NULL);
+    g_assert(detach_by_parent_data.child_c != NULL);
 
     g_assert_cmpint(parent_a->refcnt, ==, 1);
     g_assert_cmpint(parent_b->refcnt, ==, 1);
@@ -XXX,XX +XXX,XX @@ static void test_detach_by_parent_cb(void)
     g_assert_cmpint(b->refcnt, ==, 1);
     g_assert_cmpint(c->refcnt, ==, 2);
 
-    g_assert(QLIST_FIRST(&parent_b->children) == data.child_c);
-    g_assert(QLIST_NEXT(data.child_c, next) == child_a);
+    g_assert(QLIST_FIRST(&parent_b->children) == detach_by_parent_data.child_c);
+    g_assert(QLIST_NEXT(detach_by_parent_data.child_c, next) == child_a);
     g_assert(QLIST_NEXT(child_a, next) == NULL);
 
     g_assert_cmpint(parent_a->quiesce_counter, ==, 1);
@@ -XXX,XX +XXX,XX @@ static void test_detach_by_parent_cb(void)
     bdrv_unref(c);
 }
 
+static void test_detach_by_parent_cb(void)
+{
+    test_detach_indirect(true);
+}
+
+static void test_detach_by_driver_cb(void)
+{
+    test_detach_indirect(false);
+}
+
 int main(int argc, char **argv)
 {
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/detach/drain", test_detach_by_drain);
     g_test_add_func("/bdrv-drain/detach/drain_subtree", test_detach_by_drain_subtree);
     g_test_add_func("/bdrv-drain/detach/parent_cb", test_detach_by_parent_cb);
+    g_test_add_func("/bdrv-drain/detach/driver_cb", test_detach_by_driver_cb);
 
     ret = g_test_run();
--
2.13.6

The existing test is for bdrv_drain_all_begin/end() only. Generalise the
test case so that it can be run for the other variants as well. At the
moment this is only bdrv_drain_begin/end(), but in a while, we'll add
another one.

Also, add a backing file to the test node to test whether the operations
work recursively.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 69 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 62 insertions(+), 7 deletions(-)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_test = {
     .bdrv_co_drain_begin    = bdrv_test_co_drain_begin,
     .bdrv_co_drain_end      = bdrv_test_co_drain_end,
+
+    .bdrv_child_perm        = bdrv_format_default_perms,
 };
 
 static void aio_ret_cb(void *opaque, int ret)
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
     *aio_ret = ret;
 }
 
-static void test_drv_cb_drain_all(void)
+enum drain_type {
+    BDRV_DRAIN_ALL,
+    BDRV_DRAIN,
+};
+
+static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
+{
+    switch (drain_type) {
+    case BDRV_DRAIN_ALL:        bdrv_drain_all_begin(); break;
+    case BDRV_DRAIN:            bdrv_drained_begin(bs); break;
+    default:                    g_assert_not_reached();
+    }
+}
+
+static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
+{
+    switch (drain_type) {
+    case BDRV_DRAIN_ALL:        bdrv_drain_all_end(); break;
+    case BDRV_DRAIN:            bdrv_drained_end(bs); break;
+    default:                    g_assert_not_reached();
+    }
+}
+
+static void test_drv_cb_common(enum drain_type drain_type, bool recursive)
 {
     BlockBackend *blk;
-    BlockDriverState *bs;
-    BDRVTestState *s;
+    BlockDriverState *bs, *backing;
+    BDRVTestState *s, *backing_s;
     BlockAIOCB *acb;
     int aio_ret;
 
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void)
     s = bs->opaque;
     blk_insert_bs(blk, bs, &error_abort);
 
+    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+    backing_s = backing->opaque;
+    bdrv_set_backing_hd(bs, backing, &error_abort);
+
     /* Simple bdrv_drain_all_begin/end pair, check that CBs are called */
     g_assert_cmpint(s->drain_count, ==, 0);
-    bdrv_drain_all_begin();
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    do_drain_begin(drain_type, bs);
+
     g_assert_cmpint(s->drain_count, ==, 1);
-    bdrv_drain_all_end();
+    g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
+
+    do_drain_end(drain_type, bs);
+
     g_assert_cmpint(s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
 
     /* Now do the same while a request is pending */
     aio_ret = -EINPROGRESS;
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void)
     g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
 
     g_assert_cmpint(s->drain_count, ==, 0);
-    bdrv_drain_all_begin();
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    do_drain_begin(drain_type, bs);
+
     g_assert_cmpint(aio_ret, ==, 0);
     g_assert_cmpint(s->drain_count, ==, 1);
-    bdrv_drain_all_end();
+    g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
+
+    do_drain_end(drain_type, bs);
+
     g_assert_cmpint(s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
 
+    bdrv_unref(backing);
     bdrv_unref(bs);
     blk_unref(blk);
 }
 
+static void test_drv_cb_drain_all(void)
+{
+    test_drv_cb_common(BDRV_DRAIN_ALL, true);
+}
+
+static void test_drv_cb_drain(void)
+{
+    test_drv_cb_common(BDRV_DRAIN, false);
+}
+
 int main(int argc, char **argv)
 {
     bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_init(&argc, &argv, NULL);
 
     g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
+    g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
 
     return g_test_run();
 }
--
2.13.6
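Illustrative use of the do_drain_begin()/do_drain_end() dispatch helpers added in the second patch above (the roundtrip function is hypothetical; the helpers and the enum are from the patch):

    static void sketch_drain_roundtrip(BlockDriverState *bs)
    {
        do_drain_begin(BDRV_DRAIN, bs);   /* dispatches to bdrv_drained_begin(bs) */
        /* bs is quiesced here; new requests are held back until the end */
        do_drain_end(BDRV_DRAIN, bs);     /* dispatches to bdrv_drained_end(bs) */
    }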
This is currently only working correctly for bdrv_drain(), not for
bdrv_drain_all(). Leave a comment for the drain_all case, we'll address
it later.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 130 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_detach_by_drain_subtree(void)
 }
 
 
+struct detach_by_parent_data {
+    BlockDriverState *parent_b;
+    BdrvChild *child_b;
+    BlockDriverState *c;
+    BdrvChild *child_c;
+};
+
+static void detach_by_parent_aio_cb(void *opaque, int ret)
+{
+    struct detach_by_parent_data *data = opaque;
+
+    g_assert_cmpint(ret, ==, 0);
+    bdrv_unref_child(data->parent_b, data->child_b);
+
+    bdrv_ref(data->c);
+    data->child_c = bdrv_attach_child(data->parent_b, data->c, "PB-C",
+                                      &child_file, &error_abort);
+}
+
+/*
+ * Initial graph:
+ *
+ * PA     PB
+ *    \ /   \
+ *     A     B     C
+ *
+ * PA has a pending write request whose callback changes the child nodes of PB:
+ * It removes B and adds C instead. The subtree of PB is drained, which will
+ * indirectly drain the write request, too.
+ */
+static void test_detach_by_parent_cb(void)
+{
+    BlockBackend *blk;
+    BlockDriverState *parent_a, *parent_b, *a, *b, *c;
+    BdrvChild *child_a, *child_b;
+    BlockAIOCB *acb;
+    struct detach_by_parent_data data;
+
+    QEMUIOVector qiov;
+    struct iovec iov = {
+        .iov_base = NULL,
+        .iov_len = 0,
+    };
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    /* Create all involved nodes */
+    parent_a = bdrv_new_open_driver(&bdrv_test, "parent-a", BDRV_O_RDWR,
+                                    &error_abort);
+    parent_b = bdrv_new_open_driver(&bdrv_test, "parent-b", 0,
+                                    &error_abort);
+
+    a = bdrv_new_open_driver(&bdrv_test, "a", BDRV_O_RDWR, &error_abort);
+    b = bdrv_new_open_driver(&bdrv_test, "b", BDRV_O_RDWR, &error_abort);
+    c = bdrv_new_open_driver(&bdrv_test, "c", BDRV_O_RDWR, &error_abort);
+
+    /* blk is a BB for parent-a */
+    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_insert_bs(blk, parent_a, &error_abort);
+    bdrv_unref(parent_a);
+
+    /* Set child relationships */
+    bdrv_ref(b);
+    bdrv_ref(a);
+    child_b = bdrv_attach_child(parent_b, b, "PB-B", &child_file, &error_abort);
+    child_a = bdrv_attach_child(parent_b, a, "PB-A", &child_backing, &error_abort);
+
+    bdrv_ref(a);
+    bdrv_attach_child(parent_a, a, "PA-A", &child_file, &error_abort);
+
+    g_assert_cmpint(parent_a->refcnt, ==, 1);
+    g_assert_cmpint(parent_b->refcnt, ==, 1);
+    g_assert_cmpint(a->refcnt, ==, 3);
+    g_assert_cmpint(b->refcnt, ==, 2);
+    g_assert_cmpint(c->refcnt, ==, 1);
+
+    g_assert(QLIST_FIRST(&parent_b->children) == child_a);
+    g_assert(QLIST_NEXT(child_a, next) == child_b);
+    g_assert(QLIST_NEXT(child_b, next) == NULL);
+
+    /* Start the evil write request */
+    data = (struct detach_by_parent_data) {
+        .parent_b = parent_b,
+        .child_b = child_b,
+        .c = c,
+    };
+    acb = blk_aio_preadv(blk, 0, &qiov, 0, detach_by_parent_aio_cb, &data);
+    g_assert(acb != NULL);
+
+    /* Drain and check the expected result */
+    bdrv_subtree_drained_begin(parent_b);
+
+    g_assert(data.child_c != NULL);
+
+    g_assert_cmpint(parent_a->refcnt, ==, 1);
+    g_assert_cmpint(parent_b->refcnt, ==, 1);
+    g_assert_cmpint(a->refcnt, ==, 3);
+    g_assert_cmpint(b->refcnt, ==, 1);
+    g_assert_cmpint(c->refcnt, ==, 2);
+
+    g_assert(QLIST_FIRST(&parent_b->children) == data.child_c);
+    g_assert(QLIST_NEXT(data.child_c, next) == child_a);
+    g_assert(QLIST_NEXT(child_a, next) == NULL);
+
+    g_assert_cmpint(parent_a->quiesce_counter, ==, 1);
+    g_assert_cmpint(parent_b->quiesce_counter, ==, 1);
+    g_assert_cmpint(a->quiesce_counter, ==, 1);
+    g_assert_cmpint(b->quiesce_counter, ==, 0);
+    g_assert_cmpint(c->quiesce_counter, ==, 1);
+
+    bdrv_subtree_drained_end(parent_b);
+
+    bdrv_unref(parent_b);
+    blk_unref(blk);
+
+    /* XXX Once bdrv_close() unref's children instead of just detaching them,
+     * this won't be necessary any more. */
+    bdrv_unref(a);
+    bdrv_unref(a);
+    bdrv_unref(c);
+
+    g_assert_cmpint(a->refcnt, ==, 1);
+    g_assert_cmpint(b->refcnt, ==, 1);
+    g_assert_cmpint(c->refcnt, ==, 1);
+    bdrv_unref(a);
+    bdrv_unref(b);
+    bdrv_unref(c);
+}
+
 int main(int argc, char **argv)
 {
     int ret;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/deletion/drain", test_delete_by_drain);
     g_test_add_func("/bdrv-drain/detach/drain", test_detach_by_drain);
     g_test_add_func("/bdrv-drain/detach/drain_subtree", test_detach_by_drain_subtree);
+    g_test_add_func("/bdrv-drain/detach/parent_cb", test_detach_by_parent_cb);
 
     ret = g_test_run();
     qemu_event_destroy(&done_event);
--
2.13.6

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 45 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain(void)
     test_drv_cb_common(BDRV_DRAIN, false);
 }
 
+static void test_quiesce_common(enum drain_type drain_type, bool recursive)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs, *backing;
+
+    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
+                              &error_abort);
+    blk_insert_bs(blk, bs, &error_abort);
+
+    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+    bdrv_set_backing_hd(bs, backing, &error_abort);
+
+    g_assert_cmpint(bs->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+
+    do_drain_begin(drain_type, bs);
+
+    g_assert_cmpint(bs->quiesce_counter, ==, 1);
+    g_assert_cmpint(backing->quiesce_counter, ==, !!recursive);
+
+    do_drain_end(drain_type, bs);
+
+    g_assert_cmpint(bs->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+
+    bdrv_unref(backing);
+    bdrv_unref(bs);
+    blk_unref(blk);
+}
+
+static void test_quiesce_drain_all(void)
+{
+    // XXX drain_all doesn't quiesce
+    //test_quiesce_common(BDRV_DRAIN_ALL, true);
+}
+
+static void test_quiesce_drain(void)
+{
+    test_quiesce_common(BDRV_DRAIN, false);
+}
+
 int main(int argc, char **argv)
 {
     bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
     g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
 
+    g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
+    g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
+
     return g_test_run();
 }
--
2.13.6
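The bs->quiesce_counter contract exercised by the second patch above can be summarised in a few illustrative assertions (not actual test code; bs is assumed to be a fresh test node):

    bdrv_drained_begin(bs);                          /* counter 0 -> 1 */
    bdrv_drained_begin(bs);                          /* counter 1 -> 2: sections nest */
    g_assert_cmpint(bs->quiesce_counter, ==, 2);
    bdrv_drained_end(bs);
    bdrv_drained_end(bs);
    g_assert_cmpint(bs->quiesce_counter, ==, 0);     /* balanced begin/end pairs */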
From: Max Reitz <mreitz@redhat.com>

Attach a CoQueue to each in-flight operation so if we need to wait for
any we can use it to wait instead of just blindly yielding and hoping
for some operation to wake us.

A later patch will use this infrastructure to allow requests accessing
the same area of the virtual disk to specifically wait for each other.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20180613181823.13618-4-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/mirror.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/osdep.h"
 #include "qemu/cutils.h"
+#include "qemu/coroutine.h"
 #include "trace.h"
 #include "block/blockjob_int.h"
 #include "block/block_int.h"
@@ -XXX,XX +XXX,XX @@ typedef struct MirrorBuffer {
     QSIMPLEQ_ENTRY(MirrorBuffer) next;
 } MirrorBuffer;
 
+typedef struct MirrorOp MirrorOp;
+
 typedef struct MirrorBlockJob {
     BlockJob common;
     BlockBackend *target;
@@ -XXX,XX +XXX,XX @@ typedef struct MirrorBlockJob {
     unsigned long *in_flight_bitmap;
     int in_flight;
     int64_t bytes_in_flight;
+    QTAILQ_HEAD(MirrorOpList, MirrorOp) ops_in_flight;
     int ret;
     bool unmap;
-    bool waiting_for_io;
     int target_cluster_size;
     int max_iov;
     bool initial_zeroing_ongoing;
 } MirrorBlockJob;
 
-typedef struct MirrorOp {
+struct MirrorOp {
     MirrorBlockJob *s;
     QEMUIOVector qiov;
     int64_t offset;
@@ -XXX,XX +XXX,XX @@ typedef struct MirrorOp {
     /* The pointee is set by mirror_co_read(), mirror_co_zero(), and
      * mirror_co_discard() before yielding for the first time */
     int64_t *bytes_handled;
-} MirrorOp;
+
+    CoQueue waiting_requests;
+
+    QTAILQ_ENTRY(MirrorOp) next;
+};
 
 typedef enum MirrorMethod {
     MIRROR_METHOD_COPY,
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_iteration_done(MirrorOp *op, int ret)
 
     chunk_num = op->offset / s->granularity;
     nb_chunks = DIV_ROUND_UP(op->bytes, s->granularity);
+
     bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
+    QTAILQ_REMOVE(&s->ops_in_flight, op, next);
     if (ret >= 0) {
         if (s->cow_bitmap) {
             bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_iteration_done(MirrorOp *op, int ret)
         }
     }
     qemu_iovec_destroy(&op->qiov);
-    g_free(op);
 
-    if (s->waiting_for_io) {
-        qemu_coroutine_enter(s->common.job.co);
-    }
+    qemu_co_queue_restart_all(&op->waiting_requests);
+    g_free(op);
 }
 
 static void coroutine_fn mirror_write_complete(MirrorOp *op, int ret)
@@ -XXX,XX +XXX,XX @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset,
 
 static inline void mirror_wait_for_io(MirrorBlockJob *s)
 {
-    assert(!s->waiting_for_io);
-    s->waiting_for_io = true;
-    qemu_coroutine_yield();
-    s->waiting_for_io = false;
+    MirrorOp *op;
+
+    op = QTAILQ_FIRST(&s->ops_in_flight);
+    assert(op);
+    qemu_co_queue_wait(&op->waiting_requests, NULL);
 }
 
 /* Perform a mirror copy operation.
@@ -XXX,XX +XXX,XX @@ static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset,
         .bytes          = bytes,
         .bytes_handled  = &bytes_handled,
     };
+    qemu_co_queue_init(&op->waiting_requests);
 
     switch (mirror_method) {
     case MIRROR_METHOD_COPY:
@@ -XXX,XX +XXX,XX @@ static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset,
         abort();
     }
 
+    QTAILQ_INSERT_TAIL(&s->ops_in_flight, op, next);
     qemu_coroutine_enter(co);
     /* At this point, ownership of op has been moved to the coroutine
      * and the object may already be freed */
@@ -XXX,XX +XXX,XX @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
     }
 
+    QTAILQ_INIT(&s->ops_in_flight);
+
     trace_mirror_start(bs, s, opaque);
     job_start(&s->common.job);
     return;
--
2.13.6

Block jobs already paused themselves when their main BlockBackend
entered a drained section. This is not good enough: We also want to
pause a block job and may not submit new requests if, for example, the
mirror target node should be drained.

This implements .drained_begin/end callbacks in child_job in order to
consider all block nodes related to the job, and removes the
BlockBackend callbacks which are unnecessary now because the root of the
job main BlockBackend is always referenced with a child_job, too.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockjob.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/blockjob.c b/blockjob.c
index XXXXXXX..XXXXXXX 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ static char *child_job_get_parent_desc(BdrvChild *c)
                            job->id);
 }
 
-static const BdrvChildRole child_job = {
-    .get_parent_desc    = child_job_get_parent_desc,
-    .stay_at_node       = true,
-};
-
-static void block_job_drained_begin(void *opaque)
+static void child_job_drained_begin(BdrvChild *c)
 {
-    BlockJob *job = opaque;
+    BlockJob *job = c->opaque;
     block_job_pause(job);
 }
 
-static void block_job_drained_end(void *opaque)
+static void child_job_drained_end(BdrvChild *c)
 {
-    BlockJob *job = opaque;
+    BlockJob *job = c->opaque;
     block_job_resume(job);
 }
 
-static const BlockDevOps block_job_dev_ops = {
-    .drained_begin = block_job_drained_begin,
-    .drained_end = block_job_drained_end,
+static const BdrvChildRole child_job = {
+    .get_parent_desc    = child_job_get_parent_desc,
+    .drained_begin      = child_job_drained_begin,
+    .drained_end        = child_job_drained_end,
+    .stay_at_node       = true,
 };
 
 void block_job_remove_all_bdrv(BlockJob *job)
@@ -XXX,XX +XXX,XX @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
     block_job_add_bdrv(job, "main node", bs, 0, BLK_PERM_ALL, &error_abort);
     bs->job = job;
 
-    blk_set_dev_ops(blk, &block_job_dev_ops, job);
     bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);
 
     QLIST_INSERT_HEAD(&block_jobs, job, job_list);
--
2.13.6
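The CoQueue mechanism in the first patch above replaces the single waiting_for_io flag. The two halves of the handshake, condensed from the patch (simplified excerpt, not standalone code):

    /* Waiter side (mirror_wait_for_io): park on the queue of one
     * in-flight operation instead of bare-yielding. */
    MirrorOp *op = QTAILQ_FIRST(&s->ops_in_flight);
    assert(op);
    qemu_co_queue_wait(&op->waiting_requests, NULL);

    /* Completion side (mirror_iteration_done): wake every coroutine
     * queued on this op before the op is freed. */
    qemu_co_queue_restart_all(&op->waiting_requests);
    g_free(op);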
From: Max Reitz <mreitz@redhat.com>

This patch adds two bdrv-drain tests for what happens if some BDS goes
away during the drainage.

The basic idea is that you have a parent BDS with some child nodes.
Then, you drain one of the children. Because of that, the party who
actually owns the parent decides to (A) delete it, or (B) detach all its
children from it -- both while the child is still being drained.

A real-world case where this can happen is the mirror block job, which
may exit if you drain one of its children.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 169 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_drain_subtree(void)
     test_blockjob_common(BDRV_SUBTREE_DRAIN);
 }
 
+
+typedef struct BDRVTestTopState {
+    BdrvChild *wait_child;
+} BDRVTestTopState;
+
+static void bdrv_test_top_close(BlockDriverState *bs)
+{
+    BdrvChild *c, *next_c;
+    QLIST_FOREACH_SAFE(c, &bs->children, next, next_c) {
+        bdrv_unref_child(bs, c);
+    }
+}
+
+static int coroutine_fn bdrv_test_top_co_preadv(BlockDriverState *bs,
+                                                uint64_t offset, uint64_t bytes,
+                                                QEMUIOVector *qiov, int flags)
+{
+    BDRVTestTopState *tts = bs->opaque;
+    return bdrv_co_preadv(tts->wait_child, offset, bytes, qiov, flags);
+}
+
+static BlockDriver bdrv_test_top_driver = {
+    .format_name            = "test_top_driver",
+    .instance_size          = sizeof(BDRVTestTopState),
+
+    .bdrv_close             = bdrv_test_top_close,
+    .bdrv_co_preadv         = bdrv_test_top_co_preadv,
+
+    .bdrv_child_perm        = bdrv_format_default_perms,
+};
+
+typedef struct TestCoDeleteByDrainData {
+    BlockBackend *blk;
+    bool detach_instead_of_delete;
+    bool done;
+} TestCoDeleteByDrainData;
+
+static void coroutine_fn test_co_delete_by_drain(void *opaque)
+{
+    TestCoDeleteByDrainData *dbdd = opaque;
+    BlockBackend *blk = dbdd->blk;
+    BlockDriverState *bs = blk_bs(blk);
+    BDRVTestTopState *tts = bs->opaque;
+    void *buffer = g_malloc(65536);
+    QEMUIOVector qiov;
+    struct iovec iov = {
+        .iov_base = buffer,
+        .iov_len  = 65536,
+    };
+
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    /* Pretend some internal write operation from parent to child.
+     * Important: We have to read from the child, not from the parent!
+     * Draining works by first propagating it all up the tree to the
+     * root and then waiting for drainage from root to the leaves
+     * (protocol nodes). If we have a request waiting on the root,
+     * everything will be drained before we go back down the tree, but
+     * we do not want that. We want to be in the middle of draining
+     * when this following requests returns. */
+    bdrv_co_preadv(tts->wait_child, 0, 65536, &qiov, 0);
+
+    g_assert_cmpint(bs->refcnt, ==, 1);
+
+    if (!dbdd->detach_instead_of_delete) {
+        blk_unref(blk);
+    } else {
+        BdrvChild *c, *next_c;
+        QLIST_FOREACH_SAFE(c, &bs->children, next, next_c) {
+            bdrv_unref_child(bs, c);
+        }
+    }
+
+    dbdd->done = true;
+}
+
+/**
+ * Test what happens when some BDS has some children, you drain one of
+ * them and this results in the BDS being deleted.
+ *
+ * If @detach_instead_of_delete is set, the BDS is not going to be
+ * deleted but will only detach all of its children.
+ */
+static void do_test_delete_by_drain(bool detach_instead_of_delete)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs, *child_bs, *null_bs;
+    BDRVTestTopState *tts;
+    TestCoDeleteByDrainData dbdd;
+    Coroutine *co;
+
+    bs = bdrv_new_open_driver(&bdrv_test_top_driver, "top", BDRV_O_RDWR,
+                              &error_abort);
+    bs->total_sectors = 65536 >> BDRV_SECTOR_BITS;
+    tts = bs->opaque;
+
+    null_bs = bdrv_open("null-co://", NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
+                        &error_abort);
+    bdrv_attach_child(bs, null_bs, "null-child", &child_file, &error_abort);
+
+    /* This child will be the one to pass to requests through to, and
+     * it will stall until a drain occurs */
+    child_bs = bdrv_new_open_driver(&bdrv_test, "child", BDRV_O_RDWR,
+                                    &error_abort);
+    child_bs->total_sectors = 65536 >> BDRV_SECTOR_BITS;
+    /* Takes our reference to child_bs */
+    tts->wait_child = bdrv_attach_child(bs, child_bs, "wait-child", &child_file,
+                                        &error_abort);
+
+    /* This child is just there to be deleted
+     * (for detach_instead_of_delete == true) */
+    null_bs = bdrv_open("null-co://", NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
+                        &error_abort);
+    bdrv_attach_child(bs, null_bs, "null-child", &child_file, &error_abort);
+
+    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_insert_bs(blk, bs, &error_abort);
+
+    /* Referenced by blk now */
+    bdrv_unref(bs);
+
+    g_assert_cmpint(bs->refcnt, ==, 1);
+    g_assert_cmpint(child_bs->refcnt, ==, 1);
+    g_assert_cmpint(null_bs->refcnt, ==, 1);
+
+
+    dbdd = (TestCoDeleteByDrainData){
+        .blk = blk,
+        .detach_instead_of_delete = detach_instead_of_delete,
+        .done = false,
+    };
+    co = qemu_coroutine_create(test_co_delete_by_drain, &dbdd);
+    qemu_coroutine_enter(co);
+
+    /* Drain the child while the read operation is still pending.
+     * This should result in the operation finishing and
+     * test_co_delete_by_drain() resuming. Thus, @bs will be deleted
+     * and the coroutine will exit while this drain operation is still
+     * in progress. */
+    bdrv_ref(child_bs);
+    bdrv_drain(child_bs);
+    bdrv_unref(child_bs);
+
+    while (!dbdd.done) {
+        aio_poll(qemu_get_aio_context(), true);
+    }
+
+    if (detach_instead_of_delete) {
+        /* Here, the reference has not passed over to the coroutine,
+         * so we have to delete the BB ourselves */
+        blk_unref(blk);
+    }
+}
+
+
+static void test_delete_by_drain(void)
+{
+    do_test_delete_by_drain(false);
+}
+
+static void test_detach_by_drain(void)
+{
+    do_test_delete_by_drain(true);
+}
+
+
 int main(int argc, char **argv)
 {
     int ret;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/blockjob/drain_subtree",
                     test_blockjob_drain_subtree);
 
+    g_test_add_func("/bdrv-drain/deletion", test_delete_by_drain);
+    g_test_add_func("/bdrv-drain/detach", test_detach_by_drain);
+
     ret = g_test_run();
     qemu_event_destroy(&done_event);
     return ret;
--
2.13.6

Block jobs must be paused if any of the involved nodes are drained.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/osdep.h"
 #include "block/block.h"
+#include "block/blockjob_int.h"
 #include "sysemu/block-backend.h"
 #include "qapi/error.h"
 
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
     test_quiesce_common(BDRV_DRAIN, false);
 }
 
+
+typedef struct TestBlockJob {
+    BlockJob common;
+    bool should_complete;
+} TestBlockJob;
+
+static void test_job_completed(BlockJob *job, void *opaque)
+{
+    block_job_completed(job, 0);
+}
+
+static void coroutine_fn test_job_start(void *opaque)
+{
+    TestBlockJob *s = opaque;
+
+    while (!s->should_complete) {
+        block_job_sleep_ns(&s->common, 100000);
+    }
+
+    block_job_defer_to_main_loop(&s->common, test_job_completed, NULL);
+}
+
+static void test_job_complete(BlockJob *job, Error **errp)
+{
+    TestBlockJob *s = container_of(job, TestBlockJob, common);
+    s->should_complete = true;
+}
+
+BlockJobDriver test_job_driver = {
+    .instance_size  = sizeof(TestBlockJob),
+    .start          = test_job_start,
+    .complete       = test_job_complete,
+};
+
+static void test_blockjob_common(enum drain_type drain_type)
+{
+    BlockBackend *blk_src, *blk_target;
+    BlockDriverState *src, *target;
+    BlockJob *job;
+    int ret;
+
+    src = bdrv_new_open_driver(&bdrv_test, "source", BDRV_O_RDWR,
+                               &error_abort);
+    blk_src = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_insert_bs(blk_src, src, &error_abort);
+
+    target = bdrv_new_open_driver(&bdrv_test, "target", BDRV_O_RDWR,
+                                  &error_abort);
+    blk_target = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_insert_bs(blk_target, target, &error_abort);
+
+    job = block_job_create("job0", &test_job_driver, src, 0, BLK_PERM_ALL, 0,
+                           0, NULL, NULL, &error_abort);
+    block_job_add_bdrv(job, "target", target, 0, BLK_PERM_ALL, &error_abort);
+    block_job_start(job);
+
+    g_assert_cmpint(job->pause_count, ==, 0);
+    g_assert_false(job->paused);
+    g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
+
+    do_drain_begin(drain_type, src);
+
+    if (drain_type == BDRV_DRAIN_ALL) {
+        /* bdrv_drain_all() drains both src and target, and involves an
+         * additional block_job_pause_all() */
+        g_assert_cmpint(job->pause_count, ==, 3);
+    } else {
+        g_assert_cmpint(job->pause_count, ==, 1);
+    }
+    /* XXX We don't wait until the job is actually paused. Is this okay? */
+    /* g_assert_true(job->paused); */
+    g_assert_false(job->busy); /* The job is paused */
+
+    do_drain_end(drain_type, src);
+
+    g_assert_cmpint(job->pause_count, ==, 0);
+    g_assert_false(job->paused);
+    g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
+
+    do_drain_begin(drain_type, target);
+
+    if (drain_type == BDRV_DRAIN_ALL) {
+        /* bdrv_drain_all() drains both src and target, and involves an
+         * additional block_job_pause_all() */
+        g_assert_cmpint(job->pause_count, ==, 3);
+    } else {
+        g_assert_cmpint(job->pause_count, ==, 1);
+    }
+    /* XXX We don't wait until the job is actually paused. Is this okay? */
+    /* g_assert_true(job->paused); */
+    g_assert_false(job->busy); /* The job is paused */
+
+    do_drain_end(drain_type, target);
+
+    g_assert_cmpint(job->pause_count, ==, 0);
+    g_assert_false(job->paused);
+    g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
+
+    ret = block_job_complete_sync(job, &error_abort);
+    g_assert_cmpint(ret, ==, 0);
+
+    blk_unref(blk_src);
+    blk_unref(blk_target);
+    bdrv_unref(src);
+    bdrv_unref(target);
+}
+
+static void test_blockjob_drain_all(void)
+{
+    test_blockjob_common(BDRV_DRAIN_ALL);
+}
+
+static void test_blockjob_drain(void)
+{
+    test_blockjob_common(BDRV_DRAIN);
+}
+
 int main(int argc, char **argv)
 {
     bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
 
+    g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
+    g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
+
     return g_test_run();
 }
--
2.13.6
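In short, the invariant exercised by test_blockjob_common() in the second patch above is (illustrative assertions only; job is a started block job attached to node src via child_job):

    bdrv_drained_begin(src);
    /* child_job_drained_begin() pauses the job once per drained node
     * the job is attached to */
    g_assert_cmpint(job->pause_count, ==, 1);
    bdrv_drained_end(src);
    g_assert_cmpint(job->pause_count, ==, 0);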
Before we can introduce a single polling loop for all nodes in
bdrv_drain_all_begin(), we must make sure to run it outside of coroutine
context like we already do for bdrv_do_drained_begin().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/io.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
     Coroutine *co = data->co;
     BlockDriverState *bs = data->bs;
 
-    bdrv_dec_in_flight(bs);
-    if (data->begin) {
-        bdrv_do_drained_begin(bs, data->recursive, data->parent, data->poll);
+    if (bs) {
+        bdrv_dec_in_flight(bs);
+        if (data->begin) {
+            bdrv_do_drained_begin(bs, data->recursive, data->parent, data->poll);
+        } else {
+            bdrv_do_drained_end(bs, data->recursive, data->parent);
+        }
     } else {
-        bdrv_do_drained_end(bs, data->recursive, data->parent);
+        assert(data->begin);
+        bdrv_drain_all_begin();
     }
 
     data->done = true;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
         .parent = parent,
         .poll = poll,
     };
-    bdrv_inc_in_flight(bs);
+    if (bs) {
+        bdrv_inc_in_flight(bs);
+    }
     aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                             bdrv_co_drain_bh_cb, &data);
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
     BlockDriverState *bs;
     BdrvNextIterator it;
 
+    if (qemu_in_coroutine()) {
+        bdrv_co_yield_to_drain(NULL, true, false, NULL, true);
+        return;
+    }
+
     /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
      * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
      * nodes in several different AioContexts, so make sure we're in the main
--
2.13.6

Block jobs are already paused using the BdrvChildRole drain callbacks,
so we don't need an additional block_job_pause_all() call.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/io.c              |  4 ----
 tests/test-bdrv-drain.c | 10 ++++------
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
      * context. */
     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 
-    block_job_pause_all();
-
     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         aio_enable_external(aio_context);
         aio_context_release(aio_context);
     }
-
-    block_job_resume_all();
 }
 
 void bdrv_drain_all(void)
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type)
     do_drain_begin(drain_type, src);
 
     if (drain_type == BDRV_DRAIN_ALL) {
-        /* bdrv_drain_all() drains both src and target, and involves an
-         * additional block_job_pause_all() */
-        g_assert_cmpint(job->pause_count, ==, 3);
+        /* bdrv_drain_all() drains both src and target */
+        g_assert_cmpint(job->pause_count, ==, 2);
     } else {
         g_assert_cmpint(job->pause_count, ==, 1);
     }
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type)
     do_drain_begin(drain_type, target);
 
     if (drain_type == BDRV_DRAIN_ALL) {
-        /* bdrv_drain_all() drains both src and target, and involves an
-         * additional block_job_pause_all() */
-        g_assert_cmpint(job->pause_count, ==, 3);
+        /* bdrv_drain_all() drains both src and target */
+        g_assert_cmpint(job->pause_count, ==, 2);
     } else {
         g_assert_cmpint(job->pause_count, ==, 1);
     }
--
2.13.6
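The coroutine bounce used by the first patch above follows the same shape as the existing bdrv_drained_begin() path; schematically (simplified from the patch, where NULL stands for "no single node" in the drain_all case):

    void bdrv_drain_all_begin(void)
    {
        if (qemu_in_coroutine()) {
            /* Defer to a bottom half in the right context: the polling
             * loop (BDRV_POLL_WHILE()/aio_poll()) must not run in
             * coroutine context. */
            bdrv_co_yield_to_drain(NULL, true, false, NULL, true);
            return;
        }
        /* ... safe to quiesce all nodes and poll from here ... */
    }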
bdrv_do_drained_begin() is only safe if we have a single
BDRV_POLL_WHILE() after quiescing all affected nodes. We cannot allow
that parent callbacks introduce a nested polling loop that could cause
graph changes while we're traversing the graph.

Split off bdrv_do_drained_begin_quiesce(), which only quiesces a single
node without waiting for its requests to complete. These requests will
be waited for in the BDRV_POLL_WHILE() call down the call chain.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block.h |  9 +++++++++
 block.c               |  2 +-
 block/io.c            | 24 ++++++++++++++--------
 3 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
 void bdrv_drained_begin(BlockDriverState *bs);
 
 /**
+ * bdrv_do_drained_begin_quiesce:
+ *
+ * Quiesces a BDS like bdrv_drained_begin(), but does not wait for already
+ * running requests to complete.
+ */
+void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
+                                   BdrvChild *parent);
+
+/**
  * Like bdrv_drained_begin, but recursively begins a quiesced section for
  * exclusive access to all child nodes as well.
  */
diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ static char *bdrv_child_get_parent_desc(BdrvChild *c)
 static void bdrv_child_cb_drained_begin(BdrvChild *child)
 {
     BlockDriverState *bs = child->opaque;
-    bdrv_drained_begin(bs);
+    bdrv_do_drained_begin_quiesce(bs, NULL);
 }
 
 static bool bdrv_child_cb_drained_poll(BdrvChild *child)
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
     assert(data.done);
 }
 
-void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
-                           BdrvChild *parent, bool poll)
+void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
+                                   BdrvChild *parent)
 {
-    BdrvChild *child, *next;
-
-    if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, true, recursive, parent, poll);
-        return;
-    }
+    assert(!qemu_in_coroutine());
 
     /* Stop things in parent-to-child order */
     if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
@@ -XXX,XX +XXX,XX @@ void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
 
     bdrv_parent_drained_begin(bs, parent);
     bdrv_drain_invoke(bs, true);
+}
+
+static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
+                                  BdrvChild *parent, bool poll)
+{
+    BdrvChild *child, *next;
+
+    if (qemu_in_coroutine()) {
+        bdrv_co_yield_to_drain(bs, true, recursive, parent, poll);
+        return;
+    }
+
+    bdrv_do_drained_begin_quiesce(bs, parent);
 
     if (recursive) {
         bs->recursive_quiesce_counter++;
--
2.13.6

bdrv_do_drained_begin() restricts the call of parent callbacks and
aio_disable_external() to the outermost drain section, but the block
driver callbacks are always called. bdrv_do_drained_end() must match
this behaviour, otherwise nodes stay drained even if begin/end calls
were balanced.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/io.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
 
 void bdrv_drained_end(BlockDriverState *bs)
 {
+    int old_quiesce_counter;
+
     if (qemu_in_coroutine()) {
         bdrv_co_yield_to_drain(bs, false);
         return;
     }
     assert(bs->quiesce_counter > 0);
-    if (atomic_fetch_dec(&bs->quiesce_counter) > 1) {
-        return;
-    }
+    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
 
     /* Re-enable things in child-to-parent order */
     bdrv_drain_invoke(bs, false, false);
-    bdrv_parent_drained_end(bs);
-    aio_enable_external(bdrv_get_aio_context(bs));
+    if (old_quiesce_counter == 1) {
+        bdrv_parent_drained_end(bs);
+        aio_enable_external(bdrv_get_aio_context(bs));
+    }
 }
 
 /*
--
2.13.6

From: Max Reitz <mreitz@redhat.com>

This new function allows to look for a consecutively dirty area in a
dirty bitmap.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 20180613181823.13618-10-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 include/block/dirty-bitmap.h |  2 ++
 block/dirty-bitmap.c         | 55 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -XXX,XX +XXX,XX @@ void bdrv_set_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap,
 void bdrv_reset_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap,
                                     int64_t offset, int64_t bytes);
 int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter);
+bool bdrv_dirty_iter_next_area(BdrvDirtyBitmapIter *iter, uint64_t max_offset,
+                               uint64_t *offset, int *bytes);
 void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t offset);
 int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap);
 int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap);
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index XXXXXXX..XXXXXXX 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -XXX,XX +XXX,XX @@ int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter)
     return hbitmap_iter_next(&iter->hbi, true);
 }
 
+/**
+ * Return the next consecutively dirty area in the dirty bitmap
+ * belonging to the given iterator @iter.
+ *
+ * @max_offset: Maximum value that may be returned for
+ *              *offset + *bytes
+ * @offset:     Will contain the start offset of the next dirty area
+ * @bytes:      Will contain the length of the next dirty area
+ *
+ * Returns: True if a dirty area could be found before max_offset
+ *          (which means that *offset and *bytes then contain valid
+ *          values), false otherwise.
+ *
+ * Note that @iter is never advanced if false is returned. If an area
+ * is found (which means that true is returned), it will be advanced
+ * past that area.
+ */
+bool bdrv_dirty_iter_next_area(BdrvDirtyBitmapIter *iter, uint64_t max_offset,
+                               uint64_t *offset, int *bytes)
+{
+    uint32_t granularity = bdrv_dirty_bitmap_granularity(iter->bitmap);
+    uint64_t gran_max_offset;
+    int64_t ret;
+    int size;
+
+    if (max_offset == iter->bitmap->size) {
+        /* If max_offset points to the image end, round it up by the
+         * bitmap granularity */
+        gran_max_offset = ROUND_UP(max_offset, granularity);
+    } else {
+        gran_max_offset = max_offset;
+    }
+
+    ret = hbitmap_iter_next(&iter->hbi, false);
+    if (ret < 0 || ret + granularity > gran_max_offset) {
+        return false;
+    }
+
+    *offset = ret;
+    size = 0;
+
+    assert(granularity <= INT_MAX);
+
+    do {
+        /* Advance iterator */
+        ret = hbitmap_iter_next(&iter->hbi, true);
+        size += granularity;
+    } while (ret + granularity <= gran_max_offset &&
+             hbitmap_iter_next(&iter->hbi, false) == ret + granularity &&
+             size <= INT_MAX - granularity);
+
+    *bytes = MIN(size, max_offset - *offset);
+    return true;
+}
+
 /* Called within bdrv_dirty_bitmap_lock..unlock */
 void bdrv_set_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap,
                                   int64_t offset, int64_t bytes)
--
2.13.6

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
 enum drain_type {
     BDRV_DRAIN_ALL,
     BDRV_DRAIN,
+    DRAIN_TYPE_MAX,
 };
 
 static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
     test_quiesce_common(BDRV_DRAIN, false);
 }
 
+static void test_nested(void)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs, *backing;
+    BDRVTestState *s, *backing_s;
+    enum drain_type outer, inner;
+
+    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
+                              &error_abort);
+    s = bs->opaque;
+    blk_insert_bs(blk, bs, &error_abort);
+
+    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+    backing_s = backing->opaque;
+    bdrv_set_backing_hd(bs, backing, &error_abort);
+
+    for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) {
+        for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) {
+            /* XXX bdrv_drain_all() doesn't increase the quiesce_counter */
+            int bs_quiesce      = (outer != BDRV_DRAIN_ALL) +
+                                  (inner != BDRV_DRAIN_ALL);
+            int backing_quiesce = 0;
+            int backing_cb_cnt  = (outer != BDRV_DRAIN) +
+                                  (inner != BDRV_DRAIN);
+
+            g_assert_cmpint(bs->quiesce_counter, ==, 0);
+            g_assert_cmpint(backing->quiesce_counter, ==, 0);
+            g_assert_cmpint(s->drain_count, ==, 0);
+            g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+            do_drain_begin(outer, bs);
+            do_drain_begin(inner, bs);
+
+            g_assert_cmpint(bs->quiesce_counter, ==, bs_quiesce);
+            g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce);
+            g_assert_cmpint(s->drain_count, ==, 2);
+            g_assert_cmpint(backing_s->drain_count, ==, backing_cb_cnt);
+
+            do_drain_end(inner, bs);
+            do_drain_end(outer, bs);
+
+            g_assert_cmpint(bs->quiesce_counter, ==, 0);
+            g_assert_cmpint(backing->quiesce_counter, ==, 0);
+            g_assert_cmpint(s->drain_count, ==, 0);
+            g_assert_cmpint(backing_s->drain_count, ==, 0);
+        }
+    }
+
+    bdrv_unref(backing);
+    bdrv_unref(bs);
+    blk_unref(blk);
+}
+
 typedef struct TestBlockJob {
     BlockJob common;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
 
+    g_test_add_func("/bdrv-drain/nested", test_nested);
+
     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
 
--
2.13.6
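As a usage note on the interface documented above: a caller simply loops until the function returns false, consuming one consecutively dirty area per call. A minimal sketch follows; it only builds inside the QEMU tree, assumes bdrv_dirty_iter_new()/bdrv_dirty_iter_free() as the existing iterator constructor and destructor, and handle_area() is a hypothetical consumer, not a QEMU function:

    static void walk_dirty_areas(BdrvDirtyBitmap *bitmap, uint64_t end,
                                 void (*handle_area)(uint64_t offset, int bytes))
    {
        BdrvDirtyBitmapIter *iter = bdrv_dirty_iter_new(bitmap);
        uint64_t offset;
        int bytes;

        /* Each successful call yields one consecutively dirty area that
         * ends at or before @end and advances the iterator past it; when
         * it returns false, the iterator has not been advanced. */
        while (bdrv_dirty_iter_next_area(iter, end, &offset, &bytes)) {
            handle_area(offset, bytes);
        }

        bdrv_dirty_iter_free(iter);
    }

Since @bytes is an int, a single returned area is capped at INT_MAX bytes even if a longer run is dirty; the next call continues where the previous area ended.
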
In the future, bdrv_drained_all_begin/end() will drain all individual
nodes separately rather than whole subtrees. This means that we don't
want to propagate the drain to all parents any more: If the parent is a
BDS, it will already be drained separately. Recursing to all parents is
unnecessary work and would make it an O(n²) operation.

Prepare the drain function for the changed drain_all by adding an
ignore_bds_parents parameter to the internal implementation that
prevents the propagation of the drain to BDS parents. We still (have to)
propagate it to non-BDS parents like BlockBackends or Jobs because those
are not drained separately.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block.h     | 16 ++++++---
 include/block/block_int.h |  6 ++++
 block.c                   | 11 +++---
 block/io.c                | 88 ++++++++++++++++++++++++++++-------------------
 block/vvfat.c             |  1 +
 5 files changed, 78 insertions(+), 44 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ void bdrv_io_unplug(BlockDriverState *bs);
  * Begin a quiesced section of all users of @bs. This is part of
  * bdrv_drained_begin.
  */
-void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore);
+void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
+                               bool ignore_bds_parents);
 
 /**
  * bdrv_parent_drained_end:
@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore);
  * End a quiesced section of all users of @bs. This is part of
  * bdrv_drained_end.
  */
-void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
+void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
+                             bool ignore_bds_parents);
 
 /**
  * bdrv_drain_poll:
  *
  * Poll for pending requests in @bs, its parents (except for @ignore_parent),
- * and if @recursive is true its children as well.
+ * and if @recursive is true its children as well (used for subtree drain).
+ *
+ * If @ignore_bds_parents is true, parents that are BlockDriverStates must
+ * ignore the drain request because they will be drained separately (used for
+ * drain_all).
  *
  * This is part of bdrv_drained_begin.
  */
 bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
-                     BdrvChild *ignore_parent);
+                     BdrvChild *ignore_parent, bool ignore_bds_parents);
 
 /**
  * bdrv_drained_begin:
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs);
  * running requests to complete.
  */
 void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
-                                   BdrvChild *parent);
+                                   BdrvChild *parent, bool ignore_bds_parents);
 
 /**
  * Like bdrv_drained_begin, but recursively begins a quiesced section for
diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ struct BdrvChildRole {
      * points to. */
     bool stay_at_node;
 
+    /* If true, the parent is a BlockDriverState and bdrv_next_all_states()
+     * will return it. This information is used for drain_all, where every node
+     * will be drained separately, so the drain only needs to be propagated to
+     * non-BDS parents. */
+    bool parent_is_bds;
+
     void (*inherit_options)(int *child_flags, QDict *child_options,
                             int parent_flags, QDict *parent_options);
 
diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ static char *bdrv_child_get_parent_desc(BdrvChild *c)
 static void bdrv_child_cb_drained_begin(BdrvChild *child)
 {
     BlockDriverState *bs = child->opaque;
-    bdrv_do_drained_begin_quiesce(bs, NULL);
+    bdrv_do_drained_begin_quiesce(bs, NULL, false);
 }
 
 static bool bdrv_child_cb_drained_poll(BdrvChild *child)
 {
     BlockDriverState *bs = child->opaque;
-    return bdrv_drain_poll(bs, false, NULL);
+    return bdrv_drain_poll(bs, false, NULL, false);
 }
 
 static void bdrv_child_cb_drained_end(BdrvChild *child)
@@ -XXX,XX +XXX,XX @@ static void bdrv_inherited_options(int *child_flags, QDict *child_options,
 }
 
 const BdrvChildRole child_file = {
+    .parent_is_bds   = true,
     .get_parent_desc = bdrv_child_get_parent_desc,
     .inherit_options = bdrv_inherited_options,
     .drained_begin   = bdrv_child_cb_drained_begin,
@@ -XXX,XX +XXX,XX @@ static void bdrv_inherited_fmt_options(int *child_flags, QDict *child_options,
 }
 
 const BdrvChildRole child_format = {
+    .parent_is_bds   = true,
     .get_parent_desc = bdrv_child_get_parent_desc,
     .inherit_options = bdrv_inherited_fmt_options,
     .drained_begin   = bdrv_child_cb_drained_begin,
@@ -XXX,XX +XXX,XX @@ static int bdrv_backing_update_filename(BdrvChild *c, BlockDriverState *base,
 }
 
 const BdrvChildRole child_backing = {
+    .parent_is_bds   = true,
     .get_parent_desc = bdrv_child_get_parent_desc,
     .attach          = bdrv_backing_attach,
     .detach          = bdrv_backing_detach,
@@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
     AioContext *ctx = bdrv_get_aio_context(bs);
 
     aio_disable_external(ctx);
-    bdrv_parent_drained_begin(bs, NULL);
+    bdrv_parent_drained_begin(bs, NULL, false);
     bdrv_drain(bs); /* ensure there are no in-flight requests */
 
     while (aio_poll(ctx, false)) {
@@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
      */
     aio_context_acquire(new_context);
     bdrv_attach_aio_context(bs, new_context);
-    bdrv_parent_drained_end(bs, NULL);
+    bdrv_parent_drained_end(bs, NULL, false);
     aio_enable_external(ctx);
     aio_context_release(new_context);
 }
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
     int64_t offset, int bytes, BdrvRequestFlags flags);
 
-void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
+void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
+                               bool ignore_bds_parents)
 {
     BdrvChild *c, *next;
 
     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
-        if (c == ignore) {
+        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
             continue;
         }
         if (c->role->drained_begin) {
@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
     }
 }
 
-void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
+void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
+                             bool ignore_bds_parents)
 {
     BdrvChild *c, *next;
 
     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
-        if (c == ignore) {
+        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
             continue;
         }
         if (c->role->drained_end) {
@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
     }
 }
 
-static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore)
+static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
+                                     bool ignore_bds_parents)
 {
     BdrvChild *c, *next;
     bool busy = false;
 
     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
-        if (c == ignore) {
+        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
             continue;
         }
         if (c->role->drained_poll) {
@@ -XXX,XX +XXX,XX @@ typedef struct {
     bool recursive;
     bool poll;
     BdrvChild *parent;
+    bool ignore_bds_parents;
 } BdrvCoDrainData;
 
 static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
 
 /* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
 bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
-                     BdrvChild *ignore_parent)
+                     BdrvChild *ignore_parent, bool ignore_bds_parents)
 {
     BdrvChild *child, *next;
 
-    if (bdrv_parent_drained_poll(bs, ignore_parent)) {
+    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
         return true;
     }
 
@@ -XXX,XX +XXX,XX @@ bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
     }
 
     if (recursive) {
+        assert(!ignore_bds_parents);
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
-            if (bdrv_drain_poll(child->bs, recursive, child)) {
+            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
                 return true;
             }
         }
     }
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
      * have executed. */
     while (aio_poll(bs->aio_context, false));
 
-    return bdrv_drain_poll(bs, recursive, ignore_parent);
+    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
 }
 
 static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
-                                  BdrvChild *parent, bool poll);
+                                  BdrvChild *parent, bool ignore_bds_parents,
+                                  bool poll);
 static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
-                                BdrvChild *parent);
+                                BdrvChild *parent, bool ignore_bds_parents);
 
 static void bdrv_co_drain_bh_cb(void *opaque)
 {
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
     if (bs) {
         bdrv_dec_in_flight(bs);
         if (data->begin) {
-            bdrv_do_drained_begin(bs, data->recursive, data->parent, data->poll);
+            bdrv_do_drained_begin(bs, data->recursive, data->parent,
+                                  data->ignore_bds_parents, data->poll);
         } else {
-            bdrv_do_drained_end(bs, data->recursive, data->parent);
+            bdrv_do_drained_end(bs, data->recursive, data->parent,
+                                data->ignore_bds_parents);
         }
     } else {
         assert(data->begin);
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
 
 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                 bool begin, bool recursive,
-                                                BdrvChild *parent, bool poll)
+                                                BdrvChild *parent,
+                                                bool ignore_bds_parents,
+                                                bool poll)
 {
     BdrvCoDrainData data;
 
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
         .begin = begin,
         .recursive = recursive,
         .parent = parent,
+        .ignore_bds_parents = ignore_bds_parents,
         .poll = poll,
     };
     if (bs) {
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
 }
 
 void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
-                                   BdrvChild *parent)
+                                   BdrvChild *parent, bool ignore_bds_parents)
 {
     assert(!qemu_in_coroutine());
 
@@ -XXX,XX +XXX,XX @@ void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
         aio_disable_external(bdrv_get_aio_context(bs));
     }
 
-    bdrv_parent_drained_begin(bs, parent);
+    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
     bdrv_drain_invoke(bs, true);
 }
 
 static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
-                                  BdrvChild *parent, bool poll)
+                                  BdrvChild *parent, bool ignore_bds_parents,
+                                  bool poll)
 {
     BdrvChild *child, *next;
 
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, true, recursive, parent, poll);
+        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
+                               poll);
         return;
     }
 
-    bdrv_do_drained_begin_quiesce(bs, parent);
+    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);
 
     if (recursive) {
+        assert(!ignore_bds_parents);
         bs->recursive_quiesce_counter++;
         QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
-            bdrv_do_drained_begin(child->bs, true, child, false);
+            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
+                                  false);
         }
     }
 
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
      * nodes.
      */
     if (poll) {
+        assert(!ignore_bds_parents);
         BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
     }
 }
 
 void bdrv_drained_begin(BlockDriverState *bs)
 {
-    bdrv_do_drained_begin(bs, false, NULL, true);
+    bdrv_do_drained_begin(bs, false, NULL, false, true);
 }
 
 void bdrv_subtree_drained_begin(BlockDriverState *bs)
 {
-    bdrv_do_drained_begin(bs, true, NULL, true);
+    bdrv_do_drained_begin(bs, true, NULL, false, true);
 }
 
-void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
-                         BdrvChild *parent)
+static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
+                                BdrvChild *parent, bool ignore_bds_parents)
 {
     BdrvChild *child, *next;
     int old_quiesce_counter;
 
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, false, recursive, parent, false);
+        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
+                               false);
         return;
     }
     assert(bs->quiesce_counter > 0);
@@ -XXX,XX +XXX,XX @@ void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
 
     /* Re-enable things in child-to-parent order */
     bdrv_drain_invoke(bs, false);
-    bdrv_parent_drained_end(bs, parent);
+    bdrv_parent_drained_end(bs, parent, ignore_bds_parents);
     if (old_quiesce_counter == 1) {
         aio_enable_external(bdrv_get_aio_context(bs));
     }
 
     if (recursive) {
+        assert(!ignore_bds_parents);
         bs->recursive_quiesce_counter--;
         QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
-            bdrv_do_drained_end(child->bs, true, child);
+            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents);
         }
     }
 }
 
 void bdrv_drained_end(BlockDriverState *bs)
 {
-    bdrv_do_drained_end(bs, false, NULL);
+    bdrv_do_drained_end(bs, false, NULL, false);
 }
 
 void bdrv_subtree_drained_end(BlockDriverState *bs)
 {
-    bdrv_do_drained_end(bs, true, NULL);
+    bdrv_do_drained_end(bs, true, NULL, false);
 }
 
 void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
@@ -XXX,XX +XXX,XX @@ void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
     int i;
 
     for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
-        bdrv_do_drained_begin(child->bs, true, child, true);
+        bdrv_do_drained_begin(child->bs, true, child, false, true);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
     int i;
 
     for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
-        bdrv_do_drained_end(child->bs, true, child);
+        bdrv_do_drained_end(child->bs, true, child, false);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
     BdrvNextIterator it;
 
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(NULL, true, false, NULL, true);
+        bdrv_co_yield_to_drain(NULL, true, false, NULL, false, true);
         return;
     }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
         aio_context_acquire(aio_context);
-        bdrv_do_drained_begin(bs, true, NULL, true);
+        bdrv_do_drained_begin(bs, true, NULL, false, true);
         aio_context_release(aio_context);
     }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
         aio_context_acquire(aio_context);
-        bdrv_do_drained_end(bs, true, NULL);
+        bdrv_do_drained_end(bs, true, NULL, false);
         aio_context_release(aio_context);
     }
 }
diff --git a/block/vvfat.c b/block/vvfat.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -XXX,XX +XXX,XX @@ static void vvfat_qcow_options(int *child_flags, QDict *child_options,
 }
 
 static const BdrvChildRole child_vvfat_qcow = {
+    .parent_is_bds = true,
     .inherit_options = vvfat_qcow_options,
 };
 
--
2.13.6

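The skip rule added above is compact but easy to misread, so here is a self-contained model of it (all types and names below are invented for illustration, not QEMU APIs): with ignore_bds_parents set, BDS parents are skipped because drain_all will quiesce every node individually anyway, while non-BDS parents such as BlockBackends and jobs are still notified.

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct Parent {
        const char *name;
        bool parent_is_bds;    /* mirrors BdrvChildRole.parent_is_bds */
    } Parent;

    static void parents_drained_begin(Parent *parents, int n,
                                      bool ignore_bds_parents)
    {
        for (int i = 0; i < n; i++) {
            if (ignore_bds_parents && parents[i].parent_is_bds) {
                continue;    /* will be drained separately by drain_all */
            }
            printf("quiesce parent: %s\n", parents[i].name);
        }
    }

    int main(void)
    {
        Parent parents[] = {
            { "qcow2 node (BDS parent)", true },
            { "BlockBackend of a guest device", false },
        };

        /* drain_all case: only the BlockBackend is notified here */
        parents_drained_begin(parents, 2, true);
        return 0;
    }

This also shows why the propagation has to stay in place for non-BDS parents: nothing else would quiesce them.
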
Anything can happen inside BDRV_POLL_WHILE(), including graph
changes that may interfere with its callers (e.g. child list iteration
in recursive callers of bdrv_do_drained_begin).

Switch to a single BDRV_POLL_WHILE() call for the whole subtree at the
end of bdrv_do_drained_begin() to avoid such effects. The recursion
now happens inside the loop condition. As the graph can only change
between bdrv_drain_poll() calls, but not inside of it, doing the
recursion here is safe.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block.h |  9 +++++---
 block.c               |  2 +-
 block/io.c            | 63 ++++++++++++++++++++++++++++++---------------
 3 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
 
 /**
  * bdrv_drain_poll:
  *
- * Poll for pending requests in @bs and its parents (except for
- * @ignore_parent). This is part of bdrv_drained_begin.
+ * Poll for pending requests in @bs, its parents (except for @ignore_parent),
+ * and if @recursive is true its children as well.
+ *
+ * This is part of bdrv_drained_begin.
  */
-bool bdrv_drain_poll(BlockDriverState *bs, BdrvChild *ignore_parent);
+bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
+                     BdrvChild *ignore_parent);
 
 /**
  * bdrv_drained_begin:
diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_child_cb_drained_begin(BdrvChild *child)
 static bool bdrv_child_cb_drained_poll(BdrvChild *child)
 {
     BlockDriverState *bs = child->opaque;
-    return bdrv_drain_poll(bs, NULL);
+    return bdrv_drain_poll(bs, false, NULL);
 }
 
 static void bdrv_child_cb_drained_end(BdrvChild *child)
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ typedef struct {
     bool done;
     bool begin;
     bool recursive;
+    bool poll;
     BdrvChild *parent;
 } BdrvCoDrainData;
 
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
 }
 
 /* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
-bool bdrv_drain_poll(BlockDriverState *bs, BdrvChild *ignore_parent)
+bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
+                     BdrvChild *ignore_parent)
 {
+    BdrvChild *child, *next;
+
     if (bdrv_parent_drained_poll(bs, ignore_parent)) {
         return true;
     }
 
-    return atomic_read(&bs->in_flight);
+    if (atomic_read(&bs->in_flight)) {
+        return true;
+    }
+
+    if (recursive) {
+        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
+            if (bdrv_drain_poll(child->bs, recursive, child)) {
+                return true;
+            }
+        }
+    }
+
+    return false;
 }
 
-static bool bdrv_drain_poll_top_level(BlockDriverState *bs,
+static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
                                       BdrvChild *ignore_parent)
 {
     /* Execute pending BHs first and check everything else only after the BHs
      * have executed. */
     while (aio_poll(bs->aio_context, false));
 
-    return bdrv_drain_poll(bs, ignore_parent);
+    return bdrv_drain_poll(bs, recursive, ignore_parent);
 }
 
 static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
-                                  BdrvChild *parent);
+                                  BdrvChild *parent, bool poll);
 static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                 BdrvChild *parent);
 
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
 
     bdrv_dec_in_flight(bs);
     if (data->begin) {
-        bdrv_do_drained_begin(bs, data->recursive, data->parent);
+        bdrv_do_drained_begin(bs, data->recursive, data->parent, data->poll);
     } else {
         bdrv_do_drained_end(bs, data->recursive, data->parent);
     }
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
 
 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                 bool begin, bool recursive,
-                                                BdrvChild *parent)
+                                                BdrvChild *parent, bool poll)
 {
     BdrvCoDrainData data;
 
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
         .begin = begin,
         .recursive = recursive,
         .parent = parent,
+        .poll = poll,
     };
     bdrv_inc_in_flight(bs);
     aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
 }
 
 void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
-                           BdrvChild *parent)
+                           BdrvChild *parent, bool poll)
 {
     BdrvChild *child, *next;
 
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, true, recursive, parent);
+        bdrv_co_yield_to_drain(bs, true, recursive, parent, poll);
         return;
     }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
     bdrv_parent_drained_begin(bs, parent);
     bdrv_drain_invoke(bs, true);
 
-    /* Wait for drained requests to finish */
-    BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, parent));
-
     if (recursive) {
         bs->recursive_quiesce_counter++;
         QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
-            bdrv_do_drained_begin(child->bs, true, child);
+            bdrv_do_drained_begin(child->bs, true, child, false);
         }
     }
+
+    /*
+     * Wait for drained requests to finish.
+     *
+     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
+     * call is needed so things in this AioContext can make progress even
+     * though we don't return to the main AioContext loop - this automatically
+     * includes other nodes in the same AioContext and therefore all child
+     * nodes.
+     */
+    if (poll) {
+        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
+    }
 }
 
 void bdrv_drained_begin(BlockDriverState *bs)
 {
-    bdrv_do_drained_begin(bs, false, NULL);
+    bdrv_do_drained_begin(bs, false, NULL, true);
 }
 
 void bdrv_subtree_drained_begin(BlockDriverState *bs)
 {
-    bdrv_do_drained_begin(bs, true, NULL);
+    bdrv_do_drained_begin(bs, true, NULL, true);
 }
 
 void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
@@ -XXX,XX +XXX,XX @@ void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
     int old_quiesce_counter;
 
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, false, recursive, parent);
+        bdrv_co_yield_to_drain(bs, false, recursive, parent, false);
         return;
     }
     assert(bs->quiesce_counter > 0);
@@ -XXX,XX +XXX,XX @@ void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
     int i;
 
     for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
-        bdrv_do_drained_begin(child->bs, true, child);
+        bdrv_do_drained_begin(child->bs, true, child, true);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
         aio_context_acquire(aio_context);
-        bdrv_do_drained_begin(bs, true, NULL);
+        bdrv_do_drained_begin(bs, true, NULL, true);
         aio_context_release(aio_context);
     }
 
--
2.13.6

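To make the control-flow change above concrete, here is a standalone sketch of the new shape (toy types with a single-child chain; the real code uses BDRV_POLL_WHILE() and bdrv_drain_poll() over bs->children): the recursion only quiesces, and one top-level loop re-evaluates the whole subtree on each iteration, so a graph change between iterations cannot invalidate an in-progress child-list walk.

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct Node {
        const char *name;
        int in_flight;          /* outstanding requests on this node */
        struct Node *child;
    } Node;

    /* Like bdrv_drain_poll(recursive=true): true while anything is
     * pending anywhere in the subtree; evaluated from scratch on every
     * iteration of the top-level loop */
    static bool drain_poll(Node *n)
    {
        for (; n; n = n->child) {
            if (n->in_flight) {
                return true;
            }
        }
        return false;
    }

    /* Stand-in for one blocking aio_poll(): completes one request */
    static void poll_once(Node *root)
    {
        for (Node *n = root; n; n = n->child) {
            if (n->in_flight) {
                n->in_flight--;
                printf("completed a request on %s\n", n->name);
                return;
            }
        }
    }

    int main(void)
    {
        Node leaf = { "leaf", 2, NULL };
        Node root = { "root", 1, &leaf };

        /* one poll loop for the whole subtree, after all quiescing */
        while (drain_poll(&root)) {
            poll_once(&root);
        }
        return 0;
    }
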
From: Max Reitz <mreitz@redhat.com>

In order to talk to the source BDS (and maybe in the future to the
target BDS as well) directly, we need to convert our existing AIO
requests into coroutine I/O requests.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20180613181823.13618-3-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/mirror.c | 152 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 90 insertions(+), 62 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ typedef struct MirrorOp {
     QEMUIOVector qiov;
     int64_t offset;
     uint64_t bytes;
+
+    /* The pointee is set by mirror_co_read(), mirror_co_zero(), and
+     * mirror_co_discard() before yielding for the first time */
+    int64_t *bytes_handled;
 } MirrorOp;
 
 typedef enum MirrorMethod {
@@ -XXX,XX +XXX,XX @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
     }
 }
 
-static void mirror_iteration_done(MirrorOp *op, int ret)
+static void coroutine_fn mirror_iteration_done(MirrorOp *op, int ret)
 {
     MirrorBlockJob *s = op->s;
     struct iovec *iov;
@@ -XXX,XX +XXX,XX @@ static void mirror_iteration_done(MirrorOp *op, int ret)
     }
 }
 
-static void mirror_write_complete(void *opaque, int ret)
+static void coroutine_fn mirror_write_complete(MirrorOp *op, int ret)
 {
-    MirrorOp *op = opaque;
     MirrorBlockJob *s = op->s;
 
     aio_context_acquire(blk_get_aio_context(s->common.blk));
@@ -XXX,XX +XXX,XX @@ static void mirror_write_complete(void *opaque, int ret)
     aio_context_release(blk_get_aio_context(s->common.blk));
 }
 
-static void mirror_read_complete(void *opaque, int ret)
+static void coroutine_fn mirror_read_complete(MirrorOp *op, int ret)
 {
-    MirrorOp *op = opaque;
     MirrorBlockJob *s = op->s;
 
     aio_context_acquire(blk_get_aio_context(s->common.blk));
@@ -XXX,XX +XXX,XX @@ static void mirror_read_complete(void *opaque, int ret)
 
         mirror_iteration_done(op, ret);
     } else {
-        blk_aio_pwritev(s->target, op->offset, &op->qiov,
-                        0, mirror_write_complete, op);
+        ret = blk_co_pwritev(s->target, op->offset,
+                             op->qiov.size, &op->qiov, 0);
+        mirror_write_complete(op, ret);
     }
     aio_context_release(blk_get_aio_context(s->common.blk));
 }
@@ -XXX,XX +XXX,XX @@ static inline void mirror_wait_for_io(MirrorBlockJob *s)
     s->waiting_for_io = false;
 }
 
-/* Submit async read while handling COW.
- * Returns: The number of bytes copied after and including offset,
- *          excluding any bytes copied prior to offset due to alignment.
- *          This will be @bytes if no alignment is necessary, or
- *          (new_end - offset) if tail is rounded up or down due to
- *          alignment or buffer limit.
+/* Perform a mirror copy operation.
+ *
+ * *op->bytes_handled is set to the number of bytes copied after and
+ * including offset, excluding any bytes copied prior to offset due
+ * to alignment. This will be op->bytes if no alignment is necessary,
+ * or (new_end - op->offset) if the tail is rounded up or down due to
+ * alignment or buffer limit.
  */
-static uint64_t mirror_do_read(MirrorBlockJob *s, int64_t offset,
-                               uint64_t bytes)
+static void coroutine_fn mirror_co_read(void *opaque)
 {
+    MirrorOp *op = opaque;
+    MirrorBlockJob *s = op->s;
     BlockBackend *source = s->common.blk;
     int nb_chunks;
     uint64_t ret;
-    MirrorOp *op;
     uint64_t max_bytes;
 
     max_bytes = s->granularity * s->max_iov;
 
     /* We can only handle as much as buf_size at a time. */
-    bytes = MIN(s->buf_size, MIN(max_bytes, bytes));
-    assert(bytes);
-    assert(bytes < BDRV_REQUEST_MAX_BYTES);
-    ret = bytes;
+    op->bytes = MIN(s->buf_size, MIN(max_bytes, op->bytes));
+    assert(op->bytes);
+    assert(op->bytes < BDRV_REQUEST_MAX_BYTES);
+    *op->bytes_handled = op->bytes;
 
     if (s->cow_bitmap) {
-        ret += mirror_cow_align(s, &offset, &bytes);
+        *op->bytes_handled += mirror_cow_align(s, &op->offset, &op->bytes);
     }
-    assert(bytes <= s->buf_size);
+    /* Cannot exceed BDRV_REQUEST_MAX_BYTES + INT_MAX */
+    assert(*op->bytes_handled <= UINT_MAX);
+    assert(op->bytes <= s->buf_size);
     /* The offset is granularity-aligned because:
      * 1) Caller passes in aligned values;
      * 2) mirror_cow_align is used only when target cluster is larger. */
-    assert(QEMU_IS_ALIGNED(offset, s->granularity));
+    assert(QEMU_IS_ALIGNED(op->offset, s->granularity));
     /* The range is sector-aligned, since bdrv_getlength() rounds up. */
-    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
-    nb_chunks = DIV_ROUND_UP(bytes, s->granularity);
+    assert(QEMU_IS_ALIGNED(op->bytes, BDRV_SECTOR_SIZE));
+    nb_chunks = DIV_ROUND_UP(op->bytes, s->granularity);
 
     while (s->buf_free_count < nb_chunks) {
-        trace_mirror_yield_in_flight(s, offset, s->in_flight);
+        trace_mirror_yield_in_flight(s, op->offset, s->in_flight);
         mirror_wait_for_io(s);
     }
 
-    /* Allocate a MirrorOp that is used as an AIO callback. */
-    op = g_new(MirrorOp, 1);
-    op->s = s;
-    op->offset = offset;
-    op->bytes = bytes;
-
     /* Now make a QEMUIOVector taking enough granularity-sized chunks
      * from s->buf_free.
      */
     qemu_iovec_init(&op->qiov, nb_chunks);
     while (nb_chunks-- > 0) {
         MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
-        size_t remaining = bytes - op->qiov.size;
+        size_t remaining = op->bytes - op->qiov.size;
 
         QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
         s->buf_free_count--;
@@ -XXX,XX +XXX,XX @@ static uint64_t mirror_do_read(MirrorBlockJob *s, int64_t offset,
 
     /* Copy the dirty cluster.  */
     s->in_flight++;
-    s->bytes_in_flight += bytes;
-    trace_mirror_one_iteration(s, offset, bytes);
+    s->bytes_in_flight += op->bytes;
+    trace_mirror_one_iteration(s, op->offset, op->bytes);
 
-    blk_aio_preadv(source, offset, &op->qiov, 0, mirror_read_complete, op);
-    return ret;
+    ret = blk_co_preadv(source, op->offset, op->bytes, &op->qiov, 0);
+    mirror_read_complete(op, ret);
 }
 
-static void mirror_do_zero_or_discard(MirrorBlockJob *s,
-                                      int64_t offset,
-                                      uint64_t bytes,
-                                      bool is_discard)
+static void coroutine_fn mirror_co_zero(void *opaque)
 {
-    MirrorOp *op;
+    MirrorOp *op = opaque;
+    int ret;
 
-    /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed
-     * so the freeing in mirror_iteration_done is nop. */
-    op = g_new0(MirrorOp, 1);
-    op->s = s;
-    op->offset = offset;
-    op->bytes = bytes;
+    op->s->in_flight++;
+    op->s->bytes_in_flight += op->bytes;
+    *op->bytes_handled = op->bytes;
 
-    s->in_flight++;
-    s->bytes_in_flight += bytes;
-    if (is_discard) {
-        blk_aio_pdiscard(s->target, offset,
-                         op->bytes, mirror_write_complete, op);
-    } else {
-        blk_aio_pwrite_zeroes(s->target, offset,
-                              op->bytes, s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
-                              mirror_write_complete, op);
-    }
+    ret = blk_co_pwrite_zeroes(op->s->target, op->offset, op->bytes,
+                               op->s->unmap ? BDRV_REQ_MAY_UNMAP : 0);
+    mirror_write_complete(op, ret);
+}
+
+static void coroutine_fn mirror_co_discard(void *opaque)
+{
+    MirrorOp *op = opaque;
+    int ret;
+
+    op->s->in_flight++;
+    op->s->bytes_in_flight += op->bytes;
+    *op->bytes_handled = op->bytes;
+
+    ret = blk_co_pdiscard(op->s->target, op->offset, op->bytes);
+    mirror_write_complete(op, ret);
 }
 
 static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset,
                                unsigned bytes, MirrorMethod mirror_method)
 {
+    MirrorOp *op;
+    Coroutine *co;
+    int64_t bytes_handled = -1;
+
+    op = g_new(MirrorOp, 1);
+    *op = (MirrorOp){
+        .s              = s,
+        .offset         = offset,
+        .bytes          = bytes,
+        .bytes_handled  = &bytes_handled,
+    };
+
     switch (mirror_method) {
     case MIRROR_METHOD_COPY:
-        return mirror_do_read(s, offset, bytes);
+        co = qemu_coroutine_create(mirror_co_read, op);
+        break;
     case MIRROR_METHOD_ZERO:
+        co = qemu_coroutine_create(mirror_co_zero, op);
+        break;
     case MIRROR_METHOD_DISCARD:
-        mirror_do_zero_or_discard(s, offset, bytes,
-                                  mirror_method == MIRROR_METHOD_DISCARD);
-        return bytes;
+        co = qemu_coroutine_create(mirror_co_discard, op);
+        break;
     default:
         abort();
     }
+
+    qemu_coroutine_enter(co);
+    /* At this point, ownership of op has been moved to the coroutine
+     * and the object may already be freed */
+
+    /* Assert that this value has been set */
+    assert(bytes_handled >= 0);
+
+    /* Same assertion as in mirror_co_read() (and for mirror_co_read()
+     * and mirror_co_discard(), bytes_handled == op->bytes, which
+     * is the @bytes parameter given to this function) */
+    assert(bytes_handled <= UINT_MAX);
+    return bytes_handled;
 }
 
 static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
--
2.13.6

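One detail of the conversion above deserves a callout: ownership of op moves to the coroutine at qemu_coroutine_enter(), while the out-parameter it reports through lives on the caller's stack. A condensed sketch of that contract, using the names from the patch (it only makes sense inside the QEMU tree, not standalone):

    static unsigned submit_one_copy(MirrorBlockJob *s, int64_t offset,
                                    unsigned bytes)
    {
        int64_t bytes_handled = -1;
        MirrorOp *op = g_new(MirrorOp, 1);

        *op = (MirrorOp){
            .s             = s,
            .offset        = offset,
            .bytes         = bytes,
            .bytes_handled = &bytes_handled,  /* points into this frame */
        };

        /* From here on the coroutine owns op and may already have freed
         * it; bytes_handled is still valid because mirror_co_read()
         * stores through op->bytes_handled before its first yield. */
        qemu_coroutine_enter(qemu_coroutine_create(mirror_co_read, op));

        assert(bytes_handled >= 0);
        return bytes_handled;
    }
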
Since we use bdrv_do_drained_begin/end() for bdrv_drain_all_begin/end(),
coroutine context is automatically left with a BH, preventing the
deadlocks that made bdrv_drain_all*() unsafe in coroutine context. Now
that the old polling code has been removed as dead code, it is obvious
that the two are compatible.

Enable the coroutine test cases for bdrv_drain_all().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/test-bdrv-drain.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_subtree(void)
     test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
 }
 
+static void test_drv_cb_co_drain_all(void)
+{
+    call_in_coroutine(test_drv_cb_drain_all);
+}
+
 static void test_drv_cb_co_drain(void)
 {
     call_in_coroutine(test_drv_cb_drain);
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain_subtree(void)
     test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
 }
 
+static void test_quiesce_co_drain_all(void)
+{
+    call_in_coroutine(test_quiesce_drain_all);
+}
+
 static void test_quiesce_co_drain(void)
 {
     call_in_coroutine(test_quiesce_drain);
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
                     test_drv_cb_drain_subtree);
 
-    // XXX bdrv_drain_all() doesn't work in coroutine context
+    g_test_add_func("/bdrv-drain/driver-cb/co/drain_all",
+                    test_drv_cb_co_drain_all);
     g_test_add_func("/bdrv-drain/driver-cb/co/drain", test_drv_cb_co_drain);
     g_test_add_func("/bdrv-drain/driver-cb/co/drain_subtree",
                     test_drv_cb_co_drain_subtree);
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
                     test_quiesce_drain_subtree);
 
-    // XXX bdrv_drain_all() doesn't work in coroutine context
+    g_test_add_func("/bdrv-drain/quiesce/co/drain_all",
+                    test_quiesce_co_drain_all);
     g_test_add_func("/bdrv-drain/quiesce/co/drain", test_quiesce_co_drain);
     g_test_add_func("/bdrv-drain/quiesce/co/drain_subtree",
                     test_quiesce_co_drain_subtree);
--
2.13.6

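Note: the co/drain_all test wrappers above rely on the call_in_coroutine()
helper that test-bdrv-drain.c already provides. A minimal sketch of that
pattern (simplified; see the test file for the exact version):

    static void call_in_coroutine(void (*entry)(void))
    {
        Coroutine *co;
        CallInCoroutineData data = { .entry = entry, .done = false };

        co = qemu_coroutine_create(call_in_coroutine_entry, &data);
        qemu_coroutine_enter(co);
        /* Poll the main AioContext until the coroutine has finished */
        while (!data.done) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

The drain_all variants can be registered now because bdrv_drain_all_begin()
no longer polls directly when called from coroutine context; it defers the
drain to a BH in the main loop first.
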
If bdrv_do_drained_begin() polls during its subtree recursion, the graph
can change and mess up the bs->children iteration. Test that this
doesn't happen.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 38 +++++++++++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn test_co_delete_by_drain(void *opaque)
  * If @detach_instead_of_delete is set, the BDS is not going to be
  * deleted but will only detach all of its children.
  */
-static void do_test_delete_by_drain(bool detach_instead_of_delete)
+static void do_test_delete_by_drain(bool detach_instead_of_delete,
+                                    enum drain_type drain_type)
 {
     BlockBackend *blk;
     BlockDriverState *bs, *child_bs, *null_bs;
@@ -XXX,XX +XXX,XX @@ static void do_test_delete_by_drain(bool detach_instead_of_delete)
      * test_co_delete_by_drain() resuming. Thus, @bs will be deleted
      * and the coroutine will exit while this drain operation is still
      * in progress. */
-    bdrv_ref(child_bs);
-    bdrv_drain(child_bs);
-    bdrv_unref(child_bs);
+    switch (drain_type) {
+    case BDRV_DRAIN:
+        bdrv_ref(child_bs);
+        bdrv_drain(child_bs);
+        bdrv_unref(child_bs);
+        break;
+    case BDRV_SUBTREE_DRAIN:
+        /* Would have to ref/unref bs here for !detach_instead_of_delete, but
+         * then the whole test becomes pointless because the graph changes
+         * don't occur during the drain any more. */
+        assert(detach_instead_of_delete);
+        bdrv_subtree_drained_begin(bs);
+        bdrv_subtree_drained_end(bs);
+        break;
+    default:
+        g_assert_not_reached();
+    }
 
     while (!dbdd.done) {
         aio_poll(qemu_get_aio_context(), true);
@@ -XXX,XX +XXX,XX @@ static void do_test_delete_by_drain(bool detach_instead_of_delete)
     }
 }
 
-
 static void test_delete_by_drain(void)
 {
-    do_test_delete_by_drain(false);
+    do_test_delete_by_drain(false, BDRV_DRAIN);
 }
 
 static void test_detach_by_drain(void)
 {
-    do_test_delete_by_drain(true);
+    do_test_delete_by_drain(true, BDRV_DRAIN);
+}
+
+static void test_detach_by_drain_subtree(void)
+{
+    do_test_delete_by_drain(true, BDRV_SUBTREE_DRAIN);
 }
 
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/blockjob/drain_subtree",
                     test_blockjob_drain_subtree);
 
-    g_test_add_func("/bdrv-drain/deletion", test_delete_by_drain);
-    g_test_add_func("/bdrv-drain/detach", test_detach_by_drain);
+    g_test_add_func("/bdrv-drain/deletion/drain", test_delete_by_drain);
+    g_test_add_func("/bdrv-drain/detach/drain", test_detach_by_drain);
+    g_test_add_func("/bdrv-drain/detach/drain_subtree", test_detach_by_drain_subtree);
 
     ret = g_test_run();
     qemu_event_destroy(&done_event);
-- 
2.13.6

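Note: the BDRV_SUBTREE_DRAIN case above boils down to the usual bracket
that callers use to quiesce a whole subtree while the graph changes
underneath; roughly (names as in the patch, error handling omitted):

    bdrv_subtree_drained_begin(bs);
    /* Graph changes below bs are safe here: all children are quiesced
     * and new requests are held back until ..._end() */
    bdrv_subtree_drained_end(bs);

The test triggers the node deletion from within the drain, which is
exactly the situation where a polling subtree recursion could otherwise
iterate over a stale bs->children list.
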
bdrv_drain_all_*() used bdrv_next() to iterate over all root nodes and
did a subtree drain for each of them. This works fine as long as the
graph is static, but sadly, reality looks different.

If the graph changes so that root nodes are added or removed, we would
have to compensate for this. bdrv_next() returns each root node only
once even if it's the root node for multiple BlockBackends or for a
monitor-owned block driver tree, which would only complicate things.

The much easier and more obviously correct way is to fundamentally
change the way the functions work: Iterate over all BlockDriverStates,
no matter who owns them, and drain them individually. Compensation is
only necessary when a new BDS is created inside a drain_all section.
Removal of a BDS doesn't require any action because it's gone afterwards
anyway.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block.h     |  1 +
 include/block/block_int.h |  1 +
 block.c                   | 34 ++++++++++++++++++++++++---
 block/io.c                | 60 ++++++++++++++++++++++++++++++++++++-----------
 4 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ BlockDriverState *bdrv_lookup_bs(const char *device,
                                  Error **errp);
 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base);
 BlockDriverState *bdrv_next_node(BlockDriverState *bs);
+BlockDriverState *bdrv_next_all_states(BlockDriverState *bs);
 
 typedef struct BdrvNextIterator {
     enum {
diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
     BdrvRequestFlags flags);
 
+extern unsigned int bdrv_drain_all_count;
 void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
 void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
 
 int get_tmp_filename(char *filename, int size);
 BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                             const char *filename);
diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ BlockDriverState *bdrv_new(void)
 
     qemu_co_queue_init(&bs->flush_queue);
 
+    for (i = 0; i < bdrv_drain_all_count; i++) {
+        bdrv_drained_begin(bs);
+    }
+
     QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
 
     return bs;
@@ -XXX,XX +XXX,XX @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
                             int open_flags, Error **errp)
 {
     Error *local_err = NULL;
-    int ret;
+    int i, ret;
 
     bdrv_assign_node_name(bs, node_name, &local_err);
     if (local_err) {
@@ -XXX,XX +XXX,XX @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
     assert(bdrv_min_mem_align(bs) != 0);
     assert(is_power_of_2(bs->bl.request_alignment));
 
+    for (i = 0; i < bs->quiesce_counter; i++) {
+        if (drv->bdrv_co_drain_begin) {
+            drv->bdrv_co_drain_begin(bs);
+        }
+    }
+
     return 0;
 open_failed:
     bs->drv = NULL;
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
             child->role->detach(child);
         }
         if (old_bs->quiesce_counter && child->role->drained_end) {
-            for (i = 0; i < old_bs->quiesce_counter; i++) {
+            int num = old_bs->quiesce_counter;
+            if (child->role->parent_is_bds) {
+                num -= bdrv_drain_all_count;
+            }
+            assert(num >= 0);
+            for (i = 0; i < num; i++) {
                 child->role->drained_end(child);
             }
         }
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
     if (new_bs) {
         QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
         if (new_bs->quiesce_counter && child->role->drained_begin) {
-            for (i = 0; i < new_bs->quiesce_counter; i++) {
+            int num = new_bs->quiesce_counter;
+            if (child->role->parent_is_bds) {
+                num -= bdrv_drain_all_count;
+            }
+            assert(num >= 0);
+            for (i = 0; i < num; i++) {
                 child->role->drained_begin(child);
             }
         }
@@ -XXX,XX +XXX,XX @@ BlockDriverState *bdrv_next_node(BlockDriverState *bs)
     return QTAILQ_NEXT(bs, node_list);
 }
 
+BlockDriverState *bdrv_next_all_states(BlockDriverState *bs)
+{
+    if (!bs) {
+        return QTAILQ_FIRST(&all_bdrv_states);
+    }
+    return QTAILQ_NEXT(bs, bs_list);
+}
+
 const char *bdrv_get_node_name(const BlockDriverState *bs)
 {
     return bs->node_name;
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@
 /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
 #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
 
+static AioWait drain_all_aio_wait;
+
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
     int64_t offset, int bytes, BdrvRequestFlags flags);
 
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_assert_idle(BlockDriverState *bs)
     }
 }
 
+unsigned int bdrv_drain_all_count = 0;
+
+static bool bdrv_drain_all_poll(void)
+{
+    BlockDriverState *bs = NULL;
+    bool result = false;
+
+    /* Execute pending BHs first (may modify the graph) and check everything
+     * else only after the BHs have executed. */
+    while (aio_poll(qemu_get_aio_context(), false));
+
+    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
+     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
+    while ((bs = bdrv_next_all_states(bs))) {
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+        aio_context_acquire(aio_context);
+        result |= bdrv_drain_poll(bs, false, NULL, true);
+        aio_context_release(aio_context);
+    }
+
+    return result;
+}
+
 /*
  * Wait for pending requests to complete across all BlockDriverStates
  *
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_assert_idle(BlockDriverState *bs)
  */
 void bdrv_drain_all_begin(void)
 {
-    BlockDriverState *bs;
-    BdrvNextIterator it;
+    BlockDriverState *bs = NULL;
 
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(NULL, true, false, NULL, false, true);
+        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true);
         return;
     }
 
-    /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
-     * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
-     * nodes in several different AioContexts, so make sure we're in the main
-     * context. */
+    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
+     * loop AioContext, so make sure we're in the main context. */
     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+    assert(bdrv_drain_all_count < INT_MAX);
+    bdrv_drain_all_count++;
 
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+    /* Quiesce all nodes, without polling in-flight requests yet. The graph
+     * cannot change during this loop. */
+    while ((bs = bdrv_next_all_states(bs))) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
         aio_context_acquire(aio_context);
-        bdrv_do_drained_begin(bs, true, NULL, false, true);
+        bdrv_do_drained_begin(bs, false, NULL, true, false);
         aio_context_release(aio_context);
     }
 
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+    /* Now poll the in-flight requests */
+    AIO_WAIT_WHILE(&drain_all_aio_wait, NULL, bdrv_drain_all_poll());
+
+    while ((bs = bdrv_next_all_states(bs))) {
         bdrv_drain_assert_idle(bs);
     }
 }
 
 void bdrv_drain_all_end(void)
 {
-    BlockDriverState *bs;
-    BdrvNextIterator it;
+    BlockDriverState *bs = NULL;
 
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+    while ((bs = bdrv_next_all_states(bs))) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
         aio_context_acquire(aio_context);
-        bdrv_do_drained_end(bs, true, NULL, false);
+        bdrv_do_drained_end(bs, false, NULL, true);
         aio_context_release(aio_context);
     }
+
+    assert(bdrv_drain_all_count > 0);
+    bdrv_drain_all_count--;
 }
 
 void bdrv_drain_all(void)
@@ -XXX,XX +XXX,XX @@ void bdrv_inc_in_flight(BlockDriverState *bs)
 void bdrv_wakeup(BlockDriverState *bs)
 {
     aio_wait_kick(bdrv_get_aio_wait(bs));
+    aio_wait_kick(&drain_all_aio_wait);
 }
 
 void bdrv_dec_in_flight(BlockDriverState *bs)
-- 
2.13.6

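Note: the new iterator is deliberately trivial; a sketch of how the
drain_all loops above use it (names as in the patch):

    BlockDriverState *bs = NULL;

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
        /* per-node work, e.g. bdrv_do_drained_begin(bs, ...) */
        aio_context_release(ctx);
    }

Because it walks all_bdrv_states rather than only root nodes, every node
is visited exactly once, no matter how many BlockBackends reference it.
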
This tests both adding and removing a node between bdrv_drain_all_begin()
and bdrv_drain_all_end(), and enables the existing detach test for
drain_all.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 73 insertions(+), 2 deletions(-)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_multiparent(void)
     blk_unref(blk_b);
 }
 
-static void test_graph_change(void)
+static void test_graph_change_drain_subtree(void)
 {
     BlockBackend *blk_a, *blk_b;
     BlockDriverState *bs_a, *bs_b, *backing;
@@ -XXX,XX +XXX,XX @@ static void test_graph_change(void)
     blk_unref(blk_b);
 }
 
+static void test_graph_change_drain_all(void)
+{
+    BlockBackend *blk_a, *blk_b;
+    BlockDriverState *bs_a, *bs_b;
+    BDRVTestState *a_s, *b_s;
+
+    /* Create node A with a BlockBackend */
+    blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
+                                &error_abort);
+    a_s = bs_a->opaque;
+    blk_insert_bs(blk_a, bs_a, &error_abort);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
+    g_assert_cmpint(a_s->drain_count, ==, 0);
+
+    /* Call bdrv_drain_all_begin() */
+    bdrv_drain_all_begin();
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
+    g_assert_cmpint(a_s->drain_count, ==, 1);
+
+    /* Create node B with a BlockBackend */
+    blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
+                                &error_abort);
+    b_s = bs_b->opaque;
+    blk_insert_bs(blk_b, bs_b, &error_abort);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
+    g_assert_cmpint(a_s->drain_count, ==, 1);
+    g_assert_cmpint(b_s->drain_count, ==, 1);
+
+    /* Unref and finally delete node A */
+    blk_unref(blk_a);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
+    g_assert_cmpint(a_s->drain_count, ==, 1);
+    g_assert_cmpint(b_s->drain_count, ==, 1);
+
+    bdrv_unref(bs_a);
+
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
+    g_assert_cmpint(b_s->drain_count, ==, 1);
+
+    /* End the drained section */
+    bdrv_drain_all_end();
+
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
+    g_assert_cmpint(b_s->drain_count, ==, 0);
+
+    bdrv_unref(bs_b);
+    blk_unref(blk_b);
+}
+
 struct test_iothread_data {
     BlockDriverState *bs;
     enum drain_type drain_type;
@@ -XXX,XX +XXX,XX @@ static void do_test_delete_by_drain(bool detach_instead_of_delete,
         bdrv_subtree_drained_begin(bs);
         bdrv_subtree_drained_end(bs);
         break;
+    case BDRV_DRAIN_ALL:
+        bdrv_drain_all_begin();
+        bdrv_drain_all_end();
+        break;
     default:
         g_assert_not_reached();
     }
@@ -XXX,XX +XXX,XX @@ static void test_delete_by_drain(void)
     do_test_delete_by_drain(false, BDRV_DRAIN);
 }
 
+static void test_detach_by_drain_all(void)
+{
+    do_test_delete_by_drain(true, BDRV_DRAIN_ALL);
+}
+
 static void test_detach_by_drain(void)
 {
     do_test_delete_by_drain(true, BDRV_DRAIN);
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
 
     g_test_add_func("/bdrv-drain/nested", test_nested);
     g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
-    g_test_add_func("/bdrv-drain/graph-change", test_graph_change);
+
+    g_test_add_func("/bdrv-drain/graph-change/drain_subtree",
+                    test_graph_change_drain_subtree);
+    g_test_add_func("/bdrv-drain/graph-change/drain_all",
+                    test_graph_change_drain_all);
 
     g_test_add_func("/bdrv-drain/iothread/drain_all", test_iothread_drain_all);
     g_test_add_func("/bdrv-drain/iothread/drain", test_iothread_drain);
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
                     test_blockjob_drain_subtree);
 
     g_test_add_func("/bdrv-drain/deletion/drain", test_delete_by_drain);
+    g_test_add_func("/bdrv-drain/detach/drain_all", test_detach_by_drain_all);
     g_test_add_func("/bdrv-drain/detach/drain", test_detach_by_drain);
     g_test_add_func("/bdrv-drain/detach/drain_subtree", test_detach_by_drain_subtree);
     g_test_add_func("/bdrv-drain/detach/parent_cb", test_detach_by_parent_cb);
-- 
2.13.6

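Note: the interesting property exercised here is that a node created
inside a drain_all section starts out drained, because bdrv_new() now
raises its quiesce counter bdrv_drain_all_count times. In caller terms
(a sketch following the test above):

    bdrv_drain_all_begin();
    bs = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
                              &error_abort);
    /* bs->quiesce_counter is already 1 here */
    bdrv_drain_all_end();
    /* bs->quiesce_counter drops back to 0 */
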
From: Max Reitz <mreitz@redhat.com>

When converting mirror's I/O to coroutines, we are going to need a point
where these coroutines are created. mirror_perform() is going to be
that point.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 20180613181823.13618-2-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/mirror.c | 51 +++++++++++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ typedef struct MirrorOp {
     uint64_t bytes;
 } MirrorOp;
 
+typedef enum MirrorMethod {
+    MIRROR_METHOD_COPY,
+    MIRROR_METHOD_ZERO,
+    MIRROR_METHOD_DISCARD,
+} MirrorMethod;
+
 static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                             int error)
 {
@@ -XXX,XX +XXX,XX @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s,
     }
 }
 
+static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset,
+                               unsigned bytes, MirrorMethod mirror_method)
+{
+    switch (mirror_method) {
+    case MIRROR_METHOD_COPY:
+        return mirror_do_read(s, offset, bytes);
+    case MIRROR_METHOD_ZERO:
+    case MIRROR_METHOD_DISCARD:
+        mirror_do_zero_or_discard(s, offset, bytes,
+                                  mirror_method == MIRROR_METHOD_DISCARD);
+        return bytes;
+    default:
+        abort();
+    }
+}
+
 static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
     BlockDriverState *source = s->source;
@@ -XXX,XX +XXX,XX @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
     int ret;
     int64_t io_bytes;
     int64_t io_bytes_acct;
-    enum MirrorMethod {
-        MIRROR_METHOD_COPY,
-        MIRROR_METHOD_ZERO,
-        MIRROR_METHOD_DISCARD
-    } mirror_method = MIRROR_METHOD_COPY;
+    MirrorMethod mirror_method = MIRROR_METHOD_COPY;
 
     assert(!(offset % s->granularity));
     ret = bdrv_block_status_above(source, NULL, offset,
@@ -XXX,XX +XXX,XX @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
     }
 
     io_bytes = mirror_clip_bytes(s, offset, io_bytes);
-    switch (mirror_method) {
-    case MIRROR_METHOD_COPY:
-        io_bytes = io_bytes_acct = mirror_do_read(s, offset, io_bytes);
-        break;
-    case MIRROR_METHOD_ZERO:
-    case MIRROR_METHOD_DISCARD:
-        mirror_do_zero_or_discard(s, offset, io_bytes,
-                                  mirror_method == MIRROR_METHOD_DISCARD);
-        if (write_zeroes_ok) {
-            io_bytes_acct = 0;
-        } else {
-            io_bytes_acct = io_bytes;
-        }
-        break;
-    default:
-        abort();
+    io_bytes = mirror_perform(s, offset, io_bytes, mirror_method);
+    if (mirror_method != MIRROR_METHOD_COPY && write_zeroes_ok) {
+        io_bytes_acct = 0;
+    } else {
+        io_bytes_acct = io_bytes;
     }
     assert(io_bytes);
     offset += io_bytes;
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
             continue;
         }
 
-        mirror_do_zero_or_discard(s, offset, bytes, false);
+        mirror_perform(s, offset, bytes, MIRROR_METHOD_ZERO);
         offset += bytes;
     }
 
-- 
2.13.6

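Note: after this patch there is a single dispatch point for all three
operation types. A caller just does (sketch; io_bytes, offset and
write_zeroes_ok as in mirror_iteration() above):

    io_bytes = mirror_perform(s, offset, io_bytes, mirror_method);
    if (mirror_method != MIRROR_METHOD_COPY && write_zeroes_ok) {
        io_bytes_acct = 0;   /* zeroes that needed no actual write */
    } else {
        io_bytes_acct = io_bytes;
    }

which also makes it easy for a later patch in this series to launch each
method as its own coroutine behind mirror_perform().
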
From: Max Reitz <mreitz@redhat.com>

This patch makes the mirror code differentiate between simply waiting
for any operation to complete (mirror_wait_for_free_in_flight_slot())
and specifically waiting for all operations touching a certain range of
the virtual disk to complete (mirror_wait_on_conflicts()).

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20180613181823.13618-5-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/mirror.c | 102 +++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 84 insertions(+), 18 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "qemu/cutils.h"
 #include "qemu/coroutine.h"
+#include "qemu/range.h"
 #include "trace.h"
 #include "block/blockjob_int.h"
 #include "block/block_int.h"
@@ -XXX,XX +XXX,XX @@ struct MirrorOp {
      * mirror_co_discard() before yielding for the first time */
     int64_t *bytes_handled;
 
+    bool is_pseudo_op;
     CoQueue waiting_requests;
 
     QTAILQ_ENTRY(MirrorOp) next;
@@ -XXX,XX +XXX,XX @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
     }
 }
 
+static void coroutine_fn mirror_wait_on_conflicts(MirrorOp *self,
+                                                  MirrorBlockJob *s,
+                                                  uint64_t offset,
+                                                  uint64_t bytes)
+{
+    uint64_t self_start_chunk = offset / s->granularity;
+    uint64_t self_end_chunk = DIV_ROUND_UP(offset + bytes, s->granularity);
+    uint64_t self_nb_chunks = self_end_chunk - self_start_chunk;
+
+    while (find_next_bit(s->in_flight_bitmap, self_end_chunk,
+                         self_start_chunk) < self_end_chunk &&
+           s->ret >= 0)
+    {
+        MirrorOp *op;
+
+        QTAILQ_FOREACH(op, &s->ops_in_flight, next) {
+            uint64_t op_start_chunk = op->offset / s->granularity;
+            uint64_t op_nb_chunks = DIV_ROUND_UP(op->offset + op->bytes,
+                                                 s->granularity) -
+                                    op_start_chunk;
+
+            if (op == self) {
+                continue;
+            }
+
+            if (ranges_overlap(self_start_chunk, self_nb_chunks,
+                               op_start_chunk, op_nb_chunks))
+            {
+                qemu_co_queue_wait(&op->waiting_requests, NULL);
+                break;
+            }
+        }
+    }
+}
+
 static void coroutine_fn mirror_iteration_done(MirrorOp *op, int ret)
 {
     MirrorBlockJob *s = op->s;
@@ -XXX,XX +XXX,XX @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset,
     return ret;
 }
 
-static inline void mirror_wait_for_io(MirrorBlockJob *s)
+static inline void mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s)
 {
     MirrorOp *op;
 
-    op = QTAILQ_FIRST(&s->ops_in_flight);
-    assert(op);
-    qemu_co_queue_wait(&op->waiting_requests, NULL);
+    QTAILQ_FOREACH(op, &s->ops_in_flight, next) {
+        /* Do not wait on pseudo ops, because it may in turn wait on
+         * some other operation to start, which may in fact be the
+         * caller of this function. Since there is only one pseudo op
+         * at any given time, we will always find some real operation
+         * to wait on. */
+        if (!op->is_pseudo_op) {
+            qemu_co_queue_wait(&op->waiting_requests, NULL);
+            return;
+        }
+    }
+    abort();
 }
 
 /* Perform a mirror copy operation.
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_co_read(void *opaque)
 
     while (s->buf_free_count < nb_chunks) {
         trace_mirror_yield_in_flight(s, op->offset, s->in_flight);
-        mirror_wait_for_io(s);
+        mirror_wait_for_free_in_flight_slot(s);
     }
 
     /* Now make a QEMUIOVector taking enough granularity-sized chunks
@@ -XXX,XX +XXX,XX @@ static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset,
 static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
     BlockDriverState *source = s->source;
-    int64_t offset, first_chunk;
-    uint64_t delay_ns = 0;
+    MirrorOp *pseudo_op;
+    int64_t offset;
+    uint64_t delay_ns = 0, ret = 0;
     /* At least the first dirty chunk is mirrored in one iteration. */
     int nb_chunks = 1;
     bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target));
@@ -XXX,XX +XXX,XX @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
     }
     bdrv_dirty_bitmap_unlock(s->dirty_bitmap);
 
-    first_chunk = offset / s->granularity;
-    while (test_bit(first_chunk, s->in_flight_bitmap)) {
-        trace_mirror_yield_in_flight(s, offset, s->in_flight);
-        mirror_wait_for_io(s);
-    }
+    mirror_wait_on_conflicts(NULL, s, offset, 1);
 
     job_pause_point(&s->common.job);
 
@@ -XXX,XX +XXX,XX @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
                                    nb_chunks * s->granularity);
     bdrv_dirty_bitmap_unlock(s->dirty_bitmap);
 
+    /* Before claiming an area in the in-flight bitmap, we have to
+     * create a MirrorOp for it so that conflicting requests can wait
+     * for it. mirror_perform() will create the real MirrorOps later,
+     * for now we just create a pseudo operation that will wake up all
+     * conflicting requests once all real operations have been
+     * launched. */
+    pseudo_op = g_new(MirrorOp, 1);
+    *pseudo_op = (MirrorOp){
+        .offset         = offset,
+        .bytes          = nb_chunks * s->granularity,
+        .is_pseudo_op   = true,
+    };
+    qemu_co_queue_init(&pseudo_op->waiting_requests);
+    QTAILQ_INSERT_TAIL(&s->ops_in_flight, pseudo_op, next);
+
     bitmap_set(s->in_flight_bitmap, offset / s->granularity, nb_chunks);
     while (nb_chunks > 0 && offset < s->bdev_length) {
         int ret;
@@ -XXX,XX +XXX,XX @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
 
         while (s->in_flight >= MAX_IN_FLIGHT) {
             trace_mirror_yield_in_flight(s, offset, s->in_flight);
-            mirror_wait_for_io(s);
+            mirror_wait_for_free_in_flight_slot(s);
         }
 
         if (s->ret < 0) {
-            return 0;
+            ret = 0;
+            goto fail;
         }
 
         io_bytes = mirror_clip_bytes(s, offset, io_bytes);
@@ -XXX,XX +XXX,XX @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
         nb_chunks -= DIV_ROUND_UP(io_bytes, s->granularity);
         delay_ns = block_job_ratelimit_get_delay(&s->common, io_bytes_acct);
     }
-    return delay_ns;
+
+    ret = delay_ns;
+fail:
+    QTAILQ_REMOVE(&s->ops_in_flight, pseudo_op, next);
+    qemu_co_queue_restart_all(&pseudo_op->waiting_requests);
+    g_free(pseudo_op);
+
+    return ret;
 }
 
 static void mirror_free_init(MirrorBlockJob *s)
@@ -XXX,XX +XXX,XX @@ static void mirror_free_init(MirrorBlockJob *s)
 static void mirror_wait_for_all_io(MirrorBlockJob *s)
 {
     while (s->in_flight > 0) {
-        mirror_wait_for_io(s);
+        mirror_wait_for_free_in_flight_slot(s);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
         if (s->in_flight >= MAX_IN_FLIGHT) {
             trace_mirror_yield(s, UINT64_MAX, s->buf_free_count,
                                s->in_flight);
-            mirror_wait_for_io(s);
+            mirror_wait_for_free_in_flight_slot(s);
             continue;
         }
 
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_run(void *opaque)
         if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
             (cnt == 0 && s->in_flight > 0)) {
             trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight);
-            mirror_wait_for_io(s);
+            mirror_wait_for_free_in_flight_slot(s);
             continue;
         } else if (cnt != 0) {
             delay_ns = mirror_iteration(s);
-- 
2.13.6

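Note: the conflict check is chunk-based. A request claims the
granularity-aligned chunk range it touches, and two operations conflict
exactly when those ranges overlap. In essence (names as in the patch):

    uint64_t start_chunk = offset / s->granularity;
    uint64_t end_chunk   = DIV_ROUND_UP(offset + bytes, s->granularity);
    uint64_t nb_chunks   = end_chunk - start_chunk;

    /* conflict iff ranges_overlap(start_chunk, nb_chunks,
     *                             op_start_chunk, op_nb_chunks) */

Waiting on op->waiting_requests then blocks the caller until the
conflicting operation completes, or, for a pseudo op, until all real
operations for that range have been launched.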