The following changes since commit ad1b4ec39caa5b3f17cbd8160283a03a3dcfe2ae:

  Merge remote-tracking branch 'remotes/kraxel/tags/input-20180515-pull-request' into staging (2018-05-15 12:50:06 +0100)

are available in the git repository at:

  git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 1fce860ea5eba1ca00a67911fc0b8a5d80009514:

  Merge remote-tracking branch 'mreitz/tags/pull-block-2018-05-15' into queue-block (2018-05-15 16:19:53 +0200)

----------------------------------------------------------------
Block layer patches:

- Switch AIO/callback based block drivers to a byte-based interface
- Block jobs: Expose error string via query-block-jobs
- Block job cleanups and fixes
- hmp: Allow using a qdev id in block_set_io_throttle
- Copy-on-read block driver
- The qcow2 default refcount cache size has been decreased
- Various bug fixes
----------------------------------------------------------------
Alberto Garcia (5):
      hmp: Allow using a qdev id in block_set_io_throttle
      Fix error message about compressed clusters with OFLAG_COPIED
      specs/qcow2: Clarify that compressed clusters have the COPIED bit reset
      qcow2: Give the refcount cache the minimum possible size by default
      docs: Document the new default sizes of the qcow2 caches

Daniel Henrique Barboza (1):
      block-backend: simplify blk_get_aio_context

Eric Blake (7):
      block: Support byte-based aio callbacks
      file-win32: Switch to byte-based callbacks
      null: Switch to byte-based read/write
      rbd: Switch to byte-based callbacks
      vxhs: Switch to byte-based callbacks
      block: Drop last of the sector-based aio callbacks
      block: Merge .bdrv_co_writev{,_flags} in drivers

John Snow (1):
      blockjob: expose error string via query

Kevin Wolf (7):
      blockjob: Fix assertion in block_job_finalize()
      blockjob: Wrappers for progress counter access
      blockjob: Move RateLimit to BlockJob
      blockjob: Implement block_job_set_speed() centrally
      blockjob: Introduce block_job_ratelimit_get_delay()
      blockjob: Add block_job_driver()
      Merge remote-tracking branch 'mreitz/tags/pull-block-2018-05-15' into queue-block

Max Reitz (17):
      iotests: Split 214 off of 122
      iotests: Add failure matching to common.qemu
      iotests: Skip 181 and 201 without userfaultfd
      block: Add COR filter driver
      block: BLK_PERM_WRITE includes ..._UNCHANGED
      block: Add BDRV_REQ_WRITE_UNCHANGED flag
      block: Set BDRV_REQ_WRITE_UNCHANGED for COR writes
      block/quorum: Support BDRV_REQ_WRITE_UNCHANGED
      block: Support BDRV_REQ_WRITE_UNCHANGED in filters
      iotests: Clean up wrap image in 197
      iotests: Copy 197 for COR filter driver
      iotests: Add test for COR across nodes
      qemu-img: Check post-truncation size
      block: Document BDRV_REQ_WRITE_UNCHANGED support
      qemu-io: Use purely string blockdev options
      qemu-img: Use only string options in img_open_opts
      iotests: Add test for -U/force-share conflicts

 qapi/block-core.json             |  11 ++-
 docs/interop/qcow2.txt           |   8 +-
 docs/qcow2-cache.txt             |  33 ++++----
 block/qcow2.h                    |   4 -
 include/block/block.h            |   9 ++-
 include/block/block_int.h        |  28 +++++--
 include/block/blockjob.h         |  32 ++++++++
 include/block/blockjob_int.h     |  11 ++-
 include/block/raw-aio.h          |   2 +-
 block/backup.c                   |  62 ++++++---------
 block/blkdebug.c                 |   9 ++-
 block/blkreplay.c                |   3 +
 block/blkverify.c                |   3 +
 block/block-backend.c            |   8 +-
 block/commit.c                   |  35 +++------
 block/copy-on-read.c             | 173 +++++++++++++++++++++++++++++++++++++++++
 block/file-win32.c               |  47 ++++++-----
 block/gluster.c                  |   4 +-
 block/io.c                       |  75 ++++++++++--------
 block/iscsi.c                    |   8 +-
 block/mirror.c                   |  44 ++++-------
 block/null.c                     |  45 +++++------
 block/parallels.c                |   4 +-
 block/qcow.c                     |   6 +-
 block/qcow2-refcount.c           |   4 +-
 block/qcow2.c                    |  31 +++++---
 block/qed.c                      |   3 +-
 block/quorum.c                   |  19 +++--
 block/raw-format.c               |   9 ++-
 block/rbd.c                      |  40 +++++-----
 block/replication.c              |   4 +-
 block/sheepdog.c                 |   4 +-
 block/ssh.c                      |   4 +-
 block/stream.c                   |  33 +++-----
 block/throttle.c                 |   6 +-
 block/vhdx.c                     |   4 +-
 block/vxhs.c                     |  43 +++++-----
 block/win32-aio.c                |   5 +-
 blockjob.c                       |  40 +++++++---
 hmp.c                            |  14 +++-
 qemu-img.c                       |  43 ++++++++--
 qemu-io.c                        |   4 +-
 block/Makefile.objs              |   2 +-
 hmp-commands.hx                  |   3 +-
 tests/qemu-iotests/122           |  47 -----------
 tests/qemu-iotests/122.out       |  33 --------
 tests/qemu-iotests/137.out       |   2 +-
 tests/qemu-iotests/153           |  17 ++++
 tests/qemu-iotests/153.out       |  16 ++++
 tests/qemu-iotests/181           |  13 ++++
 tests/qemu-iotests/197           |   1 +
 tests/qemu-iotests/201           |  13 ++++
 tests/qemu-iotests/214           |  97 +++++++++++++++++++++++
 tests/qemu-iotests/214.out       |  35 +++++++++
 tests/qemu-iotests/215           | 120 ++++++++++++++++++++++++++++
 tests/qemu-iotests/215.out       |  26 +++++++
 tests/qemu-iotests/216           | 115 +++++++++++++++++++++++++++
 tests/qemu-iotests/216.out       |  28 +++++++
 tests/qemu-iotests/common.qemu   |  58 ++++++++++++--
 tests/qemu-iotests/group         |   3 +
 60 files changed, 1174 insertions(+), 429 deletions(-)
 create mode 100644 block/copy-on-read.c
 create mode 100755 tests/qemu-iotests/214
 create mode 100644 tests/qemu-iotests/214.out
 create mode 100755 tests/qemu-iotests/215
 create mode 100644 tests/qemu-iotests/215.out
 create mode 100755 tests/qemu-iotests/216
 create mode 100644 tests/qemu-iotests/216.out

From: Max Reitz <mreitz@redhat.com>

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180502202051.15493-4-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/153     | 17 +++++++++++++++++
 tests/qemu-iotests/153.out | 16 ++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/tests/qemu-iotests/153 b/tests/qemu-iotests/153
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/153
+++ b/tests/qemu-iotests/153
@@ -XXX,XX +XXX,XX @@ _run_cmd $QEMU_IO "${TEST_IMG}" -c 'write 0 512'
 
 _cleanup_qemu
 
+echo
+echo "== Detecting -U and force-share conflicts =="
+
+echo
+echo 'No conflict:'
+$QEMU_IMG info -U --image-opts driver=null-co,force-share=on
+echo
+echo 'Conflict:'
+$QEMU_IMG info -U --image-opts driver=null-co,force-share=off
+
+echo
+echo 'No conflict:'
+$QEMU_IO -c 'open -r -U -o driver=null-co,force-share=on'
+echo
+echo 'Conflict:'
+$QEMU_IO -c 'open -r -U -o driver=null-co,force-share=off'
+
 # success, all done
 echo "*** done"
 rm -f $seq.full
diff --git a/tests/qemu-iotests/153.out b/tests/qemu-iotests/153.out
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/153.out
+++ b/tests/qemu-iotests/153.out
@@ -XXX,XX +XXX,XX @@ Is another process using the image?
 Closing the other
 
 _qemu_io_wrapper TEST_DIR/t.qcow2 -c write 0 512
+
+== Detecting -U and force-share conflicts ==
+
+No conflict:
+image: null-co://
+file format: null-co
+virtual size: 1.0G (1073741824 bytes)
+disk size: unavailable
+
+Conflict:
+qemu-img: --force-share/-U conflicts with image options
+
+No conflict:
+
+Conflict:
+-U conflicts with image options
 *** done
--
2.13.6

From: Max Reitz <mreitz@redhat.com>

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 20180421132929.21610-8-mreitz@redhat.com
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/197 | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/197
+++ b/tests/qemu-iotests/197
@@ -XXX,XX +XXX,XX @@ esac
 _cleanup()
 {
     _cleanup_test_img
+    rm -f "$TEST_WRAP"
     rm -f "$BLKDBG_CONF"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
--
2.13.6

Every job gets a non-NULL job->txn on creation, but it doesn't
necessarily keep it until it is decommissioned: Finalising a job removes
it from its transaction. Therefore, calling 'blockdev-job-finalize' a
second time on an already concluded job causes an assertion failure.

Remove job->txn from the assertion in block_job_finalize() to fix this.
block_job_do_finalize() still has the same assertion, but if a job is
already removed from its transaction, block_job_apply_verb() will
already error out before we run into that assertion.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
---
 blockjob.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/blockjob.c b/blockjob.c
index XXXXXXX..XXXXXXX 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ void block_job_complete(BlockJob *job, Error **errp)
 
 void block_job_finalize(BlockJob *job, Error **errp)
 {
-    assert(job && job->id && job->txn);
+    assert(job && job->id);
     if (block_job_apply_verb(job, BLOCK_JOB_VERB_FINALIZE, errp)) {
         return;
     }
--
2.13.6

From: Max Reitz <mreitz@redhat.com>

img_open_opts() takes a QemuOpts and converts them to a QDict, so all
values therein are strings. Then it may try to call qdict_get_bool(),
however, which will fail with a segmentation fault every time:

    $ ./qemu-img info -U --image-opts \
        driver=file,filename=/dev/null,force-share=off
    [1]    27869 segmentation fault (core dumped)  ./qemu-img info -U
    --image-opts driver=file,filename=/dev/null,force-share=off

Fix this by using qdict_get_str() and comparing the value as a string.
Also, when adding a force-share value to the QDict, add it as a string
so it fits the rest of the dict.

Cc: qemu-stable@nongnu.org
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180502202051.15493-3-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qemu-img.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -XXX,XX +XXX,XX @@ static BlockBackend *img_open_opts(const char *optstr,
     options = qemu_opts_to_qdict(opts, NULL);
     if (force_share) {
         if (qdict_haskey(options, BDRV_OPT_FORCE_SHARE)
-            && !qdict_get_bool(options, BDRV_OPT_FORCE_SHARE)) {
+            && strcmp(qdict_get_str(options, BDRV_OPT_FORCE_SHARE), "on")) {
             error_report("--force-share/-U conflicts with image options");
             qobject_unref(options);
             return NULL;
         }
-        qdict_put_bool(options, BDRV_OPT_FORCE_SHARE, true);
+        qdict_put_str(options, BDRV_OPT_FORCE_SHARE, "on");
     }
     blk = blk_new_open(NULL, NULL, options, flags, &local_err);
     if (!blk) {
--
2.13.6

From: Max Reitz <mreitz@redhat.com>

This adds a simple copy-on-read filter driver. It relies on the already
existing COR functionality in the central block layer code, which may be
moved here once we no longer need it there.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180421132929.21610-2-mreitz@redhat.com
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qapi/block-core.json |   5 +-
 block/copy-on-read.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++++++
 block/Makefile.objs  |   2 +-
 3 files changed, 176 insertions(+), 2 deletions(-)
 create mode 100644 block/copy-on-read.c

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
 # @vxhs: Since 2.10
 # @throttle: Since 2.11
 # @nvme: Since 2.12
+# @copy-on-read: Since 2.13
 #
 # Since: 2.9
 ##
 { 'enum': 'BlockdevDriver',
-  'data': [ 'blkdebug', 'blkverify', 'bochs', 'cloop',
+  'data': [ 'blkdebug', 'blkverify', 'bochs', 'cloop', 'copy-on-read',
             'dmg', 'file', 'ftp', 'ftps', 'gluster', 'host_cdrom',
             'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
             'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
@@ -XXX,XX +XXX,XX @@
   'blkverify':  'BlockdevOptionsBlkverify',
   'bochs':      'BlockdevOptionsGenericFormat',
   'cloop':      'BlockdevOptionsGenericFormat',
+  'copy-on-read':'BlockdevOptionsGenericFormat',
   'dmg':        'BlockdevOptionsGenericFormat',
   'file':       'BlockdevOptionsFile',
   'ftp':        'BlockdevOptionsCurlFtp',
@@ -XXX,XX +XXX,XX @@
   'blkverify':      'BlockdevCreateNotSupported',
   'bochs':          'BlockdevCreateNotSupported',
   'cloop':          'BlockdevCreateNotSupported',
+  'copy-on-read':   'BlockdevCreateNotSupported',
   'dmg':            'BlockdevCreateNotSupported',
   'file':           'BlockdevCreateOptionsFile',
   'ftp':            'BlockdevCreateNotSupported',
diff --git a/block/copy-on-read.c b/block/copy-on-read.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/block/copy-on-read.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Copy-on-read filter block driver
+ *
+ * Copyright (c) 2018 Red Hat, Inc.
+ *
+ * Author:
+ *   Max Reitz <mreitz@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "block/block_int.h"
+
+
+static int cor_open(BlockDriverState *bs, QDict *options, int flags,
+                    Error **errp)
+{
+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false,
+                               errp);
+    if (!bs->file) {
+        return -EINVAL;
+    }
+
+    bs->supported_write_flags = BDRV_REQ_FUA &
+                                    bs->file->bs->supported_write_flags;
+
+    bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
+                                   bs->file->bs->supported_zero_flags;
+
+    return 0;
+}
+
+
+static void cor_close(BlockDriverState *bs)
+{
+}
+
+
+#define PERM_PASSTHROUGH (BLK_PERM_CONSISTENT_READ \
+                          | BLK_PERM_WRITE \
+                          | BLK_PERM_RESIZE)
+#define PERM_UNCHANGED (BLK_PERM_ALL & ~PERM_PASSTHROUGH)
+
+static void cor_child_perm(BlockDriverState *bs, BdrvChild *c,
+                           const BdrvChildRole *role,
+                           BlockReopenQueue *reopen_queue,
+                           uint64_t perm, uint64_t shared,
+                           uint64_t *nperm, uint64_t *nshared)
+{
+    if (c == NULL) {
+        *nperm = (perm & PERM_PASSTHROUGH) | BLK_PERM_WRITE_UNCHANGED;
+        *nshared = (shared & PERM_PASSTHROUGH) | PERM_UNCHANGED;
+        return;
+    }
+
+    *nperm = (perm & PERM_PASSTHROUGH) |
+             (c->perm & PERM_UNCHANGED);
+    *nshared = (shared & PERM_PASSTHROUGH) |
+               (c->shared_perm & PERM_UNCHANGED);
+}
+
+
+static int64_t cor_getlength(BlockDriverState *bs)
+{
+    return bdrv_getlength(bs->file->bs);
+}
+
+
+static int cor_truncate(BlockDriverState *bs, int64_t offset,
+                        PreallocMode prealloc, Error **errp)
+{
+    return bdrv_truncate(bs->file, offset, prealloc, errp);
+}
+
+
+static int coroutine_fn cor_co_preadv(BlockDriverState *bs,
+                                      uint64_t offset, uint64_t bytes,
+                                      QEMUIOVector *qiov, int flags)
+{
+    return bdrv_co_preadv(bs->file, offset, bytes, qiov,
+                          flags | BDRV_REQ_COPY_ON_READ);
+}
+
+
+static int coroutine_fn cor_co_pwritev(BlockDriverState *bs,
+                                       uint64_t offset, uint64_t bytes,
+                                       QEMUIOVector *qiov, int flags)
+{
+
+    return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
+}
+
+
+static int coroutine_fn cor_co_pwrite_zeroes(BlockDriverState *bs,
+                                             int64_t offset, int bytes,
+                                             BdrvRequestFlags flags)
+{
+    return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
+}
+
+
+static int coroutine_fn cor_co_pdiscard(BlockDriverState *bs,
+                                        int64_t offset, int bytes)
+{
+    return bdrv_co_pdiscard(bs->file->bs, offset, bytes);
+}
+
+
+static void cor_eject(BlockDriverState *bs, bool eject_flag)
180
+{
181
+ bdrv_eject(bs->file->bs, eject_flag);
182
+}
183
+
184
+
185
+static void cor_lock_medium(BlockDriverState *bs, bool locked)
186
+{
187
+ bdrv_lock_medium(bs->file->bs, locked);
188
+}
189
+
190
+
191
+static bool cor_recurse_is_first_non_filter(BlockDriverState *bs,
192
+ BlockDriverState *candidate)
193
+{
194
+ return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate);
195
+}
196
+
197
+
198
+BlockDriver bdrv_copy_on_read = {
199
+ .format_name = "copy-on-read",
200
+
201
+ .bdrv_open = cor_open,
202
+ .bdrv_close = cor_close,
203
+ .bdrv_child_perm = cor_child_perm,
204
+
205
+ .bdrv_getlength = cor_getlength,
206
+ .bdrv_truncate = cor_truncate,
207
+
208
+ .bdrv_co_preadv = cor_co_preadv,
209
+ .bdrv_co_pwritev = cor_co_pwritev,
210
+ .bdrv_co_pwrite_zeroes = cor_co_pwrite_zeroes,
211
+ .bdrv_co_pdiscard = cor_co_pdiscard,
212
+
213
+ .bdrv_eject = cor_eject,
214
+ .bdrv_lock_medium = cor_lock_medium,
215
+
216
+ .bdrv_co_block_status = bdrv_co_block_status_from_file,
217
+
218
+ .bdrv_recurse_is_first_non_filter = cor_recurse_is_first_non_filter,
219
+
220
+ .has_variable_length = true,
221
+ .is_filter = true,
222
+};
223
+
224
+static void bdrv_copy_on_read_init(void)
225
+{
226
+ bdrv_register(&bdrv_copy_on_read);
227
+}
228
+
229
+block_init(bdrv_copy_on_read_init);
230
diff --git a/block/Makefile.objs b/block/Makefile.objs
231
index XXXXXXX..XXXXXXX 100644
158
index XXXXXXX..XXXXXXX 100644
232
--- a/block/Makefile.objs
159
--- a/tests/Makefile.include
233
+++ b/block/Makefile.objs
160
+++ b/tests/Makefile.include
234
@@ -XXX,XX +XXX,XX @@ block-obj-y += accounting.o dirty-bitmap.o
161
@@ -XXX,XX +XXX,XX @@ gcov-files-test-thread-pool-y = thread-pool.c
235
block-obj-y += write-threshold.o
162
gcov-files-test-hbitmap-y = util/hbitmap.c
236
block-obj-y += backup.o
163
check-unit-y += tests/test-hbitmap$(EXESUF)
237
block-obj-$(CONFIG_REPLICATION) += replication.o
164
gcov-files-test-hbitmap-y = blockjob.c
238
-block-obj-y += throttle.o
165
+check-unit-y += tests/test-bdrv-drain$(EXESUF)
239
+block-obj-y += throttle.o copy-on-read.o
166
check-unit-y += tests/test-blockjob$(EXESUF)
240
167
check-unit-y += tests/test-blockjob-txn$(EXESUF)
241
block-obj-y += crypto.o
168
check-unit-y += tests/test-x86-cpuid$(EXESUF)
242
169
@@ -XXX,XX +XXX,XX @@ tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
170
tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
171
tests/test-aio-multithread$(EXESUF): tests/test-aio-multithread.o $(test-block-obj-y)
172
tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
173
+tests/test-bdrv-drain$(EXESUF): tests/test-bdrv-drain.o $(test-block-obj-y) $(test-util-obj-y)
174
tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
175
tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
176
tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y)
243
--
177
--
244
2.13.6
178
2.13.6
245
179
246
180
diff view generated by jsdifflib
From: Eric Blake <eblake@redhat.com>

We have too many driver callback interfaces; simplify the mess
somewhat by merging the flags parameter of .bdrv_co_writev_flags()
into .bdrv_co_writev(). Note that as long as a driver doesn't set
.supported_write_flags, the flags argument will be 0 and behavior is
identical. Also note that the public function bdrv_co_writev() still
lacks a flags argument; so the driver signature is thus intentionally
slightly different. But that's not the end of the world, nor the first
time that the driver interface differs slightly from the public
interface.

Ideally, we should be rewriting all of these drivers to use modern
byte-based interfaces. But that's a more invasive patch to write
and audit, compared to the simplification done here.

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block_int.h |  2 --
 block/gluster.c           |  4 +++-
 block/io.c                | 13 ++++---------
 block/iscsi.c             |  8 ++++----
 block/parallels.c         |  4 +++-
 block/qcow.c              |  6 ++++--
 block/qed.c               |  3 ++-
 block/replication.c       |  4 +++-
 block/sheepdog.c          |  4 +++-
 block/ssh.c               |  4 +++-
 block/vhdx.c              |  4 +++-
 11 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
     int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs,
         uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags);
     int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
-    int coroutine_fn (*bdrv_co_writev_flags)(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags);
     /**
      * @offset: position in bytes to write at
diff --git a/block/gluster.c b/block/gluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs,
 static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
                                                int64_t sector_num,
                                                int nb_sectors,
-                                               QEMUIOVector *qiov)
+                                               QEMUIOVector *qiov,
+                                               int flags)
 {
+    assert(!flags);
     return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1);
 }
 
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
     assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
 
-    if (drv->bdrv_co_writev_flags) {
-        ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
-                                        flags & bs->supported_write_flags);
-        flags &= ~bs->supported_write_flags;
-    } else {
-        assert(drv->bdrv_co_writev);
-        assert(!bs->supported_write_flags);
-        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
-    }
+    assert(drv->bdrv_co_writev);
+    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
+                              flags & bs->supported_write_flags);
+    flags &= ~bs->supported_write_flags;
 
 emulate_flags:
     if (ret == 0 && (flags & BDRV_REQ_FUA)) {
diff --git a/block/iscsi.c b/block/iscsi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun,
 }
 
 static int coroutine_fn
-iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-                      QEMUIOVector *iov, int flags)
+iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+                QEMUIOVector *iov, int flags)
 {
     IscsiLun *iscsilun = bs->opaque;
     struct IscsiTask iTask;
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_iscsi = {
     .bdrv_co_pdiscard      = iscsi_co_pdiscard,
     .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
     .bdrv_co_readv         = iscsi_co_readv,
-    .bdrv_co_writev_flags  = iscsi_co_writev_flags,
+    .bdrv_co_writev        = iscsi_co_writev,
     .bdrv_co_flush_to_disk = iscsi_co_flush,
 
 #ifdef __linux__
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_iser = {
     .bdrv_co_pdiscard      = iscsi_co_pdiscard,
     .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
     .bdrv_co_readv         = iscsi_co_readv,
-    .bdrv_co_writev_flags  = iscsi_co_writev_flags,
+    .bdrv_co_writev        = iscsi_co_writev,
     .bdrv_co_flush_to_disk = iscsi_co_flush,
 
 #ifdef __linux__
diff --git a/block/parallels.c b/block/parallels.c
index XXXXXXX..XXXXXXX 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn parallels_co_block_status(BlockDriverState *bs,
 }
 
 static coroutine_fn int parallels_co_writev(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+        int64_t sector_num, int nb_sectors,
+        QEMUIOVector *qiov, int flags)
 {
     BDRVParallelsState *s = bs->opaque;
     uint64_t bytes_done = 0;
     QEMUIOVector hd_qiov;
     int ret = 0;
 
+    assert(!flags);
     qemu_iovec_init(&hd_qiov, qiov->niov);
 
     while (nb_sectors > 0) {
diff --git a/block/qcow.c b/block/qcow.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
 }
 
 static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
-                                       int nb_sectors, QEMUIOVector *qiov)
+                                       int nb_sectors, QEMUIOVector *qiov,
+                                       int flags)
 {
     BDRVQcowState *s = bs->opaque;
     int index_in_cluster;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
     uint8_t *buf;
     void *orig_buf;
 
+    assert(!flags);
     s->cluster_cache_offset = -1; /* disable compressed cache */
 
     /* We must always copy the iov when encrypting, so we
@@ -XXX,XX +XXX,XX @@ qcow_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
     if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
         /* could not compress: write normal cluster */
         ret = qcow_co_writev(bs, offset >> BDRV_SECTOR_BITS,
-                             bytes >> BDRV_SECTOR_BITS, qiov);
+                             bytes >> BDRV_SECTOR_BITS, qiov, 0);
         if (ret < 0) {
             goto fail;
         }
diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs,
 
 static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs,
                                            int64_t sector_num, int nb_sectors,
-                                           QEMUIOVector *qiov)
+                                           QEMUIOVector *qiov, int flags)
 {
+    assert(!flags);
     return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
 }
 
diff --git a/block/replication.c b/block/replication.c
index XXXXXXX..XXXXXXX 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -XXX,XX +XXX,XX @@ out:
 static coroutine_fn int replication_co_writev(BlockDriverState *bs,
                                               int64_t sector_num,
                                               int remaining_sectors,
-                                              QEMUIOVector *qiov)
+                                              QEMUIOVector *qiov,
+                                              int flags)
 {
     BDRVReplicationState *s = bs->opaque;
     QEMUIOVector hd_qiov;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs,
     int ret;
     int64_t n;
 
+    assert(!flags);
     ret = replication_get_io_status(s);
     if (ret < 0) {
         goto out;
diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static void sd_aio_complete(SheepdogAIOCB *acb)
 }
 
 static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
-                                     int nb_sectors, QEMUIOVector *qiov)
+                                     int nb_sectors, QEMUIOVector *qiov,
+                                     int flags)
 {
     SheepdogAIOCB acb;
     int ret;
     int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE;
     BDRVSheepdogState *s = bs->opaque;
 
+    assert(!flags);
     if (offset > s->inode.vdi_size) {
         ret = sd_truncate(bs, offset, PREALLOC_MODE_OFF, NULL);
         if (ret < 0) {
diff --git a/block/ssh.c b/block/ssh.c
index XXXXXXX..XXXXXXX 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -XXX,XX +XXX,XX @@ static int ssh_write(BDRVSSHState *s, BlockDriverState *bs,
 
 static coroutine_fn int ssh_co_writev(BlockDriverState *bs,
                                       int64_t sector_num,
-                                      int nb_sectors, QEMUIOVector *qiov)
+                                      int nb_sectors, QEMUIOVector *qiov,
+                                      int flags)
 {
     BDRVSSHState *s = bs->opaque;
     int ret;
 
+    assert(!flags);
     qemu_co_mutex_lock(&s->lock);
     ret = ssh_write(s, bs, sector_num * BDRV_SECTOR_SIZE,
                     nb_sectors * BDRV_SECTOR_SIZE, qiov);
diff --git a/block/vhdx.c b/block/vhdx.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -XXX,XX +XXX,XX @@ int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s)
 }
 
 static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
-                                       int nb_sectors, QEMUIOVector *qiov)
+                                       int nb_sectors, QEMUIOVector *qiov,
+                                       int flags)
 {
     int ret = -ENOTSUP;
     BDRVVHDXState *s = bs->opaque;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
     uint64_t bat_prior_offset = 0;
     bool bat_update = false;
 
+    assert(!flags);
     qemu_iovec_init(&hd_qiov, qiov->niov);
 
     qemu_co_mutex_lock(&s->lock);
--
2.13.6

Now that the bdrv_drain_invoke() calls are pulled up to the callers of
bdrv_drain_recurse(), the 'begin' parameter isn't needed any more.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
 }
 
-static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
+static bool bdrv_drain_recurse(BlockDriverState *bs)
 {
     BdrvChild *child, *tmp;
     bool waited;
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
          */
         bdrv_ref(bs);
     }
-    waited |= bdrv_drain_recurse(bs, begin);
+    waited |= bdrv_drain_recurse(bs);
     if (in_main_loop) {
         bdrv_unref(bs);
     }
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
     }
 
     bdrv_drain_invoke(bs, true);
-    bdrv_drain_recurse(bs, true);
+    bdrv_drain_recurse(bs);
 }
 
 void bdrv_drained_end(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
 
     bdrv_parent_drained_end(bs);
     bdrv_drain_invoke(bs, false);
-    bdrv_drain_recurse(bs, false);
+    bdrv_drain_recurse(bs);
     aio_enable_external(bdrv_get_aio_context(bs));
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         aio_context_acquire(aio_context);
         for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
             if (aio_context == bdrv_get_aio_context(bs)) {
-                waited |= bdrv_drain_recurse(bs, true);
+                waited |= bdrv_drain_recurse(bs);
             }
         }
         aio_context_release(aio_context);
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         aio_enable_external(aio_context);
         bdrv_parent_drained_end(bs);
         bdrv_drain_invoke(bs, false);
-        bdrv_drain_recurse(bs, false);
+        bdrv_drain_recurse(bs);
         aio_context_release(aio_context);
     }
 
--
2.13.6
From: Max Reitz <mreitz@redhat.com>

We just need to forward it to quorum's children (except in case of a
rewrite because of corruption), but for that we first have to support
flags in child requests at all.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 20180421132929.21610-6-mreitz@redhat.com
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/quorum.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/block/quorum.c b/block/quorum.c
index XXXXXXX..XXXXXXX 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -XXX,XX +XXX,XX @@ struct QuorumAIOCB {
     /* Request metadata */
     uint64_t offset;
     uint64_t bytes;
+    int flags;
 
     QEMUIOVector *qiov;         /* calling IOV */
 
@@ -XXX,XX +XXX,XX @@ static bool quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b)
 static QuorumAIOCB *quorum_aio_get(BlockDriverState *bs,
                                    QEMUIOVector *qiov,
                                    uint64_t offset,
-                                   uint64_t bytes)
+                                   uint64_t bytes,
+                                   int flags)
 {
     BDRVQuorumState *s = bs->opaque;
     QuorumAIOCB *acb = g_new(QuorumAIOCB, 1);
@@ -XXX,XX +XXX,XX @@ static QuorumAIOCB *quorum_aio_get(BlockDriverState *bs,
         .bs                 = bs,
         .offset             = offset,
         .bytes              = bytes,
+        .flags              = flags,
         .qiov               = qiov,
         .votes.compare      = quorum_sha256_compare,
         .votes.vote_list    = QLIST_HEAD_INITIALIZER(acb.votes.vote_list),
@@ -XXX,XX +XXX,XX @@ static void quorum_rewrite_entry(void *opaque)
     BDRVQuorumState *s = acb->bs->opaque;
 
     /* Ignore any errors, it's just a correction attempt for already
-     * corrupted data. */
+     * corrupted data.
+     * Mask out BDRV_REQ_WRITE_UNCHANGED because this overwrites the
+     * area with different data from the other children. */
     bdrv_co_pwritev(s->children[co->idx], acb->offset, acb->bytes,
-                    acb->qiov, 0);
+                    acb->qiov, acb->flags & ~BDRV_REQ_WRITE_UNCHANGED);
 
     /* Wake up the caller after the last rewrite */
     acb->rewrite_count--;
@@ -XXX,XX +XXX,XX @@ static int quorum_co_preadv(BlockDriverState *bs, uint64_t offset,
                             uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
     BDRVQuorumState *s = bs->opaque;
-    QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes);
+    QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes, flags);
     int ret;
 
     acb->is_read = true;
@@ -XXX,XX +XXX,XX @@ static void write_quorum_entry(void *opaque)
 
         sacb->bs = s->children[i]->bs;
         sacb->ret = bdrv_co_pwritev(s->children[i], acb->offset, acb->bytes,
-                                    acb->qiov, 0);
+                                    acb->qiov, acb->flags);
         if (sacb->ret == 0) {
             acb->success_count++;
         } else {
@@ -XXX,XX +XXX,XX @@ static int quorum_co_pwritev(BlockDriverState *bs, uint64_t offset,
                              uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
     BDRVQuorumState *s = bs->opaque;
-    QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes);
+    QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes, flags);
     int i, ret;
 
     for (i = 0; i < s->num_children; i++) {
@@ -XXX,XX +XXX,XX @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
     }
     s->next_child_index = s->num_children;
 
+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED;
+
     g_free(opened);
     goto exit;
 
--
2.13.6

The device is drained, so there is no point in waiting for requests at
the end of the drained section. Remove the bdrv_drain_recurse() calls
there.

The bdrv_drain_recurse() calls were introduced in commit 481cad48e5e
in order to call the .bdrv_co_drain_end() driver callback. This is now
done by a separate bdrv_drain_invoke() call.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
 
     bdrv_parent_drained_end(bs);
     bdrv_drain_invoke(bs, false);
-    bdrv_drain_recurse(bs);
     aio_enable_external(bdrv_get_aio_context(bs));
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         aio_enable_external(aio_context);
         bdrv_parent_drained_end(bs);
         bdrv_drain_invoke(bs, false);
-        bdrv_drain_recurse(bs);
         aio_context_release(aio_context);
     }
 
--
2.13.6
From: Eric Blake <eblake@redhat.com>

We are gradually moving away from sector-based interfaces, towards
byte-based. Add new byte-based aio callbacks for read and write,
to match the fact that bdrv_aio_pdiscard is already byte-based.

Ideally, drivers should be converted to use coroutine callbacks
rather than aio; but that is not quite as trivial (and if we were
to do that conversion, the null-aio driver would disappear), so for
the short term, converting the signature but keeping things with
aio is easier. However, we CAN declare that a driver that uses
the byte-based aio interfaces now defaults to byte-based
operations, and must explicitly provide a refresh_limits override
to stick with larger alignments (making the alignment issues more
obvious directly in the drivers touched in the next few patches).

Once all drivers are converted, the sector-based aio callbacks will
be removed; in the meantime, a FIXME comment is added due to a
slight inefficiency that will be touched up as part of that later
cleanup.

Simplify some instances of 'bs->drv' into 'drv' while touching this,
since the local variable already exists to reduce typing.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block_int.h |  6 ++++++
 block/io.c                | 38 +++++++++++++++++++++++++++++---------
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
     BlockAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
         BlockCompletionFunc *cb, void *opaque);
+    BlockAIOCB *(*bdrv_aio_preadv)(BlockDriverState *bs,
+        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags,
+        BlockCompletionFunc *cb, void *opaque);
     BlockAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
         BlockCompletionFunc *cb, void *opaque);
+    BlockAIOCB *(*bdrv_aio_pwritev)(BlockDriverState *bs,
+        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags,
+        BlockCompletionFunc *cb, void *opaque);
     BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
         BlockCompletionFunc *cb, void *opaque);
     BlockAIOCB *(*bdrv_aio_pdiscard)(BlockDriverState *bs,
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
     }
 
     /* Default alignment based on whether driver has byte interface */
-    bs->bl.request_alignment = drv->bdrv_co_preadv ? 1 : 512;
+    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
+                                drv->bdrv_aio_preadv) ? 1 : 512;
 
     /* Take some limits from the children as a default */
     if (bs->file) {
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
         return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
     }
 
+    /* FIXME - no need to calculate these if .bdrv_aio_preadv exists */
     sector_num = offset >> BDRV_SECTOR_BITS;
     nb_sectors = bytes >> BDRV_SECTOR_BITS;
 
-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
+    if (!drv->bdrv_aio_preadv) {
+        assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+        assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+        assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
+    }
 
     if (drv->bdrv_co_readv) {
         return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
             .coroutine = qemu_coroutine_self(),
         };
 
-        acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
-                                      bdrv_co_io_em_complete, &co);
+        if (drv->bdrv_aio_preadv) {
+            acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
+                                       bdrv_co_io_em_complete, &co);
+        } else {
+            acb = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
+                                      bdrv_co_io_em_complete, &co);
+        }
         if (acb == NULL) {
             return -EIO;
         } else {
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
         goto emulate_flags;
     }
 
+    /* FIXME - no need to calculate these if .bdrv_aio_pwritev exists */
     sector_num = offset >> BDRV_SECTOR_BITS;
     nb_sectors = bytes >> BDRV_SECTOR_BITS;
 
-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
+    if (!drv->bdrv_aio_pwritev) {
+        assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+        assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+        assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
+    }
 
     if (drv->bdrv_co_writev_flags) {
         ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
             .coroutine = qemu_coroutine_self(),
         };
 
-        acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
+        if (drv->bdrv_aio_pwritev) {
+            acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
+                                        flags & bs->supported_write_flags,
+                                        bdrv_co_io_em_complete, &co);
+            flags &= ~bs->supported_write_flags;
+        } else {
+            assert(!bs->supported_write_flags);
+            acb = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
+        }
         if (acb == NULL) {
             ret = -EIO;
         } else {
--
2.13.6

Drain requests are propagated to child nodes, parent nodes and directly
to the AioContext. The order in which this happened was different
between all combinations of drain/drain_all and begin/end.

The correct order is to keep children only drained when their parents
are also drained. This means that at the start of a drained section, the
AioContext needs to be drained first, the parents second and only then
the children. The correct order for the end of a drained section is the
opposite.

This patch changes the three other functions to follow the example of
bdrv_drained_begin(), which is the only one that got it right.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
         return;
     }
 
+    /* Stop things in parent-to-child order */
     if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
         aio_disable_external(bdrv_get_aio_context(bs));
         bdrv_parent_drained_begin(bs);
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
         return;
     }
 
-    bdrv_parent_drained_end(bs);
+    /* Re-enable things in child-to-parent order */
     bdrv_drain_invoke(bs, false);
+    bdrv_parent_drained_end(bs);
     aio_enable_external(bdrv_get_aio_context(bs));
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
+        /* Stop things in parent-to-child order */
         aio_context_acquire(aio_context);
-        bdrv_parent_drained_begin(bs);
         aio_disable_external(aio_context);
+        bdrv_parent_drained_begin(bs);
         bdrv_drain_invoke(bs, true);
         aio_context_release(aio_context);
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
+        /* Re-enable things in child-to-parent order */
         aio_context_acquire(aio_context);
-        aio_enable_external(aio_context);
-        bdrv_parent_drained_end(bs);
         bdrv_drain_invoke(bs, false);
+        bdrv_parent_drained_end(bs);
+        aio_enable_external(aio_context);
         aio_context_release(aio_context);
     }
--
2.13.6
1
From: Alberto Garcia <berto@igalia.com>

The QMP version of this command can take a qdev ID since 7a9877a02635,
but the HMP version is still using the deprecated block device name so
there's no way to refer to a block device added like this:

   -blockdev node-name=disk0,driver=qcow2,file.driver=file,file.filename=hd.qcow2
   -device virtio-blk-pci,id=virtio-blk-pci0,drive=disk0

This patch works around this problem by using the specified name as a
qdev ID if the block device name is not found.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hmp.c           | 14 ++++++++++++--
 hmp-commands.hx |  3 ++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/hmp.c b/hmp.c
index XXXXXXX..XXXXXXX 100644
--- a/hmp.c
+++ b/hmp.c
@@ -XXX,XX +XXX,XX @@ void hmp_change(Monitor *mon, const QDict *qdict)
 void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict)
 {
     Error *err = NULL;
+    char *device = (char *) qdict_get_str(qdict, "device");
     BlockIOThrottle throttle = {
-        .has_device = true,
-        .device = (char *) qdict_get_str(qdict, "device"),
         .bps = qdict_get_int(qdict, "bps"),
         .bps_rd = qdict_get_int(qdict, "bps_rd"),
         .bps_wr = qdict_get_int(qdict, "bps_wr"),
@@ -XXX,XX +XXX,XX @@ void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict)
         .iops_wr = qdict_get_int(qdict, "iops_wr"),
     };
 
+    /* qmp_block_set_io_throttle has separate parameters for the
+     * (deprecated) block device name and the qdev ID but the HMP
+     * version has only one, so we must decide which one to pass. */
+    if (blk_by_name(device)) {
+        throttle.has_device = true;
+        throttle.device = device;
+    } else {
+        throttle.has_id = true;
+        throttle.id = device;
+    }
+
     qmp_block_set_io_throttle(&throttle, &err);
     hmp_handle_error(mon, &err);
 }

diff --git a/hmp-commands.hx b/hmp-commands.hx
index XXXXXXX..XXXXXXX 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -XXX,XX +XXX,XX @@ ETEXI
 STEXI
 @item block_set_io_throttle @var{device} @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr}
 @findex block_set_io_throttle
-Change I/O throttle limits for a block drive to @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr}
+Change I/O throttle limits for a block drive to @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr}.
+@var{device} can be a block device name, a qdev ID or a QOM path.
 ETEXI
 
     {
-- 
2.13.6

Commit 15afd94a047 added code to acquire and release the AioContext in
qemuio_command(). This means that the lock is taken twice now in the
call path from hmp_qemu_io(). This causes BDRV_POLL_WHILE() to hang for
any requests issued to nodes in a non-mainloop AioContext.

Dropping the first locking from hmp_qemu_io() fixes the problem.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 hmp.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/hmp.c b/hmp.c
index XXXXXXX..XXXXXXX 100644
--- a/hmp.c
+++ b/hmp.c
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
 {
     BlockBackend *blk;
     BlockBackend *local_blk = NULL;
-    AioContext *aio_context;
     const char* device = qdict_get_str(qdict, "device");
     const char* command = qdict_get_str(qdict, "command");
     Error *err = NULL;
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
         }
     }
 
-    aio_context = blk_get_aio_context(blk);
-    aio_context_acquire(aio_context);
-
     /*
      * Notably absent: Proper permission management. This is sad, but it seems
      * almost impossible to achieve without changing the semantics and thereby
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
      */
     qemuio_command(blk, command);
 
-    aio_context_release(aio_context);
-
 fail:
     blk_unref(local_blk);
     hmp_handle_error(mon, &err);
-- 
2.13.6
From: Alberto Garcia <berto@igalia.com>

The L2 and refcount caches have default sizes that can be overridden
using the l2-cache-size and refcount-cache-size (an additional
parameter named cache-size sets the combined size of both caches).

Unless forced by one of the aforementioned parameters, QEMU will set
the unspecified sizes so that the L2 cache is 4 times larger than the
refcount cache.

This is based on the premise that the refcount metadata needs to be
only a fourth of the L2 metadata to cover the same amount of disk
space. This is incorrect for two reasons:

 a) The amount of disk covered by an L2 table depends solely on the
    cluster size, but in the case of a refcount block it depends on
    the cluster size *and* the width of each refcount entry.
    The 4/1 ratio is only valid with 16-bit entries (the default).

 b) When we talk about disk space and L2 tables we are talking about
    guest space (L2 tables map guest clusters to host clusters),
    whereas refcount blocks are used for host clusters (including
    L1/L2 tables and the refcount blocks themselves). On a fully
    populated (and uncompressed) qcow2 file, image size > virtual size
    so there are more refcount entries than L2 entries.

Problem (a) could be fixed by adjusting the algorithm to take into
account the refcount entry width. Problem (b) could be fixed by
increasing a bit the refcount cache size to account for the clusters
used for qcow2 metadata.

However this patch takes a completely different approach and instead
of keeping a ratio between both cache sizes it assigns as much as
possible to the L2 cache and the remainder to the refcount cache.

The reason is that L2 tables are used for every single I/O request
from the guest and the effect of increasing the cache is significant
and clearly measurable. Refcount blocks are however only used for
cluster allocation and internal snapshots and in practice are accessed
sequentially in most cases, so the effect of increasing the cache is
negligible (even when doing random writes from the guest).

So, make the refcount cache as small as possible unless the user
explicitly asks for a larger one.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 9695182c2eb11b77cb319689a1ebaa4e7c9d6591.1523968389.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.h              |  4 ----
 block/qcow2.c              | 31 +++++++++++++++++++------------
 tests/qemu-iotests/137.out |  2 +-
 3 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -XXX,XX +XXX,XX @@
 #define DEFAULT_L2_CACHE_CLUSTERS 8 /* clusters */
 #define DEFAULT_L2_CACHE_BYTE_SIZE 1048576 /* bytes */
 
-/* The refblock cache needs only a fourth of the L2 cache size to cover as many
- * clusters */
-#define DEFAULT_L2_REFCOUNT_SIZE_RATIO 4
-
 #define DEFAULT_CLUSTER_SIZE 65536
 
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
         } else if (refcount_cache_size_set) {
             *l2_cache_size = combined_cache_size - *refcount_cache_size;
         } else {
-            *refcount_cache_size = combined_cache_size
-                                 / (DEFAULT_L2_REFCOUNT_SIZE_RATIO + 1);
-            *l2_cache_size = combined_cache_size - *refcount_cache_size;
+            uint64_t virtual_disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
+            uint64_t max_l2_cache = virtual_disk_size / (s->cluster_size / 8);
+            uint64_t min_refcount_cache =
+                (uint64_t) MIN_REFCOUNT_CACHE_SIZE * s->cluster_size;
+
+            /* Assign as much memory as possible to the L2 cache, and
+             * use the remainder for the refcount cache */
+            if (combined_cache_size >= max_l2_cache + min_refcount_cache) {
+                *l2_cache_size = max_l2_cache;
+                *refcount_cache_size = combined_cache_size - *l2_cache_size;
+            } else {
+                *refcount_cache_size =
+                    MIN(combined_cache_size, min_refcount_cache);
+                *l2_cache_size = combined_cache_size - *refcount_cache_size;
+            }
         }
     } else {
-        if (!l2_cache_size_set && !refcount_cache_size_set) {
+        if (!l2_cache_size_set) {
             *l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE,
                                  (uint64_t)DEFAULT_L2_CACHE_CLUSTERS
                                  * s->cluster_size);
-            *refcount_cache_size = *l2_cache_size
-                                 / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
-        } else if (!l2_cache_size_set) {
-            *l2_cache_size = *refcount_cache_size
-                           * DEFAULT_L2_REFCOUNT_SIZE_RATIO;
-        } else if (!refcount_cache_size_set) {
-            *refcount_cache_size = *l2_cache_size
-                                 / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
+        }
+        if (!refcount_cache_size_set) {
+            *refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE * s->cluster_size;
         }
     }

diff --git a/tests/qemu-iotests/137.out b/tests/qemu-iotests/137.out
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/137.out
+++ b/tests/qemu-iotests/137.out
@@ -XXX,XX +XXX,XX @@ refcount-cache-size may not exceed cache-size
 L2 cache size too big
 L2 cache entry size must be a power of two between 512 and the cluster size (65536)
 L2 cache entry size must be a power of two between 512 and the cluster size (65536)
-L2 cache size too big
+Refcount cache size too big
 Conflicting values for qcow2 options 'overlap-check' ('constant') and 'overlap-check.template' ('all')
 Unsupported value 'blubb' for qcow2 option 'overlap-check'. Allowed are any of the following: none, constant, cached, all
 Unsupported value 'blubb' for qcow2 option 'overlap-check'. Allowed are any of the following: none, constant, cached, all
-- 
2.13.6

From: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>

Since bdrv_co_preadv does all necessary checks, including
reading after the end of the backing file, avoid duplicating
the verification before the bdrv_co_preadv call.

Signed-off-by: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2.h |  3 ---
 block/qcow2.c | 51 ++++++++-------------------------------------------
 2 files changed, 8 insertions(+), 46 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -XXX,XX +XXX,XX @@ uint32_t offset_to_reftable_index(BDRVQcow2State *s, uint64_t offset)
 }
 
 /* qcow2.c functions */
-int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
-                        int64_t sector_num, int nb_sectors);
-
 int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
                                      int refcount_order, bool generous_increase,
                                      uint64_t *refblock_count);

diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
     return status;
 }
 
-/* handle reading after the end of the backing file */
-int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
-                        int64_t offset, int bytes)
-{
-    uint64_t bs_size = bs->total_sectors * BDRV_SECTOR_SIZE;
-    int n1;
-
-    if ((offset + bytes) <= bs_size) {
-        return bytes;
-    }
-
-    if (offset >= bs_size) {
-        n1 = 0;
-    } else {
-        n1 = bs_size - offset;
-    }
-
-    qemu_iovec_memset(qiov, n1, 0, bytes - n1);
-
-    return n1;
-}
-
 static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
                                         uint64_t bytes, QEMUIOVector *qiov,
                                         int flags)
 {
     BDRVQcow2State *s = bs->opaque;
-    int offset_in_cluster, n1;
+    int offset_in_cluster;
     int ret;
     unsigned int cur_bytes; /* number of bytes in current iteration */
     uint64_t cluster_offset = 0;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
         case QCOW2_CLUSTER_UNALLOCATED:
 
             if (bs->backing) {
-                /* read from the base image */
-                n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov,
-                                         offset, cur_bytes);
-                if (n1 > 0) {
-                    QEMUIOVector local_qiov;
-
-                    qemu_iovec_init(&local_qiov, hd_qiov.niov);
-                    qemu_iovec_concat(&local_qiov, &hd_qiov, 0, n1);
-
-                    BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
-                    qemu_co_mutex_unlock(&s->lock);
-                    ret = bdrv_co_preadv(bs->backing, offset, n1,
-                                         &local_qiov, 0);
-                    qemu_co_mutex_lock(&s->lock);
-
-                    qemu_iovec_destroy(&local_qiov);
-
-                    if (ret < 0) {
-                        goto fail;
-                    }
+                BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
+                qemu_co_mutex_unlock(&s->lock);
+                ret = bdrv_co_preadv(bs->backing, offset, cur_bytes,
+                                     &hd_qiov, 0);
+                qemu_co_mutex_lock(&s->lock);
+                if (ret < 0) {
+                    goto fail;
                 }
             } else {
                 /* Note: in this case, no need to wait */
-- 
2.13.6
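The allocation policy described in the commit message above can be sketched outside QEMU as follows. This is a minimal illustration, not the actual implementation: the 4-cluster refcount-cache minimum mirrors what `MIN_REFCOUNT_CACHE_SIZE` is assumed to be, and each 8-byte L2 entry is taken to map one guest cluster, so full L2 coverage needs `virtual_disk_size / (cluster_size / 8)` bytes.

```python
def read_cache_sizes(combined_cache_size, virtual_disk_size,
                     cluster_size=65536, min_refcount_clusters=4):
    """Split a combined cache budget between the L2 and refcount caches.

    Policy from the patch: give the L2 cache as much as it can actually
    use (full coverage of the virtual disk), and hand everything left
    over to the refcount cache, which gets at least a fixed minimum.
    Returns (l2_cache_size, refcount_cache_size) in bytes.
    """
    # One 8-byte L2 entry covers one guest cluster, so an L2 cache of
    # this size covers the whole virtual disk.
    max_l2_cache = virtual_disk_size // (cluster_size // 8)
    min_refcount_cache = min_refcount_clusters * cluster_size

    if combined_cache_size >= max_l2_cache + min_refcount_cache:
        l2_cache_size = max_l2_cache
        refcount_cache_size = combined_cache_size - l2_cache_size
    else:
        refcount_cache_size = min(combined_cache_size, min_refcount_cache)
        l2_cache_size = combined_cache_size - refcount_cache_size
    return l2_cache_size, refcount_cache_size

# An 8 GiB image with 64 KiB clusters needs 1 MiB of L2 metadata; with a
# 2 MiB combined cache the L2 cache is fully covered and the refcount
# cache gets the remaining 1 MiB.
print(read_cache_sizes(2 * 1024**2, 8 * 1024**3))  # → (1048576, 1048576)
```

With a budget too small to cover the L2 tables, the refcount cache is capped at its minimum and the L2 cache takes whatever is left, matching the `else` branch of the patch.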
From: John Snow <jsnow@redhat.com>

When we've reached the concluded state, we need to expose the error
state if applicable. Add the new field.

This should be sufficient for determining if a job completed
successfully or not after concluding; if we want to discriminate
based on how it failed more mechanically, we can always add an
explicit return code enumeration later.

I didn't bother to make it only show up if we are in the concluded
state; I don't think it's necessary.

Cc: qemu-stable@nongnu.org
Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qapi/block-core.json | 6 +++++-
 blockjob.c           | 2 ++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
 # @auto-dismiss: Job will dismiss itself when CONCLUDED, moving to the NULL
 #                state and disappearing from the query list. (since 2.12)
 #
+# @error: Error information if the job did not complete successfully.
+#         Not set if the job completed successfully. (since 2.12.1)
+#
 # Since: 1.1
 ##
 { 'struct': 'BlockJobInfo',
@@ -XXX,XX +XXX,XX @@
           'offset': 'int', 'busy': 'bool', 'paused': 'bool', 'speed': 'int',
           'io-status': 'BlockDeviceIoStatus', 'ready': 'bool',
           'status': 'BlockJobStatus',
-          'auto-finalize': 'bool', 'auto-dismiss': 'bool' } }
+          'auto-finalize': 'bool', 'auto-dismiss': 'bool',
+          '*error': 'str' } }
 
 ##
 # @query-block-jobs:
diff --git a/blockjob.c b/blockjob.c
index XXXXXXX..XXXXXXX 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
     info->status = job->status;
     info->auto_finalize = job->auto_finalize;
     info->auto_dismiss = job->auto_dismiss;
+    info->has_error = job->ret != 0;
+    info->error = job->ret ? g_strdup(strerror(-job->ret)) : NULL;
     return info;
 }
-- 
2.13.6

Removing a quorum child node with x-blockdev-change results in a quorum
driver state that cannot be recreated with create options because it
would require a list with gaps. This causes trouble in at least
.bdrv_refresh_filename().

Document this problem so that we won't accidentally mark the command
stable without having addressed it.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
---
 qapi/block-core.json | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
 # does not support all kinds of operations, all kinds of children, nor
 # all block drivers.
 #
+# FIXME Removing children from a quorum node means introducing gaps in the
+# child indices. This cannot be represented in the 'children' list of
+# BlockdevOptionsQuorum, as returned by .bdrv_refresh_filename().
+#
 # Warning: The data in a new quorum child MUST be consistent with that of
 # the rest of the array.
 #
-- 
2.13.6
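Since the new 'error' member is optional, a QMP client should treat its absence on a concluded job as success. A hypothetical client-side sketch (the member names come from the schema above; the lowercase status strings and the surrounding QMP transport are assumptions):

```python
def describe_job(info: dict) -> str:
    """Summarize one BlockJobInfo dict returned by query-block-jobs.

    'error' is only present when the job did not complete successfully,
    so its absence on a concluded job means the job succeeded.
    """
    status = info.get("status")
    if status == "concluded":
        err = info.get("error")
        if err:
            return f"job {info['device']} failed: {err}"
        return f"job {info['device']} completed successfully"
    return f"job {info['device']} is {status}"

print(describe_job({"device": "drive0", "status": "concluded",
                    "error": "No space left on device"}))
# → job drive0 failed: No space left on device
```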
From: Alberto Garcia <berto@igalia.com>

Compressed clusters are not supposed to have the COPIED bit set.
"qemu-img check" detects that and prints an error message reporting
the number of the affected host cluster. This doesn't make much sense
because compressed clusters are not aligned to host clusters, so it
would be better to report the offset instead. Plus, the calculation is
wrong and it uses the raw L2 entry as if it was simply an offset.

This patch fixes the error message and reports the offset of the
compressed cluster.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: 0f687957feb72e80c740403191a47e607c2463fe.1523376013.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-refcount.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
     case QCOW2_CLUSTER_COMPRESSED:
         /* Compressed clusters don't have QCOW_OFLAG_COPIED */
         if (l2_entry & QCOW_OFLAG_COPIED) {
-            fprintf(stderr, "ERROR: cluster %" PRId64 ": "
+            fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
                     "copied flag must never be set for compressed "
-                    "clusters\n", l2_entry >> s->cluster_bits);
+                    "clusters\n", l2_entry & s->cluster_offset_mask);
             l2_entry &= ~QCOW_OFLAG_COPIED;
             res->corruptions++;
         }

From: Doug Gale <doug16k@gmail.com>

Add trace output for commands, errors, and undefined behavior.
Add guest error log output for undefined behavior.
Report invalid undefined accesses to MMIO.
Annotate unlikely error checks with unlikely.

Signed-off-by: Doug Gale <doug16k@gmail.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/block/nvme.c       | 349 ++++++++++++++++++++++++++++++++++++++++++--------
 hw/block/trace-events |  93 ++++++++++++++
 2 files changed, 390 insertions(+), 52 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -XXX,XX +XXX,XX @@
 #include "qapi/visitor.h"
 #include "sysemu/block-backend.h"
 
+#include "qemu/log.h"
+#include "trace.h"
 #include "nvme.h"
 
+#define NVME_GUEST_ERR(trace, fmt, ...) \
+    do { \
+        (trace_##trace)(__VA_ARGS__); \
+        qemu_log_mask(LOG_GUEST_ERROR, #trace \
+            " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
+    } while (0)
+
 static void nvme_process_sq(void *opaque);
 
 static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
@@ -XXX,XX +XXX,XX @@ static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq)
 {
     if (cq->irq_enabled) {
         if (msix_enabled(&(n->parent_obj))) {
+            trace_nvme_irq_msix(cq->vector);
             msix_notify(&(n->parent_obj), cq->vector);
         } else {
+            trace_nvme_irq_pin();
             pci_irq_pulse(&n->parent_obj);
         }
+    } else {
+        trace_nvme_irq_masked();
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
     trans_len = MIN(len, trans_len);
     int num_prps = (len >> n->page_bits) + 1;
 
-    if (!prp1) {
+    if (unlikely(!prp1)) {
+        trace_nvme_err_invalid_prp();
         return NVME_INVALID_FIELD | NVME_DNR;
     } else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
                prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
     }
     len -= trans_len;
     if (len) {
-        if (!prp2) {
+        if (unlikely(!prp2)) {
+            trace_nvme_err_invalid_prp2_missing();
             goto unmap;
         }
         if (len > n->page_size) {
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
             uint64_t prp_ent = le64_to_cpu(prp_list[i]);
 
             if (i == n->max_prp_ents - 1 && len > n->page_size) {
-                if (!prp_ent || prp_ent & (n->page_size - 1)) {
+                if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
+                    trace_nvme_err_invalid_prplist_ent(prp_ent);
                     goto unmap;
                 }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
                 prp_ent = le64_to_cpu(prp_list[i]);
             }
 
-            if (!prp_ent || prp_ent & (n->page_size - 1)) {
+            if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
+                trace_nvme_err_invalid_prplist_ent(prp_ent);
                 goto unmap;
             }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
             i++;
         }
     } else {
-        if (prp2 & (n->page_size - 1)) {
+        if (unlikely(prp2 & (n->page_size - 1))) {
+            trace_nvme_err_invalid_prp2_align(prp2);
             goto unmap;
         }
         if (qsg->nsg) {
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
     QEMUIOVector iov;
     uint16_t status = NVME_SUCCESS;
 
+    trace_nvme_dma_read(prp1, prp2);
+
     if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
         return NVME_INVALID_FIELD | NVME_DNR;
     }
     if (qsg.nsg > 0) {
-        if (dma_buf_read(ptr, len, &qsg)) {
+        if (unlikely(dma_buf_read(ptr, len, &qsg))) {
+            trace_nvme_err_invalid_dma();
             status = NVME_INVALID_FIELD | NVME_DNR;
         }
         qemu_sglist_destroy(&qsg);
     } else {
-        if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) {
+        if (unlikely(qemu_iovec_to_buf(&iov, 0, ptr, len) != len)) {
+            trace_nvme_err_invalid_dma();
             status = NVME_INVALID_FIELD | NVME_DNR;
         }
         qemu_iovec_destroy(&iov);
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS);
     uint32_t aio_nlb = nlb << (data_shift - BDRV_SECTOR_BITS);
 
-    if (slba + nlb > ns->id_ns.nsze) {
+    if (unlikely(slba + nlb > ns->id_ns.nsze)) {
+        trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
         return NVME_LBA_RANGE | NVME_DNR;
     }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
     enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
 
-    if ((slba + nlb) > ns->id_ns.nsze) {
+    trace_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba);
+
+    if (unlikely((slba + nlb) > ns->id_ns.nsze)) {
         block_acct_invalid(blk_get_stats(n->conf.blk), acct);
+        trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
         return NVME_LBA_RANGE | NVME_DNR;
     }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     NvmeNamespace *ns;
     uint32_t nsid = le32_to_cpu(cmd->nsid);
 
-    if (nsid == 0 || nsid > n->num_namespaces) {
+    if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
+        trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
         return NVME_INVALID_NSID | NVME_DNR;
     }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     case NVME_CMD_READ:
         return nvme_rw(n, ns, cmd, req);
     default:
+        trace_nvme_err_invalid_opc(cmd->opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
     }
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
     NvmeCQueue *cq;
     uint16_t qid = le16_to_cpu(c->qid);
 
-    if (!qid || nvme_check_sqid(n, qid)) {
+    if (unlikely(!qid || nvme_check_sqid(n, qid))) {
+        trace_nvme_err_invalid_del_sq(qid);
         return NVME_INVALID_QID | NVME_DNR;
     }
 
+    trace_nvme_del_sq(qid);
+
     sq = n->sq[qid];
     while (!QTAILQ_EMPTY(&sq->out_req_list)) {
         req = QTAILQ_FIRST(&sq->out_req_list);
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
     uint16_t qflags = le16_to_cpu(c->sq_flags);
     uint64_t prp1 = le64_to_cpu(c->prp1);
 
-    if (!cqid || nvme_check_cqid(n, cqid)) {
+    trace_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
+
+    if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
+        trace_nvme_err_invalid_create_sq_cqid(cqid);
         return NVME_INVALID_CQID | NVME_DNR;
     }
-    if (!sqid || !nvme_check_sqid(n, sqid)) {
+    if (unlikely(!sqid || !nvme_check_sqid(n, sqid))) {
+        trace_nvme_err_invalid_create_sq_sqid(sqid);
         return NVME_INVALID_QID | NVME_DNR;
     }
-    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
+    if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
+        trace_nvme_err_invalid_create_sq_size(qsize);
         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
     }
-    if (!prp1 || prp1 & (n->page_size - 1)) {
+    if (unlikely(!prp1 || prp1 & (n->page_size - 1))) {
+        trace_nvme_err_invalid_create_sq_addr(prp1);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
-    if (!(NVME_SQ_FLAGS_PC(qflags))) {
+    if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
+        trace_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
         return NVME_INVALID_FIELD | NVME_DNR;
     }
     sq = g_malloc0(sizeof(*sq));
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
     NvmeCQueue *cq;
     uint16_t qid = le16_to_cpu(c->qid);
 
-    if (!qid || nvme_check_cqid(n, qid)) {
+    if (unlikely(!qid || nvme_check_cqid(n, qid))) {
+        trace_nvme_err_invalid_del_cq_cqid(qid);
         return NVME_INVALID_CQID | NVME_DNR;
     }
 
     cq = n->cq[qid];
-    if (!QTAILQ_EMPTY(&cq->sq_list)) {
+    if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
+        trace_nvme_err_invalid_del_cq_notempty(qid);
         return NVME_INVALID_QUEUE_DEL;
     }
+    trace_nvme_del_cq(qid);
     nvme_free_cq(cq, n);
     return NVME_SUCCESS;
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
     uint16_t qflags = le16_to_cpu(c->cq_flags);
     uint64_t prp1 = le64_to_cpu(c->prp1);
 
-    if (!cqid || !nvme_check_cqid(n, cqid)) {
+    trace_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
+                         NVME_CQ_FLAGS_IEN(qflags) != 0);
+
+    if (unlikely(!cqid || !nvme_check_cqid(n, cqid))) {
+        trace_nvme_err_invalid_create_cq_cqid(cqid);
         return NVME_INVALID_CQID | NVME_DNR;
     }
-    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
+    if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
+        trace_nvme_err_invalid_create_cq_size(qsize);
         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
     }
-    if (!prp1) {
+    if (unlikely(!prp1)) {
+        trace_nvme_err_invalid_create_cq_addr(prp1);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
-    if (vector > n->num_queues) {
+    if (unlikely(vector > n->num_queues)) {
+        trace_nvme_err_invalid_create_cq_vector(vector);
         return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
     }
-    if (!(NVME_CQ_FLAGS_PC(qflags))) {
+    if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
+        trace_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c)
     uint64_t prp1 = le64_to_cpu(c->prp1);
     uint64_t prp2 = le64_to_cpu(c->prp2);
 
+    trace_nvme_identify_ctrl();
+
     return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
                              prp1, prp2);
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c)
     uint64_t prp1 = le64_to_cpu(c->prp1);
     uint64_t prp2 = le64_to_cpu(c->prp2);
 
-    if (nsid == 0 || nsid > n->num_namespaces) {
+    trace_nvme_identify_ns(nsid);
+
+    if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
+        trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
         return NVME_INVALID_NSID | NVME_DNR;
     }
 
     ns = &n->namespaces[nsid - 1];
+
     return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
                              prp1, prp2);
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
     uint16_t ret;
     int i, j = 0;
 
+    trace_nvme_identify_nslist(min_nsid);
+
     list = g_malloc0(data_len);
     for (i = 0; i < n->num_namespaces; i++) {
         if (i < min_nsid) {
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
     case 0x02:
         return nvme_identify_nslist(n, c);
     default:
+        trace_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     switch (dw10) {
     case NVME_VOLATILE_WRITE_CACHE:
         result = blk_enable_write_cache(n->conf.blk);
+        trace_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
         break;
     case NVME_NUMBER_OF_QUEUES:
         result = cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
+        trace_nvme_getfeat_numq(result);
         break;
     default:
+        trace_nvme_err_invalid_getfeat(dw10);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
         blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
         break;
     case NVME_NUMBER_OF_QUEUES:
+        trace_nvme_setfeat_numq((dw11 & 0xFFFF) + 1,
+                                ((dw11 >> 16) & 0xFFFF) + 1,
+                                n->num_queues - 1, n->num_queues - 1);
         req->cqe.result =
             cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
         break;
     default:
+        trace_nvme_err_invalid_setfeat(dw10);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
     return NVME_SUCCESS;
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     case NVME_ADM_CMD_GET_FEATURES:
         return nvme_get_feature(n, cmd, req);
     default:
+        trace_nvme_err_invalid_admin_opc(cmd->opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
     }
 }
@@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n)
     uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
     uint32_t page_size = 1 << page_bits;
 
-    if (n->cq[0] || n->sq[0] || !n->bar.asq || !n->bar.acq ||
-        n->bar.asq & (page_size - 1) || n->bar.acq & (page_size - 1) ||
-        NVME_CC_MPS(n->bar.cc) < NVME_CAP_MPSMIN(n->bar.cap) ||
-        NVME_CC_MPS(n->bar.cc) > NVME_CAP_MPSMAX(n->bar.cap) ||
-        NVME_CC_IOCQES(n->bar.cc) < NVME_CTRL_CQES_MIN(n->id_ctrl.cqes) ||
-        NVME_CC_IOCQES(n->bar.cc) > NVME_CTRL_CQES_MAX(n->id_ctrl.cqes) ||
-        NVME_CC_IOSQES(n->bar.cc) < NVME_CTRL_SQES_MIN(n->id_ctrl.sqes) ||
-        NVME_CC_IOSQES(n->bar.cc) > NVME_CTRL_SQES_MAX(n->id_ctrl.sqes) ||
-        !NVME_AQA_ASQS(n->bar.aqa) || !NVME_AQA_ACQS(n->bar.aqa)) {
+    if (unlikely(n->cq[0])) {
+        trace_nvme_err_startfail_cq();
+        return -1;
+    }
+    if (unlikely(n->sq[0])) {
+        trace_nvme_err_startfail_sq();
+        return -1;
+    }
+    if (unlikely(!n->bar.asq)) {
+        trace_nvme_err_startfail_nbarasq();
+        return -1;
+    }
+    if (unlikely(!n->bar.acq)) {
+        trace_nvme_err_startfail_nbaracq();
+        return -1;
+    }
+    if (unlikely(n->bar.asq & (page_size - 1))) {
+        trace_nvme_err_startfail_asq_misaligned(n->bar.asq);
+        return -1;
+    }
+    if (unlikely(n->bar.acq & (page_size - 1))) {
+        trace_nvme_err_startfail_acq_misaligned(n->bar.acq);
+        return -1;
+    }
+    if (unlikely(NVME_CC_MPS(n->bar.cc) <
+                 NVME_CAP_MPSMIN(n->bar.cap))) {
+        trace_nvme_err_startfail_page_too_small(
+                    NVME_CC_MPS(n->bar.cc),
+                    NVME_CAP_MPSMIN(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_MPS(n->bar.cc) >
+                 NVME_CAP_MPSMAX(n->bar.cap))) {
+        trace_nvme_err_startfail_page_too_large(
+                    NVME_CC_MPS(n->bar.cc),
+                    NVME_CAP_MPSMAX(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOCQES(n->bar.cc) <
+                 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
+        trace_nvme_err_startfail_cqent_too_small(
+                    NVME_CC_IOCQES(n->bar.cc),
+                    NVME_CTRL_CQES_MIN(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOCQES(n->bar.cc) >
+                 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
+        trace_nvme_err_startfail_cqent_too_large(
+                    NVME_CC_IOCQES(n->bar.cc),
+                    NVME_CTRL_CQES_MAX(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOSQES(n->bar.cc) <
+                 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
+        trace_nvme_err_startfail_sqent_too_small(
+                    NVME_CC_IOSQES(n->bar.cc),
+                    NVME_CTRL_SQES_MIN(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOSQES(n->bar.cc) >
+                 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
+        trace_nvme_err_startfail_sqent_too_large(
+                    NVME_CC_IOSQES(n->bar.cc),
+                    NVME_CTRL_SQES_MAX(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) {
+        trace_nvme_err_startfail_asqent_sz_zero();
+        return -1;
+    }
+    if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) {
+        trace_nvme_err_startfail_acqent_sz_zero();
         return -1;
     }
 
@@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n)
 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
                            unsigned size)
 {
+    if (unlikely(offset & (sizeof(uint32_t) - 1))) {
+        NVME_GUEST_ERR(nvme_ub_mmiowr_misaligned32,
+                       "MMIO write not 32-bit aligned,"
+                       " offset=0x%"PRIx64"", offset);
+        /* should be ignored, fall through for now */
+    }
+
+    if (unlikely(size < sizeof(uint32_t))) {
+        NVME_GUEST_ERR(nvme_ub_mmiowr_toosmall,
+                       "MMIO write smaller than 32-bits,"
+                       " offset=0x%"PRIx64", size=%u",
+                       offset, size);
+        /* should be ignored, fall through for now */
+    }
+
     switch (offset) {
-    case 0xc:
+    case 0xc:   /* INTMS */
+        if (unlikely(msix_enabled(&(n->parent_obj)))) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
+                           "undefined access to interrupt mask set"
+                           " when MSI-X is enabled");
+            /* should be ignored, fall through for now */
+        }
         n->bar.intms |= data & 0xffffffff;
         n->bar.intmc = n->bar.intms;
+        trace_nvme_mmio_intm_set(data & 0xffffffff,
+                                 n->bar.intmc);
         break;
-    case 0x10:
+    case 0x10:  /* INTMC */
+        if (unlikely(msix_enabled(&(n->parent_obj)))) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
+                           "undefined access to interrupt mask clr"
+                           " when MSI-X is enabled");
+            /* should be ignored, fall through for now */
+        }
         n->bar.intms &= ~(data & 0xffffffff);
         n->bar.intmc = n->bar.intms;
+        trace_nvme_mmio_intm_clr(data & 0xffffffff,
+                                 n->bar.intmc);
         break;
-    case 0x14:
+    case 0x14:  /* CC */
+        trace_nvme_mmio_cfg(data & 0xffffffff);
         /* Windows first sends data, then sends enable bit */
         if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
             !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
@@ -XXX,XX +XXX,XX @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
 
         if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
n->bar.cc = data;
493
- if (nvme_start_ctrl(n)) {
494
+ if (unlikely(nvme_start_ctrl(n))) {
495
+ trace_nvme_err_startfail();
496
n->bar.csts = NVME_CSTS_FAILED;
497
} else {
498
+ trace_nvme_mmio_start_success();
499
n->bar.csts = NVME_CSTS_READY;
500
}
501
} else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
502
+ trace_nvme_mmio_stopped();
503
nvme_clear_ctrl(n);
504
n->bar.csts &= ~NVME_CSTS_READY;
505
}
506
if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
507
- nvme_clear_ctrl(n);
508
- n->bar.cc = data;
509
- n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
510
+ trace_nvme_mmio_shutdown_set();
511
+ nvme_clear_ctrl(n);
512
+ n->bar.cc = data;
513
+ n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
514
} else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
515
- n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
516
- n->bar.cc = data;
517
+ trace_nvme_mmio_shutdown_cleared();
518
+ n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
519
+ n->bar.cc = data;
520
+ }
521
+ break;
522
+ case 0x1C: /* CSTS */
523
+ if (data & (1 << 4)) {
524
+ NVME_GUEST_ERR(nvme_ub_mmiowr_ssreset_w1c_unsupported,
525
+ "attempted to W1C CSTS.NSSRO"
526
+ " but CAP.NSSRS is zero (not supported)");
527
+ } else if (data != 0) {
528
+ NVME_GUEST_ERR(nvme_ub_mmiowr_ro_csts,
529
+ "attempted to set a read only bit"
530
+ " of controller status");
531
+ }
532
+ break;
533
+ case 0x20: /* NSSR */
534
+ if (data == 0x4E564D65) {
535
+ trace_nvme_ub_mmiowr_ssreset_unsupported();
536
+ } else {
537
+ /* The spec says that writes of other values have no effect */
538
+ return;
539
}
540
break;
541
- case 0x24:
542
+ case 0x24: /* AQA */
543
n->bar.aqa = data & 0xffffffff;
544
+ trace_nvme_mmio_aqattr(data & 0xffffffff);
545
break;
546
- case 0x28:
547
+ case 0x28: /* ASQ */
548
n->bar.asq = data;
549
+ trace_nvme_mmio_asqaddr(data);
550
break;
551
- case 0x2c:
552
+ case 0x2c: /* ASQ hi */
553
n->bar.asq |= data << 32;
554
+ trace_nvme_mmio_asqaddr_hi(data, n->bar.asq);
555
break;
556
- case 0x30:
557
+ case 0x30: /* ACQ */
558
+ trace_nvme_mmio_acqaddr(data);
559
n->bar.acq = data;
560
break;
561
- case 0x34:
562
+ case 0x34: /* ACQ hi */
563
n->bar.acq |= data << 32;
564
+ trace_nvme_mmio_acqaddr_hi(data, n->bar.acq);
565
break;
566
+ case 0x38: /* CMBLOC */
567
+ NVME_GUEST_ERR(nvme_ub_mmiowr_cmbloc_reserved,
568
+ "invalid write to reserved CMBLOC"
569
+ " when CMBSZ is zero, ignored");
570
+ return;
571
+ case 0x3C: /* CMBSZ */
572
+ NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
573
+ "invalid write to read only CMBSZ, ignored");
574
+ return;
575
default:
576
+ NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
577
+ "invalid MMIO write,"
578
+ " offset=0x%"PRIx64", data=%"PRIx64"",
579
+ offset, data);
580
break;
581
}
582
}
583
@@ -XXX,XX +XXX,XX @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
584
uint8_t *ptr = (uint8_t *)&n->bar;
585
uint64_t val = 0;
586
587
+ if (unlikely(addr & (sizeof(uint32_t) - 1))) {
588
+ NVME_GUEST_ERR(nvme_ub_mmiord_misaligned32,
589
+ "MMIO read not 32-bit aligned,"
590
+ " offset=0x%"PRIx64"", addr);
591
+ /* should RAZ, fall through for now */
592
+ } else if (unlikely(size < sizeof(uint32_t))) {
593
+ NVME_GUEST_ERR(nvme_ub_mmiord_toosmall,
594
+ "MMIO read smaller than 32-bits,"
595
+ " offset=0x%"PRIx64"", addr);
596
+ /* should RAZ, fall through for now */
597
+ }
598
+
599
if (addr < sizeof(n->bar)) {
600
memcpy(&val, ptr + addr, size);
601
+ } else {
602
+ NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
603
+ "MMIO read beyond last register,"
604
+ " offset=0x%"PRIx64", returning 0", addr);
605
}
606
+
607
return val;
608
}
609
610
@@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
611
{
612
uint32_t qid;
613
614
- if (addr & ((1 << 2) - 1)) {
615
+ if (unlikely(addr & ((1 << 2) - 1))) {
616
+ NVME_GUEST_ERR(nvme_ub_db_wr_misaligned,
617
+ "doorbell write not 32-bit aligned,"
618
+ " offset=0x%"PRIx64", ignoring", addr);
619
return;
620
}
621
622
if (((addr - 0x1000) >> 2) & 1) {
623
+ /* Completion queue doorbell write */
624
+
625
uint16_t new_head = val & 0xffff;
626
int start_sqs;
627
NvmeCQueue *cq;
628
629
qid = (addr - (0x1000 + (1 << 2))) >> 3;
630
- if (nvme_check_cqid(n, qid)) {
631
+ if (unlikely(nvme_check_cqid(n, qid))) {
632
+ NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cq,
633
+ "completion queue doorbell write"
634
+ " for nonexistent queue,"
635
+ " sqid=%"PRIu32", ignoring", qid);
636
return;
637
}
638
639
cq = n->cq[qid];
640
- if (new_head >= cq->size) {
641
+ if (unlikely(new_head >= cq->size)) {
642
+ NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cqhead,
643
+ "completion queue doorbell write value"
644
+ " beyond queue size, sqid=%"PRIu32","
645
+ " new_head=%"PRIu16", ignoring",
646
+ qid, new_head);
647
return;
648
}
649
650
@@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
651
nvme_isr_notify(n, cq);
652
}
653
} else {
654
+ /* Submission queue doorbell write */
655
+
656
uint16_t new_tail = val & 0xffff;
657
NvmeSQueue *sq;
658
659
qid = (addr - 0x1000) >> 3;
660
- if (nvme_check_sqid(n, qid)) {
661
+ if (unlikely(nvme_check_sqid(n, qid))) {
662
+ NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sq,
663
+ "submission queue doorbell write"
664
+ " for nonexistent queue,"
665
+ " sqid=%"PRIu32", ignoring", qid);
666
return;
667
}
668
669
sq = n->sq[qid];
670
- if (new_tail >= sq->size) {
671
+ if (unlikely(new_tail >= sq->size)) {
672
+ NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sqtail,
673
+ "submission queue doorbell write value"
674
+ " beyond queue size, sqid=%"PRIu32","
675
+ " new_tail=%"PRIu16", ignoring",
676
+ qid, new_tail);
677
return;
678
}
679
680
diff --git a/hw/block/trace-events b/hw/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -XXX,XX +XXX,XX @@ virtio_blk_submit_multireq(void *vdev, void *mrb, int start, int num_reqs, uint6
 hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d"
 hd_geometry_guess(void *blk, uint32_t cyls, uint32_t heads, uint32_t secs, int trans) "blk %p CHS %u %u %u trans %d"

+# hw/block/nvme.c
+# nvme traces for successful events
+nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u"
+nvme_irq_pin(void) "pulsing IRQ pin"
+nvme_irq_masked(void) "IRQ is masked"
+nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
+nvme_rw(char const *verb, uint32_t blk_count, uint64_t byte_count, uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64""
+nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
+nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
+nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
+nvme_del_cq(uint16_t cqid) "deleted completion queue, sqid=%"PRIu16""
+nvme_identify_ctrl(void) "identify controller"
+nvme_identify_ns(uint16_t ns) "identify namespace, nsid=%"PRIu16""
+nvme_identify_nslist(uint16_t ns) "identify namespace list, nsid=%"PRIu16""
+nvme_getfeat_vwcache(char const* result) "get feature volatile write cache, result=%s"
+nvme_getfeat_numq(int result) "get feature number of queues, result=%d"
+nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d"
+nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64""
+nvme_mmio_intm_clr(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask clr, data=0x%"PRIx64", new_mask=0x%"PRIx64""
+nvme_mmio_cfg(uint64_t data) "wrote MMIO, config controller config=0x%"PRIx64""
+nvme_mmio_aqattr(uint64_t data) "wrote MMIO, admin queue attributes=0x%"PRIx64""
+nvme_mmio_asqaddr(uint64_t data) "wrote MMIO, admin submission queue address=0x%"PRIx64""
+nvme_mmio_acqaddr(uint64_t data) "wrote MMIO, admin completion queue address=0x%"PRIx64""
+nvme_mmio_asqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin submission queue high half=0x%"PRIx64", new_address=0x%"PRIx64""
+nvme_mmio_acqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin completion queue high half=0x%"PRIx64", new_address=0x%"PRIx64""
+nvme_mmio_start_success(void) "setting controller enable bit succeeded"
+nvme_mmio_stopped(void) "cleared controller enable bit"
+nvme_mmio_shutdown_set(void) "shutdown bit set"
+nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
+
+# nvme traces for error conditions
+nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
+nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
+nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""
+nvme_err_invalid_prp2_missing(void) "PRP2 is null and more data to be transferred"
+nvme_err_invalid_field(void) "invalid field"
+nvme_err_invalid_prp(void) "invalid PRP"
+nvme_err_invalid_sgl(void) "invalid SGL"
+nvme_err_invalid_ns(uint32_t ns, uint32_t limit) "invalid namespace %u not within 1-%u"
+nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8""
+nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
+nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
+nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16""
+nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16""
+nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission queue, invalid sqid=%"PRIu16""
+nvme_err_invalid_create_sq_size(uint16_t qsize) "failed creating submission queue, invalid qsize=%"PRIu16""
+nvme_err_invalid_create_sq_addr(uint64_t addr) "failed creating submission queue, addr=0x%"PRIx64""
+nvme_err_invalid_create_sq_qflags(uint16_t qflags) "failed creating submission queue, qflags=%"PRIu16""
+nvme_err_invalid_del_cq_cqid(uint16_t cqid) "failed deleting completion queue, cqid=%"PRIu16""
+nvme_err_invalid_del_cq_notempty(uint16_t cqid) "failed deleting completion queue, it is not empty, cqid=%"PRIu16""
+nvme_err_invalid_create_cq_cqid(uint16_t cqid) "failed creating completion queue, cqid=%"PRIu16""
+nvme_err_invalid_create_cq_size(uint16_t size) "failed creating completion queue, size=%"PRIu16""
+nvme_err_invalid_create_cq_addr(uint64_t addr) "failed creating completion queue, addr=0x%"PRIx64""
+nvme_err_invalid_create_cq_vector(uint16_t vector) "failed creating completion queue, vector=%"PRIu16""
+nvme_err_invalid_create_cq_qflags(uint16_t qflags) "failed creating completion queue, qflags=%"PRIu16""
+nvme_err_invalid_identify_cns(uint16_t cns) "identify, invalid cns=0x%"PRIx16""
+nvme_err_invalid_getfeat(int dw10) "invalid get features, dw10=0x%"PRIx32""
+nvme_err_invalid_setfeat(uint32_t dw10) "invalid set features, dw10=0x%"PRIx32""
+nvme_err_startfail_cq(void) "nvme_start_ctrl failed because there are non-admin completion queues"
+nvme_err_startfail_sq(void) "nvme_start_ctrl failed because there are non-admin submission queues"
+nvme_err_startfail_nbarasq(void) "nvme_start_ctrl failed because the admin submission queue address is null"
+nvme_err_startfail_nbaracq(void) "nvme_start_ctrl failed because the admin completion queue address is null"
+nvme_err_startfail_asq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin submission queue address is misaligned: 0x%"PRIx64""
+nvme_err_startfail_acq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin completion queue address is misaligned: 0x%"PRIx64""
+nvme_err_startfail_page_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the page size is too small: log2size=%u, min=%u"
+nvme_err_startfail_page_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the page size is too large: log2size=%u, max=%u"
+nvme_err_startfail_cqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too small: log2size=%u, min=%u"
+nvme_err_startfail_cqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too large: log2size=%u, max=%u"
+nvme_err_startfail_sqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too small: log2size=%u, min=%u"
+nvme_err_startfail_sqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too large: log2size=%u, max=%u"
+nvme_err_startfail_asqent_sz_zero(void) "nvme_start_ctrl failed because the admin submission queue size is zero"
+nvme_err_startfail_acqent_sz_zero(void) "nvme_start_ctrl failed because the admin completion queue size is zero"
+nvme_err_startfail(void) "setting controller enable bit failed"
+
+# Traces for undefined behavior
+nvme_ub_mmiowr_misaligned32(uint64_t offset) "MMIO write not 32-bit aligned, offset=0x%"PRIx64""
+nvme_ub_mmiowr_toosmall(uint64_t offset, unsigned size) "MMIO write smaller than 32 bits, offset=0x%"PRIx64", size=%u"
+nvme_ub_mmiowr_intmask_with_msix(void) "undefined access to interrupt mask set when MSI-X is enabled"
+nvme_ub_mmiowr_ro_csts(void) "attempted to set a read only bit of controller status"
+nvme_ub_mmiowr_ssreset_w1c_unsupported(void) "attempted to W1C CSTS.NSSRO but CAP.NSSRS is zero (not supported)"
+nvme_ub_mmiowr_ssreset_unsupported(void) "attempted NVM subsystem reset but CAP.NSSRS is zero (not supported)"
+nvme_ub_mmiowr_cmbloc_reserved(void) "invalid write to reserved CMBLOC when CMBSZ is zero, ignored"
+nvme_ub_mmiowr_cmbsz_readonly(void) "invalid write to read only CMBSZ, ignored"
+nvme_ub_mmiowr_invalid(uint64_t offset, uint64_t data) "invalid MMIO write, offset=0x%"PRIx64", data=0x%"PRIx64""
+nvme_ub_mmiord_misaligned32(uint64_t offset) "MMIO read not 32-bit aligned, offset=0x%"PRIx64""
+nvme_ub_mmiord_toosmall(uint64_t offset) "MMIO read smaller than 32-bits, offset=0x%"PRIx64""
+nvme_ub_mmiord_invalid_ofs(uint64_t offset) "MMIO read beyond last register, offset=0x%"PRIx64", returning 0"
+nvme_ub_db_wr_misaligned(uint64_t offset) "doorbell write not 32-bit aligned, offset=0x%"PRIx64", ignoring"
+nvme_ub_db_wr_invalid_cq(uint32_t qid) "completion queue doorbell write for nonexistent queue, cqid=%"PRIu32", ignoring"
+nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion queue doorbell write value beyond queue size, cqid=%"PRIu32", new_head=%"PRIu16", ignoring"
+nvme_ub_db_wr_invalid_sq(uint32_t qid) "submission queue doorbell write for nonexistent queue, sqid=%"PRIu32", ignoring"
+nvme_ub_db_wr_invalid_sqtail(uint32_t qid, uint16_t new_tail) "submission queue doorbell write value beyond queue size, sqid=%"PRIu32", new_head=%"PRIu16", ignoring"
+
 # hw/block/xen_disk.c
 xen_disk_alloc(char *name) "%s"
 xen_disk_init(char *name) "%s"
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

Currently, qemu-io only uses string-valued blockdev options (as all are
converted directly from QemuOpts) -- with one exception: -U adds the
force-share option as a boolean. This in itself is already a bit
questionable, but a real issue is that it also assumes the value already
existing in the options QDict would be a boolean, which is wrong.

That has the following effect:

$ ./qemu-io -r -U --image-opts \
      driver=file,filename=/dev/null,force-share=off
[1]    15200 segmentation fault (core dumped)  ./qemu-io -r -U
--image-opts driver=file,filename=/dev/null,force-share=off

Since @opts is converted from QemuOpts, the value must be a string, and
we have to compare it as such. Consequently, it makes sense to also set
it as a string instead of a boolean.

Cc: qemu-stable@nongnu.org
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180502202051.15493-2-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qemu-io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qemu-io.c b/qemu-io.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -XXX,XX +XXX,XX @@ static int openfile(char *name, int flags, bool writethrough, bool force_share,
             opts = qdict_new();
         }
         if (qdict_haskey(opts, BDRV_OPT_FORCE_SHARE)
-            && !qdict_get_bool(opts, BDRV_OPT_FORCE_SHARE)) {
+            && strcmp(qdict_get_str(opts, BDRV_OPT_FORCE_SHARE), "on")) {
             error_report("-U conflicts with image options");
             qobject_unref(opts);
             return 1;
         }
-        qdict_put_bool(opts, BDRV_OPT_FORCE_SHARE, true);
+        qdict_put_str(opts, BDRV_OPT_FORCE_SHARE, "on");
     }
     qemuio_blk = blk_new_open(name, NULL, opts, flags, &local_err);
     if (!qemuio_blk) {
-- 
2.13.6

From: Fam Zheng <famz@redhat.com>

Management tools create overlays of running guests with qemu-img:

$ qemu-img create -b /image/in/use.qcow2 -f qcow2 /overlay/image.qcow2

but this doesn't work anymore due to image locking:

qemu-img: /overlay/image.qcow2: Failed to get shared "write" lock
Is another process using the image?
Could not open backing image to determine size.
Use the force share option to allow this use case again.

Cc: qemu-stable@nongnu.org
Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ void bdrv_img_create(const char *filename, const char *fmt,
         back_flags = flags;
         back_flags &= ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

+        backing_options = qdict_new();
         if (backing_fmt) {
-            backing_options = qdict_new();
             qdict_put_str(backing_options, "driver", backing_fmt);
         }
+        qdict_put_bool(backing_options, BDRV_OPT_FORCE_SHARE, true);

         bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
                        &local_err);
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

Update the rest of the filter drivers to support
BDRV_REQ_WRITE_UNCHANGED. They already forward write request flags to
their children, so we just have to announce support for it.

This patch does not cover the replication driver because that currently
does not support flags at all, and because it just grabs the WRITE
permission for its children when it can, so we should be fine just
submitting the incoming WRITE_UNCHANGED requests as normal writes.

It also does not cover format drivers for similar reasons. They all use
bdrv_format_default_perms() as their .bdrv_child_perm() implementation
so they just always grab the WRITE permission for their file children
whenever possible. In addition, it often would be difficult to
ascertain whether incoming unchanging writes end up as unchanging writes
in their files. So we just leave them as normal potentially changing
writes.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 20180421132929.21610-7-mreitz@redhat.com
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/blkdebug.c     |  9 +++++----
 block/blkreplay.c    |  3 +++
 block/blkverify.c    |  3 +++
 block/copy-on-read.c | 10 ++++++----
 block/mirror.c       |  2 ++
 block/raw-format.c   |  9 +++++----
 block/throttle.c     |  6 ++++--
 7 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index XXXXXXX..XXXXXXX 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -XXX,XX +XXX,XX @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
         goto out;
     }

-    bs->supported_write_flags = BDRV_REQ_FUA &
-        bs->file->bs->supported_write_flags;
-    bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
-        bs->file->bs->supported_zero_flags;
+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
+        (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
+    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
+        ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
+         bs->file->bs->supported_zero_flags);

     /* Set alignment overrides */
diff --git a/block/blkreplay.c b/block/blkreplay.c
index XXXXXXX..XXXXXXX 100755
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -XXX,XX +XXX,XX @@ static int blkreplay_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }

+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED;
+    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED;
+
     ret = 0;
 fail:
     return ret;
diff --git a/block/blkverify.c b/block/blkverify.c
index XXXXXXX..XXXXXXX 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -XXX,XX +XXX,XX @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }

+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED;
+    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED;
+
     ret = 0;
 fail:
     qemu_opts_del(opts);
diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index XXXXXXX..XXXXXXX 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -XXX,XX +XXX,XX @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags,
         return -EINVAL;
     }

-    bs->supported_write_flags = BDRV_REQ_FUA &
-        bs->file->bs->supported_write_flags;
+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
+        (BDRV_REQ_FUA &
+            bs->file->bs->supported_write_flags);

-    bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
-        bs->file->bs->supported_zero_flags;
+    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
+        ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
+            bs->file->bs->supported_zero_flags);

     return 0;
 }
diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
         mirror_top_bs->implicit = true;
     }
     mirror_top_bs->total_sectors = bs->total_sectors;
+    mirror_top_bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED;
+    mirror_top_bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED;
     bdrv_set_aio_context(mirror_top_bs, bdrv_get_aio_context(bs));

     /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep
diff --git a/block/raw-format.c b/block/raw-format.c
index XXXXXXX..XXXXXXX 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -XXX,XX +XXX,XX @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
     }

     bs->sg = bs->file->bs->sg;
-    bs->supported_write_flags = BDRV_REQ_FUA &
-        bs->file->bs->supported_write_flags;
-    bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
-        bs->file->bs->supported_zero_flags;
+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
+        (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
+    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
+        ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
+         bs->file->bs->supported_zero_flags);

     if (bs->probed && !bdrv_is_read_only(bs)) {
         fprintf(stderr,
diff --git a/block/throttle.c b/block/throttle.c
index XXXXXXX..XXXXXXX 100644
--- a/block/throttle.c
+++ b/block/throttle.c
@@ -XXX,XX +XXX,XX @@ static int throttle_open(BlockDriverState *bs, QDict *options,
     if (!bs->file) {
         return -EINVAL;
     }
-    bs->supported_write_flags = bs->file->bs->supported_write_flags;
-    bs->supported_zero_flags = bs->file->bs->supported_zero_flags;
+    bs->supported_write_flags = bs->file->bs->supported_write_flags |
+                                BDRV_REQ_WRITE_UNCHANGED;
+    bs->supported_zero_flags = bs->file->bs->supported_zero_flags |
+                               BDRV_REQ_WRITE_UNCHANGED;

     return throttle_configure_tgm(bs, tgm, options, errp);
 }
-- 
2.13.6

From: Thomas Huth <thuth@redhat.com>

It's not working anymore since QEMU v1.3.0 - time to remove it now.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c    | 11 -----------
 qemu-doc.texi |  6 ------
 2 files changed, 17 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index XXXXXXX..XXXXXXX 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -XXX,XX +XXX,XX @@ QemuOptsList qemu_legacy_drive_opts = {
             .type = QEMU_OPT_STRING,
             .help = "chs translation (auto, lba, none)",
         },{
-            .name = "boot",
-            .type = QEMU_OPT_BOOL,
-            .help = "(deprecated, ignored)",
-        },{
             .name = "addr",
             .type = QEMU_OPT_STRING,
             .help = "pci address (virtio only)",
@@ -XXX,XX +XXX,XX @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type)
         ret = -EINVAL;
         goto fail;
     }

-    /* Deprecated option boot=[on|off] */
-    if (qemu_opt_get(legacy_opts, "boot") != NULL) {
-        fprintf(stderr, "qemu-kvm: boot=on|off is deprecated and will be "
-                "ignored. Future versions will reject this parameter. Please "
-                "update your scripts.\n");
-    }
-
     /* Other deprecated options */
     if (!qtest_enabled()) {
         for (i = 0; i < ARRAY_SIZE(deprecated); i++) {
diff --git a/qemu-doc.texi b/qemu-doc.texi
index XXXXXXX..XXXXXXX 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -XXX,XX +XXX,XX @@ deprecated.

 @section System emulator command line arguments

-@subsection -drive boot=on|off (since 1.3.0)
-
-The ``boot=on|off'' option to the ``-drive'' argument is
-ignored. Applications should use the ``bootindex=N'' parameter
-to set an absolute ordering between devices instead.
-
 @subsection -tdf (since 1.3.0)

 The ``-tdf'' argument is ignored. The behaviour implemented
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

Add BDRV_REQ_WRITE_UNCHANGED to the list of flags honored during pwrite
and pwrite_zeroes, and also add a note on when you absolutely need to
support it.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180502140359.18222-1-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 include/block/block_int.h | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
     /* I/O Limits */
     BlockLimits bl;

-    /* Flags honored during pwrite (so far: BDRV_REQ_FUA) */
+    /* Flags honored during pwrite (so far: BDRV_REQ_FUA,
+     * BDRV_REQ_WRITE_UNCHANGED).
+     * If a driver does not support BDRV_REQ_WRITE_UNCHANGED, those
+     * writes will be issued as normal writes without the flag set.
+     * This is important to note for drivers that do not explicitly
+     * request a WRITE permission for their children and instead take
+     * the same permissions as their parent did (this is commonly what
+     * block filters do). Such drivers have to be aware that the
+     * parent may have taken a WRITE_UNCHANGED permission only and is
+     * issuing such requests. Drivers either must make sure that
+     * these requests do not result in plain WRITE accesses (usually
+     * by supporting BDRV_REQ_WRITE_UNCHANGED, and then forwarding
+     * every incoming write request as-is, including potentially that
+     * flag), or they have to explicitly take the WRITE permission for
+     * their children. */
     unsigned int supported_write_flags;
     /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA,
-     * BDRV_REQ_MAY_UNMAP) */
+     * BDRV_REQ_MAY_UNMAP, BDRV_REQ_WRITE_UNCHANGED) */
     unsigned int supported_zero_flags;

     /* the following member gives a name to every node on the bs graph. */
-- 
2.13.6

From: Thomas Huth <thuth@redhat.com>

It's been marked as deprecated since QEMU v2.10.0, and so far nobody
complained that we should keep it, so let's remove this legacy option
now to simplify the code quite a bit.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 vl.c            | 86 ++-------------------------------------------------------
 qemu-doc.texi   |  8 ------
 qemu-options.hx | 19 ++-----------
 3 files changed, 4 insertions(+), 109 deletions(-)

diff --git a/vl.c b/vl.c
index XXXXXXX..XXXXXXX 100644
--- a/vl.c
+++ b/vl.c
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
     const char *boot_order = NULL;
     const char *boot_once = NULL;
     DisplayState *ds;
-    int cyls, heads, secs, translation;
     QemuOpts *opts, *machine_opts;
-    QemuOpts *hda_opts = NULL, *icount_opts = NULL, *accel_opts = NULL;
+    QemuOpts *icount_opts = NULL, *accel_opts = NULL;
     QemuOptsList *olist;
     int optind;
     const char *optarg;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)

     cpu_model = NULL;
     snapshot = 0;
-    cyls = heads = secs = 0;
-    translation = BIOS_ATA_TRANSLATION_AUTO;

     nb_nics = 0;

@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
         if (optind >= argc)
             break;
         if (argv[optind][0] != '-') {
-            hda_opts = drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
+            drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
         } else {
             const QEMUOption *popt;

@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
                 cpu_model = optarg;
                 break;
             case QEMU_OPTION_hda:
-                {
-                    char buf[256];
-                    if (cyls == 0)
-                        snprintf(buf, sizeof(buf), "%s", HD_OPTS);
-                    else
-                        snprintf(buf, sizeof(buf),
-                                 "%s,cyls=%d,heads=%d,secs=%d%s",
-                                 HD_OPTS , cyls, heads, secs,
-                                 translation == BIOS_ATA_TRANSLATION_LBA ?
-                                 ",trans=lba" :
-                                 translation == BIOS_ATA_TRANSLATION_NONE ?
-                                 ",trans=none" : "");
-                    drive_add(IF_DEFAULT, 0, optarg, buf);
-                    break;
-                }
             case QEMU_OPTION_hdb:
             case QEMU_OPTION_hdc:
             case QEMU_OPTION_hdd:
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
             case QEMU_OPTION_snapshot:
                 snapshot = 1;
                 break;
-            case QEMU_OPTION_hdachs:
-                {
-                    const char *p;
-                    p = optarg;
-                    cyls = strtol(p, (char **)&p, 0);
-                    if (cyls < 1 || cyls > 16383)
-                        goto chs_fail;
-                    if (*p != ',')
-                        goto chs_fail;
-                    p++;
-                    heads = strtol(p, (char **)&p, 0);
-                    if (heads < 1 || heads > 16)
-                        goto chs_fail;
-                    if (*p != ',')
-                        goto chs_fail;
-                    p++;
-                    secs = strtol(p, (char **)&p, 0);
-                    if (secs < 1 || secs > 63)
-                        goto chs_fail;
-                    if (*p == ',') {
-                        p++;
-                        if (!strcmp(p, "large")) {
-                            translation = BIOS_ATA_TRANSLATION_LARGE;
99
- } else if (!strcmp(p, "rechs")) {
100
- translation = BIOS_ATA_TRANSLATION_RECHS;
101
- } else if (!strcmp(p, "none")) {
102
- translation = BIOS_ATA_TRANSLATION_NONE;
103
- } else if (!strcmp(p, "lba")) {
104
- translation = BIOS_ATA_TRANSLATION_LBA;
105
- } else if (!strcmp(p, "auto")) {
106
- translation = BIOS_ATA_TRANSLATION_AUTO;
107
- } else {
108
- goto chs_fail;
109
- }
110
- } else if (*p != '\0') {
111
- chs_fail:
112
- error_report("invalid physical CHS format");
113
- exit(1);
114
- }
115
- if (hda_opts != NULL) {
116
- qemu_opt_set_number(hda_opts, "cyls", cyls,
117
- &error_abort);
118
- qemu_opt_set_number(hda_opts, "heads", heads,
119
- &error_abort);
120
- qemu_opt_set_number(hda_opts, "secs", secs,
121
- &error_abort);
122
- if (translation == BIOS_ATA_TRANSLATION_LARGE) {
123
- qemu_opt_set(hda_opts, "trans", "large",
124
- &error_abort);
125
- } else if (translation == BIOS_ATA_TRANSLATION_RECHS) {
126
- qemu_opt_set(hda_opts, "trans", "rechs",
127
- &error_abort);
128
- } else if (translation == BIOS_ATA_TRANSLATION_LBA) {
129
- qemu_opt_set(hda_opts, "trans", "lba",
130
- &error_abort);
131
- } else if (translation == BIOS_ATA_TRANSLATION_NONE) {
132
- qemu_opt_set(hda_opts, "trans", "none",
133
- &error_abort);
134
- }
135
- }
136
- }
137
- error_report("'-hdachs' is deprecated, please use '-device"
138
- " ide-hd,cyls=c,heads=h,secs=s,...' instead");
139
- break;
140
case QEMU_OPTION_numa:
141
opts = qemu_opts_parse_noisily(qemu_find_opts("numa"),
142
optarg, true);
143
diff --git a/qemu-doc.texi b/qemu-doc.texi
144
index XXXXXXX..XXXXXXX 100644
145
--- a/qemu-doc.texi
146
+++ b/qemu-doc.texi
147
@@ -XXX,XX +XXX,XX @@ The ``--net dump'' argument is now replaced with the
148
``-object filter-dump'' argument which works in combination
149
with the modern ``-netdev`` backends instead.
150
151
-@subsection -hdachs (since 2.10.0)
152
-
153
-The ``-hdachs'' argument is now a synonym for setting
154
-the ``cyls'', ``heads'', ``secs'', and ``trans'' properties
155
-on the ``ide-hd'' device using the ``-device'' argument.
156
-The new syntax allows different settings to be provided
157
-per disk.
158
-
159
@subsection -usbdevice (since 2.10.0)
160
161
The ``-usbdevice DEV'' argument is now a synonym for setting
162
diff --git a/qemu-options.hx b/qemu-options.hx
163
index XXXXXXX..XXXXXXX 100644
164
--- a/qemu-options.hx
165
+++ b/qemu-options.hx
166
@@ -XXX,XX +XXX,XX @@ of available connectors of a given interface type.
167
@item media=@var{media}
168
This option defines the type of the media: disk or cdrom.
169
@item cyls=@var{c},heads=@var{h},secs=@var{s}[,trans=@var{t}]
170
-These options have the same definition as they have in @option{-hdachs}.
171
-These parameters are deprecated, use the corresponding parameters
172
+Force disk physical geometry and the optional BIOS translation (trans=none or
173
+lba). These parameters are deprecated, use the corresponding parameters
174
of @code{-device} instead.
175
@item snapshot=@var{snapshot}
176
@var{snapshot} is "on" or "off" and controls snapshot mode for the given drive
177
@@ -XXX,XX +XXX,XX @@ the raw disk image you use is not written back. You can however force
178
the write back by pressing @key{C-a s} (@pxref{disk_images}).
179
ETEXI
180
181
-DEF("hdachs", HAS_ARG, QEMU_OPTION_hdachs, \
182
- "-hdachs c,h,s[,t]\n" \
183
- " force hard disk 0 physical geometry and the optional BIOS\n" \
184
- " translation (t=none or lba) (usually QEMU can guess them)\n",
185
- QEMU_ARCH_ALL)
186
-STEXI
187
-@item -hdachs @var{c},@var{h},@var{s},[,@var{t}]
188
-@findex -hdachs
189
-Force hard disk 0 physical geometry (1 <= @var{c} <= 16383, 1 <=
190
-@var{h} <= 16, 1 <= @var{s} <= 63) and optionally force the BIOS
191
-translation mode (@var{t}=none, lba or auto). Usually QEMU can guess
192
-all those parameters. This option is deprecated, please use
193
-@code{-device ide-hd,cyls=c,heads=h,secs=s,...} instead.
194
-ETEXI
195
-
196
DEF("fsdev", HAS_ARG, QEMU_OPTION_fsdev,
197
"-fsdev fsdriver,id=id[,path=path,][security_model={mapped-xattr|mapped-file|passthrough|none}]\n"
198
" [,writeout=immediate][,readonly][,socket=socket|sock_fd=sock_fd][,fmode=fmode][,dmode=dmode]\n"
46
--
199
--
47
2.13.6
200
2.13.6
48
201
49
202
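For reference, the removed QEMU_OPTION_hda handler above assembled a -drive option string from the legacy geometry via snprintf(). The same formatting logic can be sketched as follows; hd_drive_opts() is an invented name, and HD_OPTS is assumed to expand to "media=disk" as elsewhere in vl.c:

```python
# Sketch of the option string the removed QEMU_OPTION_hda code built.
# hd_drive_opts() is an invented helper; HD_OPTS is assumed to be
# "media=disk" as defined in vl.c.
HD_OPTS = "media=disk"

def hd_drive_opts(cyls=0, heads=0, secs=0, trans=None):
    """Render legacy -hda geometry into -drive option syntax."""
    if cyls == 0:
        # No geometry given: plain HD_OPTS, as in the cyls == 0 branch.
        return HD_OPTS
    opts = "%s,cyls=%d,heads=%d,secs=%d" % (HD_OPTS, cyls, heads, secs)
    # Only the "lba" and "none" translations were rendered explicitly;
    # the other modes fell through without a trans= suffix.
    if trans in ("lba", "none"):
        opts += ",trans=%s" % trans
    return opts

print(hd_drive_opts(16383, 16, 63, "lba"))
```

The modern replacement sets the same properties on the device instead, e.g. -device ide-hd,cyls=c,heads=h,secs=s,..., as the deprecation message in the diff points out.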
diff view generated by jsdifflib
1
From: Max Reitz <mreitz@redhat.com>
1
From: Thomas Huth <thuth@redhat.com>
2
2
3
COR across nodes (that is, you have some filter node between the
3
Looks like we forgot to announce the deprecation of these options in
4
actual COR target and the node that performs the COR) cannot reliably
4
the corresponding chapter of the qemu-doc text, so let's do that now.
5
work together with the permission system when there is no explicit COR
6
node that can request the WRITE_UNCHANGED permission for its child.
7
This is because COR (currently) sneaks its requests by the usual
8
permission checks, so it can work without a WRITE* permission; but if
9
there is a filter node in between, that will re-issue the request, which
10
then passes through the usual check -- and if nobody has requested a
11
WRITE_UNCHANGED permission, that check will fail.
12
5
13
There is no real direct fix apart from hoping that there is someone who
6
Signed-off-by: Thomas Huth <thuth@redhat.com>
14
has requested that permission; in case of just the qemu-io HMP command
7
Reviewed-by: John Snow <jsnow@redhat.com>
15
(and no guest device), however, that is not the case. The real fix
8
Reviewed-by: Markus Armbruster <armbru@redhat.com>
16
is to implement the copy-on-read flag through an implicitly added COR
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
17
node. Such a node can request the necessary permissions as shown in
10
---
18
this test.
11
qemu-doc.texi | 15 +++++++++++++++
12
1 file changed, 15 insertions(+)
19
13
20
Signed-off-by: Max Reitz <mreitz@redhat.com>
14
diff --git a/qemu-doc.texi b/qemu-doc.texi
21
Message-id: 20180421132929.21610-10-mreitz@redhat.com
15
index XXXXXXX..XXXXXXX 100644
22
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
16
--- a/qemu-doc.texi
23
Signed-off-by: Max Reitz <mreitz@redhat.com>
17
+++ b/qemu-doc.texi
24
---
18
@@ -XXX,XX +XXX,XX @@ longer be directly supported in QEMU.
25
tests/qemu-iotests/216 | 115 +++++++++++++++++++++++++++++++++++++++++++++
19
The ``-drive if=scsi'' argument is replaced by the
26
tests/qemu-iotests/216.out | 28 +++++++++++
20
``-device BUS-TYPE'' argument combined with ``-drive if=none''.
27
tests/qemu-iotests/group | 1 +
21
28
3 files changed, 144 insertions(+)
22
+@subsection -drive cyls=...,heads=...,secs=...,trans=... (since 2.10.0)
29
create mode 100755 tests/qemu-iotests/216
30
create mode 100644 tests/qemu-iotests/216.out
31
32
diff --git a/tests/qemu-iotests/216 b/tests/qemu-iotests/216
33
new file mode 100755
34
index XXXXXXX..XXXXXXX
35
--- /dev/null
36
+++ b/tests/qemu-iotests/216
37
@@ -XXX,XX +XXX,XX @@
38
+#!/usr/bin/env python
39
+#
40
+# Copy-on-read tests using a COR filter node
41
+#
42
+# Copyright (C) 2018 Red Hat, Inc.
43
+#
44
+# This program is free software; you can redistribute it and/or modify
45
+# it under the terms of the GNU General Public License as published by
46
+# the Free Software Foundation; either version 2 of the License, or
47
+# (at your option) any later version.
48
+#
49
+# This program is distributed in the hope that it will be useful,
50
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
51
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
52
+# GNU General Public License for more details.
53
+#
54
+# You should have received a copy of the GNU General Public License
55
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
56
+#
57
+# Creator/Owner: Max Reitz <mreitz@redhat.com>
58
+
23
+
59
+import iotests
24
+The drive geometry arguments are replaced by the geometry arguments
60
+from iotests import log, qemu_img_pipe, qemu_io, filter_qemu_io
25
+that can be specified with the ``-device'' parameter.
61
+
26
+
62
+# Need backing file support
27
+@subsection -drive serial=... (since 2.10.0)
63
+iotests.verify_image_format(supported_fmts=['qcow2', 'qcow', 'qed', 'vmdk'])
64
+iotests.verify_platform(['linux'])
65
+
28
+
66
+log('')
29
+The drive serial argument is replaced by the serial argument
67
+log('=== Copy-on-read across nodes ===')
30
+that can be specified with the ``-device'' parameter.
68
+log('')
69
+
31
+
70
+# The old copy-on-read mechanism without a filter node cannot request
32
+@subsection -drive addr=... (since 2.10.0)
71
+# WRITE_UNCHANGED permissions for its child. Therefore it just tries
72
+# to sneak its write by the usual permission system and holds its
73
+# fingers crossed. However, that sneaking does not work so well when
74
+# there is a filter node in the way: That will receive the write
75
+# request and re-issue a new one to its child, which this time is a
76
+# proper write request that will make the permission system cough --
77
+# unless there is someone at the top (like a guest device) that has
78
+# requested write permissions.
79
+#
80
+# A COR filter node, however, can request the proper permissions for
81
+# its child and therefore is not hit by this issue.
82
+
33
+
83
+with iotests.FilePath('base.img') as base_img_path, \
34
+The drive addr argument is replaced by the addr argument
84
+ iotests.FilePath('top.img') as top_img_path, \
35
+that can be specified with the ``-device'' parameter.
85
+ iotests.VM() as vm:
86
+
36
+
87
+ log('--- Setting up images ---')
37
@subsection -net dump (since 2.10.0)
88
+ log('')
38
89
+
39
The ``--net dump'' argument is now replaced with the
90
+ qemu_img_pipe('create', '-f', iotests.imgfmt, base_img_path, '64M')
91
+
92
+ log(filter_qemu_io(qemu_io(base_img_path, '-c', 'write -P 1 0M 1M')))
93
+
94
+ qemu_img_pipe('create', '-f', iotests.imgfmt, '-b', base_img_path,
95
+ top_img_path)
96
+
97
+ log(filter_qemu_io(qemu_io(top_img_path, '-c', 'write -P 2 1M 1M')))
98
+
99
+ log('')
100
+ log('--- Doing COR ---')
101
+ log('')
102
+
103
+ # Compare with e.g. the following:
104
+ # vm.add_drive_raw('if=none,node-name=node0,copy-on-read=on,driver=raw,' \
105
+ # 'file.driver=%s,file.file.filename=%s' %
106
+ # (iotests.imgfmt, top_img_path))
107
+ # (Remove the blockdev-add instead.)
108
+ # ((Not tested here because it hits an assertion in the permission
109
+ # system.))
110
+
111
+ vm.launch()
112
+
113
+ log(vm.qmp('blockdev-add',
114
+ node_name='node0',
115
+ driver='copy-on-read',
116
+ file={
117
+ 'driver': 'raw',
118
+ 'file': {
119
+ 'driver': 'copy-on-read',
120
+ 'file': {
121
+ 'driver': 'raw',
122
+ 'file': {
123
+ 'driver': iotests.imgfmt,
124
+ 'file': {
125
+ 'driver': 'file',
126
+ 'filename': top_img_path
127
+ },
128
+ 'backing': {
129
+ 'driver': iotests.imgfmt,
130
+ 'file': {
131
+ 'driver': 'file',
132
+ 'filename': base_img_path
133
+ }
134
+ }
135
+ }
136
+ }
137
+ }
138
+ }))
139
+
140
+ # Trigger COR
141
+ log(vm.qmp('human-monitor-command',
142
+ command_line='qemu-io node0 "read 0 64M"'))
143
+
144
+ vm.shutdown()
145
+
146
+ log('')
147
+ log('--- Checking COR result ---')
148
+ log('')
149
+
150
+ log(filter_qemu_io(qemu_io(base_img_path, '-c', 'discard 0 64M')))
151
+ log(filter_qemu_io(qemu_io(top_img_path, '-c', 'read -P 1 0M 1M')))
152
+ log(filter_qemu_io(qemu_io(top_img_path, '-c', 'read -P 2 1M 1M')))
153
diff --git a/tests/qemu-iotests/216.out b/tests/qemu-iotests/216.out
154
new file mode 100644
155
index XXXXXXX..XXXXXXX
156
--- /dev/null
157
+++ b/tests/qemu-iotests/216.out
158
@@ -XXX,XX +XXX,XX @@
159
+
160
+=== Copy-on-read across nodes ===
161
+
162
+--- Setting up images ---
163
+
164
+wrote 1048576/1048576 bytes at offset 0
165
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
166
+
167
+wrote 1048576/1048576 bytes at offset 1048576
168
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
169
+
170
+
171
+--- Doing COR ---
172
+
173
+{u'return': {}}
174
+{u'return': u''}
175
+
176
+--- Checking COR result ---
177
+
178
+discard 67108864/67108864 bytes at offset 0
179
+64 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
180
+
181
+read 1048576/1048576 bytes at offset 0
182
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
183
+
184
+read 1048576/1048576 bytes at offset 1048576
185
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
186
+
187
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
188
index XXXXXXX..XXXXXXX 100644
189
--- a/tests/qemu-iotests/group
190
+++ b/tests/qemu-iotests/group
191
@@ -XXX,XX +XXX,XX @@
192
213 rw auto quick
193
214 rw auto
194
215 rw auto quick
195
+216 rw auto quick
196
218 rw auto quick
197
--
40
--
198
2.13.6
41
2.13.6
199
42
200
43
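The deeply nested blockdev-add arguments that test 216 writes out by hand can also be generated programmatically. This sketch (cor_graph() is a made-up helper; the filenames are placeholders) reproduces the two stacked copy-on-read/raw layers over the format node and its backing file:

```python
# Builds the nested blockdev-add options from test 216: two
# copy-on-read/raw layers stacked on a format node with a backing file.
# cor_graph() is an invented helper; filenames are placeholders.
def cor_graph(fmt, top, base):
    node = {
        'driver': fmt,
        'file': {'driver': 'file', 'filename': top},
        'backing': {
            'driver': fmt,
            'file': {'driver': 'file', 'filename': base},
        },
    }
    # Wrap twice in raw + copy-on-read, innermost layer first.
    for _ in range(2):
        node = {'driver': 'copy-on-read',
                'file': {'driver': 'raw', 'file': node}}
    return node

graph = cor_graph('qcow2', 'top.img', 'base.img')
print(graph['driver'])
```

Such a dict can be passed to vm.qmp('blockdev-add', node_name='node0', **graph) in the same way the test does with its literal arguments.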
1
From: Eric Blake <eblake@redhat.com>
1
From: Fam Zheng <famz@redhat.com>
2
2
3
We are gradually moving away from sector-based interfaces, towards
3
Signed-off-by: Fam Zheng <famz@redhat.com>
4
byte-based. Now that all drivers with aio callbacks are using the
5
byte-based interfaces, we can remove the sector-based versions.
6
7
Signed-off-by: Eric Blake <eblake@redhat.com>
8
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
4
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
---
5
---
10
include/block/block_int.h | 6 ----
6
include/block/block_int.h | 1 -
11
block/io.c | 84 ++++++++++++++++++++---------------------------
7
block/io.c | 18 ------------------
12
2 files changed, 36 insertions(+), 54 deletions(-)
8
2 files changed, 19 deletions(-)
13
9
14
diff --git a/include/block/block_int.h b/include/block/block_int.h
10
diff --git a/include/block/block_int.h b/include/block/block_int.h
15
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
16
--- a/include/block/block_int.h
12
--- a/include/block/block_int.h
17
+++ b/include/block/block_int.h
13
+++ b/include/block/block_int.h
18
@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
14
@@ -XXX,XX +XXX,XX @@ bool blk_dev_is_tray_open(BlockBackend *blk);
19
void (*bdrv_refresh_filename)(BlockDriverState *bs, QDict *options);
15
bool blk_dev_is_medium_locked(BlockBackend *blk);
20
16
21
/* aio */
17
void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes);
22
- BlockAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs,
18
-bool bdrv_requests_pending(BlockDriverState *bs);
23
- int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
19
24
- BlockCompletionFunc *cb, void *opaque);
20
void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
25
BlockAIOCB *(*bdrv_aio_preadv)(BlockDriverState *bs,
21
void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);
26
uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags,
27
BlockCompletionFunc *cb, void *opaque);
28
- BlockAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs,
29
- int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
30
- BlockCompletionFunc *cb, void *opaque);
31
BlockAIOCB *(*bdrv_aio_pwritev)(BlockDriverState *bs,
32
uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags,
33
BlockCompletionFunc *cb, void *opaque);
34
diff --git a/block/io.c b/block/io.c
22
diff --git a/block/io.c b/block/io.c
35
index XXXXXXX..XXXXXXX 100644
23
index XXXXXXX..XXXXXXX 100644
36
--- a/block/io.c
24
--- a/block/io.c
37
+++ b/block/io.c
25
+++ b/block/io.c
38
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
26
@@ -XXX,XX +XXX,XX @@ void bdrv_disable_copy_on_read(BlockDriverState *bs)
39
return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
27
assert(old >= 1);
40
}
28
}
41
29
42
- /* FIXME - no need to calculate these if .bdrv_aio_preadv exists */
30
-/* Check if any requests are in-flight (including throttled requests) */
43
- sector_num = offset >> BDRV_SECTOR_BITS;
31
-bool bdrv_requests_pending(BlockDriverState *bs)
44
- nb_sectors = bytes >> BDRV_SECTOR_BITS;
32
-{
33
- BdrvChild *child;
45
-
34
-
46
- if (!drv->bdrv_aio_preadv) {
35
- if (atomic_read(&bs->in_flight)) {
47
- assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
36
- return true;
48
- assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
49
- assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
50
- }
37
- }
51
-
38
-
52
- if (drv->bdrv_co_readv) {
39
- QLIST_FOREACH(child, &bs->children, next) {
53
- return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
40
- if (bdrv_requests_pending(child->bs)) {
54
- } else {
41
- return true;
55
+ if (drv->bdrv_aio_preadv) {
56
BlockAIOCB *acb;
57
CoroutineIOCompletion co = {
58
.coroutine = qemu_coroutine_self(),
59
};
60
61
- if (drv->bdrv_aio_preadv) {
62
- acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
63
- bdrv_co_io_em_complete, &co);
64
- } else {
65
- acb = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
66
- bdrv_co_io_em_complete, &co);
67
- }
42
- }
68
+ acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
69
+ bdrv_co_io_em_complete, &co);
70
if (acb == NULL) {
71
return -EIO;
72
} else {
73
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
74
return co.ret;
75
}
76
}
77
+
78
+ sector_num = offset >> BDRV_SECTOR_BITS;
79
+ nb_sectors = bytes >> BDRV_SECTOR_BITS;
80
+
81
+ assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
82
+ assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
83
+ assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
84
+ assert(drv->bdrv_co_readv);
85
+
86
+ return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
87
}
88
89
static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
90
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
91
goto emulate_flags;
92
}
93
94
- /* FIXME - no need to calculate these if .bdrv_aio_pwritev exists */
95
- sector_num = offset >> BDRV_SECTOR_BITS;
96
- nb_sectors = bytes >> BDRV_SECTOR_BITS;
97
-
98
- if (!drv->bdrv_aio_pwritev) {
99
- assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
100
- assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
101
- assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
102
- }
43
- }
103
-
44
-
104
- if (drv->bdrv_co_writev_flags) {
45
- return false;
105
- ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
46
-}
106
- flags & bs->supported_write_flags);
47
-
107
- flags &= ~bs->supported_write_flags;
48
typedef struct {
108
- } else if (drv->bdrv_co_writev) {
49
Coroutine *co;
109
- assert(!bs->supported_write_flags);
50
BlockDriverState *bs;
110
- ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
111
- } else {
112
+ if (drv->bdrv_aio_pwritev) {
113
BlockAIOCB *acb;
114
CoroutineIOCompletion co = {
115
.coroutine = qemu_coroutine_self(),
116
};
117
118
- if (drv->bdrv_aio_pwritev) {
119
- acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
120
- flags & bs->supported_write_flags,
121
- bdrv_co_io_em_complete, &co);
122
- flags &= ~bs->supported_write_flags;
123
- } else {
124
- assert(!bs->supported_write_flags);
125
- acb = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
126
- bdrv_co_io_em_complete, &co);
127
- }
128
+ acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
129
+ flags & bs->supported_write_flags,
130
+ bdrv_co_io_em_complete, &co);
131
+ flags &= ~bs->supported_write_flags;
132
if (acb == NULL) {
133
ret = -EIO;
134
} else {
135
qemu_coroutine_yield();
136
ret = co.ret;
137
}
138
+ goto emulate_flags;
139
+ }
140
+
141
+ sector_num = offset >> BDRV_SECTOR_BITS;
142
+ nb_sectors = bytes >> BDRV_SECTOR_BITS;
143
+
144
+ assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
145
+ assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
146
+ assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
147
+
148
+ if (drv->bdrv_co_writev_flags) {
149
+ ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
150
+ flags & bs->supported_write_flags);
151
+ flags &= ~bs->supported_write_flags;
152
+ } else {
153
+ assert(drv->bdrv_co_writev);
154
+ assert(!bs->supported_write_flags);
155
+ ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
156
}
157
158
emulate_flags:
159
--
51
--
160
2.13.6
52
2.13.6
161
53
162
54
1
From: Max Reitz <mreitz@redhat.com>
1
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2
2
Reviewed-by: Fam Zheng <famz@redhat.com>
3
Signed-off-by: Max Reitz <mreitz@redhat.com>
4
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
5
Reviewed-by: Alberto Garcia <berto@igalia.com>
6
Message-id: 20180421132929.21610-5-mreitz@redhat.com
7
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
8
Signed-off-by: Max Reitz <mreitz@redhat.com>
9
---
3
---
10
block/io.c | 6 ++++--
4
block/io.c | 6 ++++++
11
1 file changed, 4 insertions(+), 2 deletions(-)
5
1 file changed, 6 insertions(+)
12
6
13
diff --git a/block/io.c b/block/io.c
7
diff --git a/block/io.c b/block/io.c
14
index XXXXXXX..XXXXXXX 100644
8
index XXXXXXX..XXXXXXX 100644
15
--- a/block/io.c
9
--- a/block/io.c
16
+++ b/block/io.c
10
+++ b/block/io.c
17
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
11
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
18
/* FIXME: Should we (perhaps conditionally) be setting
12
BdrvNextIterator it;
19
* BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
13
GSList *aio_ctxs = NULL, *ctx;
20
* that still correctly reads as zero? */
14
21
- ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 0);
15
+ /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
22
+ ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
16
+ * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
23
+ BDRV_REQ_WRITE_UNCHANGED);
17
+ * nodes in several different AioContexts, so make sure we're in the main
24
} else {
18
+ * context. */
25
/* This does not change the data on the disk, it is not
19
+ assert(qemu_get_current_aio_context() == qemu_get_aio_context());
26
* necessary to flush even in cache=writethrough mode.
20
+
27
*/
21
block_job_pause_all();
28
ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
22
29
- &local_qiov, 0);
23
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
30
+ &local_qiov,
31
+ BDRV_REQ_WRITE_UNCHANGED);
32
}
33
34
if (ret < 0) {
35
--
24
--
36
2.13.6
25
2.13.6
37
26
38
27
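The BDRV_REQ_WRITE_UNCHANGED flag passed down here ends up in bdrv_driver_pwritev(), which masks request flags against the node's supported_write_flags: the driver callback only sees the flags it advertises, while the rest are handled generically (FUA is emulated with a flush; an unsupported WRITE_UNCHANGED is simply issued as a normal write). A rough illustration, with flag values that are invented rather than QEMU's actual ones:

```python
# Flag splitting as in bdrv_driver_pwritev(): the driver only receives
# flags it supports; the remainder stays set for generic handling.
# These flag values are illustrative, not QEMU's real constants.
BDRV_REQ_FUA = 1 << 0
BDRV_REQ_WRITE_UNCHANGED = 1 << 1

def split_flags(flags, supported):
    passed = flags & supported   # forwarded to the driver callback
    flags &= ~supported          # remaining flags handled generically
    return passed, flags

passed, rest = split_flags(BDRV_REQ_FUA | BDRV_REQ_WRITE_UNCHANGED,
                           BDRV_REQ_FUA)
print(passed, rest)
```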
1
All block job drivers support .set_speed and all of them duplicate the
1
bdrv_drained_begin() doesn't increase bs->quiesce_counter recursively
2
same code to implement it. Move that code to blockjob.c and remove the
2
and also doesn't notify other parent nodes of children, which both means
3
now useless callback.
3
that the child nodes are not actually drained, and bdrv_drained_begin()
4
is providing useful functionality only on a single node.
5
6
To keep things consistent, we also shouldn't call the block driver
7
callbacks recursively.
8
9
A proper recursive drain version that provides an actually working
10
drained section for child nodes will be introduced later.
4
11
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
6
Reviewed-by: Eric Blake <eblake@redhat.com>
13
Reviewed-by: Fam Zheng <famz@redhat.com>
7
Reviewed-by: Max Reitz <mreitz@redhat.com>
8
Reviewed-by: John Snow <jsnow@redhat.com>
9
---
14
---
10
include/block/blockjob.h | 2 ++
15
block/io.c | 16 +++++++++-------
11
include/block/blockjob_int.h | 3 ---
16
1 file changed, 9 insertions(+), 7 deletions(-)
12
block/backup.c | 13 -------------
13
block/commit.c | 14 --------------
14
block/mirror.c | 26 ++++++--------------------
15
block/stream.c | 14 --------------
16
blockjob.c | 12 ++++--------
17
7 files changed, 12 insertions(+), 72 deletions(-)
18
17
19
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
18
diff --git a/block/io.c b/block/io.c
20
index XXXXXXX..XXXXXXX 100644
19
index XXXXXXX..XXXXXXX 100644
21
--- a/include/block/blockjob.h
20
--- a/block/io.c
22
+++ b/include/block/blockjob.h
21
+++ b/block/io.c
23
@@ -XXX,XX +XXX,XX @@
22
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
24
#include "block/block.h"
25
#include "qemu/ratelimit.h"
26
27
+#define BLOCK_JOB_SLICE_TIME 100000000ULL /* ns */
28
+
29
typedef struct BlockJobDriver BlockJobDriver;
30
typedef struct BlockJobTxn BlockJobTxn;
31
32
diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
33
index XXXXXXX..XXXXXXX 100644
34
--- a/include/block/blockjob_int.h
35
+++ b/include/block/blockjob_int.h
36
@@ -XXX,XX +XXX,XX @@ struct BlockJobDriver {
37
/** String describing the operation, part of query-block-jobs QMP API */
38
BlockJobType job_type;
39
40
- /** Optional callback for job types that support setting a speed limit */
41
- void (*set_speed)(BlockJob *job, int64_t speed, Error **errp);
42
-
43
/** Mandatory: Entrypoint for the Coroutine. */
44
CoroutineEntry *start;
45
46
diff --git a/block/backup.c b/block/backup.c
47
index XXXXXXX..XXXXXXX 100644
48
--- a/block/backup.c
49
+++ b/block/backup.c
50
@@ -XXX,XX +XXX,XX @@
51
#include "qemu/error-report.h"
52
53
#define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)
54
-#define SLICE_TIME 100000000ULL /* ns */
55
56
typedef struct BackupBlockJob {
57
BlockJob common;
58
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn backup_before_write_notify(
59
return backup_do_cow(job, req->offset, req->bytes, NULL, true);
60
}
23
}
61
24
62
-static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
25
/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
63
-{
26
-static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
64
- BackupBlockJob *s = container_of(job, BackupBlockJob, common);
27
+static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive)
65
-
66
- if (speed < 0) {
67
- error_setg(errp, QERR_INVALID_PARAMETER, "speed");
68
- return;
69
- }
70
- ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
71
-}
72
-
73
static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
74
{
28
{
75
BdrvDirtyBitmap *bm;
29
BdrvChild *child, *tmp;
76
@@ -XXX,XX +XXX,XX @@ static const BlockJobDriver backup_job_driver = {
30
BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
77
.instance_size = sizeof(BackupBlockJob),
31
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
78
.job_type = BLOCK_JOB_TYPE_BACKUP,
32
bdrv_coroutine_enter(bs, data.co);
79
.start = backup_run,
33
BDRV_POLL_WHILE(bs, !data.done);
80
- .set_speed = backup_set_speed,
34
81
.commit = backup_commit,
35
- QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
82
.abort = backup_abort,
36
- bdrv_drain_invoke(child->bs, begin);
83
.clean = backup_clean,
37
+ if (recursive) {
84
diff --git a/block/commit.c b/block/commit.c
38
+ QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
85
index XXXXXXX..XXXXXXX 100644
39
+ bdrv_drain_invoke(child->bs, begin, true);
86
--- a/block/commit.c
40
+ }
87
+++ b/block/commit.c
41
}
88
@@ -XXX,XX +XXX,XX @@ enum {
89
COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
90
};
91
92
-#define SLICE_TIME 100000000ULL /* ns */
93
-
94
typedef struct CommitBlockJob {
95
BlockJob common;
96
BlockDriverState *commit_top_bs;
97
@@ -XXX,XX +XXX,XX @@ out:
98
block_job_defer_to_main_loop(&s->common, commit_complete, data);
99
}
42
}
100
43
101
-static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
44
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
102
-{
45
bdrv_parent_drained_begin(bs);
103
- CommitBlockJob *s = container_of(job, CommitBlockJob, common);
46
}
104
-
47
105
- if (speed < 0) {
48
- bdrv_drain_invoke(bs, true);
106
- error_setg(errp, QERR_INVALID_PARAMETER, "speed");
49
+ bdrv_drain_invoke(bs, true, false);
107
- return;
50
bdrv_drain_recurse(bs);
108
- }
109
- ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
110
-}
111
-
112
static const BlockJobDriver commit_job_driver = {
113
.instance_size = sizeof(CommitBlockJob),
114
.job_type = BLOCK_JOB_TYPE_COMMIT,
115
- .set_speed = commit_set_speed,
116
.start = commit_run,
117
};
118
119
diff --git a/block/mirror.c b/block/mirror.c
120
index XXXXXXX..XXXXXXX 100644
121
--- a/block/mirror.c
122
+++ b/block/mirror.c
123
@@ -XXX,XX +XXX,XX @@
124
#include "qemu/ratelimit.h"
125
#include "qemu/bitmap.h"
126
127
-#define SLICE_TIME 100000000ULL /* ns */
128
#define MAX_IN_FLIGHT 16
129
#define MAX_IO_BYTES (1 << 20) /* 1 Mb */
130
#define DEFAULT_MIRROR_BUF_SIZE (MAX_IN_FLIGHT * MAX_IO_BYTES)
131
@@ -XXX,XX +XXX,XX @@ static void mirror_throttle(MirrorBlockJob *s)
132
{
133
int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
134
135
- if (now - s->last_pause_ns > SLICE_TIME) {
136
+ if (now - s->last_pause_ns > BLOCK_JOB_SLICE_TIME) {
137
s->last_pause_ns = now;
138
block_job_sleep_ns(&s->common, 0);
139
} else {
140
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_run(void *opaque)
141
142
/* Note that even when no rate limit is applied we need to yield
143
* periodically with no pending I/O so that bdrv_drain_all() returns.
144
- * We do so every SLICE_TIME nanoseconds, or when there is an error,
145
- * or when the source is clean, whichever comes first.
146
- */
147
+ * We do so every BLOCK_JOB_SLICE_TIME nanoseconds, or when there is
148
+ * an error, or when the source is clean, whichever comes first. */
149
delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
150
- if (delta < SLICE_TIME &&
151
+ if (delta < BLOCK_JOB_SLICE_TIME &&
152
s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
153
if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
154
(cnt == 0 && s->in_flight > 0)) {
155
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_run(void *opaque)
156
ret = 0;
157
158
if (s->synced && !should_complete) {
159
- delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
160
+ delay_ns = (s->in_flight == 0 &&
161
+ cnt == 0 ? BLOCK_JOB_SLICE_TIME : 0);
162
}
163
trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
164
block_job_sleep_ns(&s->common, delay_ns);
165
@@ -XXX,XX +XXX,XX @@ immediate_exit:
166
block_job_defer_to_main_loop(&s->common, mirror_exit, data);
167
}
51
}
168
52
169
-static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
53
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
170
-{
54
}
171
- MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
55
172
-
56
/* Re-enable things in child-to-parent order */
173
- if (speed < 0) {
57
- bdrv_drain_invoke(bs, false);
174
- error_setg(errp, QERR_INVALID_PARAMETER, "speed");
58
+ bdrv_drain_invoke(bs, false, false);
175
- return;
59
bdrv_parent_drained_end(bs);
176
- }
60
aio_enable_external(bdrv_get_aio_context(bs));
177
- ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
178
-}
179
-
180
static void mirror_complete(BlockJob *job, Error **errp)
181
{
182
MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
183
@@ -XXX,XX +XXX,XX @@ static void mirror_drain(BlockJob *job)
184
static const BlockJobDriver mirror_job_driver = {
185
.instance_size = sizeof(MirrorBlockJob),
186
.job_type = BLOCK_JOB_TYPE_MIRROR,
187
- .set_speed = mirror_set_speed,
188
.start = mirror_run,
189
.complete = mirror_complete,
190
.pause = mirror_pause,
191
@@ -XXX,XX +XXX,XX @@ static const BlockJobDriver mirror_job_driver = {
192
static const BlockJobDriver commit_active_job_driver = {
193
.instance_size = sizeof(MirrorBlockJob),
194
.job_type = BLOCK_JOB_TYPE_COMMIT,
195
- .set_speed = mirror_set_speed,
196
.start = mirror_run,
197
.complete = mirror_complete,
198
.pause = mirror_pause,
199
diff --git a/block/stream.c b/block/stream.c
200
index XXXXXXX..XXXXXXX 100644
201
--- a/block/stream.c
202
+++ b/block/stream.c
203
@@ -XXX,XX +XXX,XX @@ enum {
204
STREAM_BUFFER_SIZE = 512 * 1024, /* in bytes */
205
};
206
207
-#define SLICE_TIME 100000000ULL /* ns */
208
-
209
typedef struct StreamBlockJob {
210
BlockJob common;
211
BlockDriverState *base;
212
@@ -XXX,XX +XXX,XX @@ out:
213
block_job_defer_to_main_loop(&s->common, stream_complete, data);
214
}
61
}
215
62
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
216
-static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)
63
aio_context_acquire(aio_context);
217
-{
64
aio_disable_external(aio_context);
218
- StreamBlockJob *s = container_of(job, StreamBlockJob, common);
65
bdrv_parent_drained_begin(bs);
219
-
66
- bdrv_drain_invoke(bs, true);
220
- if (speed < 0) {
67
+ bdrv_drain_invoke(bs, true, true);
221
- error_setg(errp, QERR_INVALID_PARAMETER, "speed");
68
aio_context_release(aio_context);
222
- return;
69
223
- }
70
if (!g_slist_find(aio_ctxs, aio_context)) {
224
- ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
71
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
225
-}
72
226
-
73
/* Re-enable things in child-to-parent order */
227
static const BlockJobDriver stream_job_driver = {
74
aio_context_acquire(aio_context);
228
.instance_size = sizeof(StreamBlockJob),
75
- bdrv_drain_invoke(bs, false);
229
.job_type = BLOCK_JOB_TYPE_STREAM,
76
+ bdrv_drain_invoke(bs, false, true);
230
- .set_speed = stream_set_speed,
77
bdrv_parent_drained_end(bs);
231
.start = stream_run,
78
aio_enable_external(aio_context);
232
};
79
aio_context_release(aio_context);
233
234
diff --git a/blockjob.c b/blockjob.c
235
index XXXXXXX..XXXXXXX 100644
236
--- a/blockjob.c
237
+++ b/blockjob.c
238
@@ -XXX,XX +XXX,XX @@ static bool block_job_timer_pending(BlockJob *job)
239
240
void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
241
{
242
- Error *local_err = NULL;
243
int64_t old_speed = job->speed;
244
245
- if (!job->driver->set_speed) {
246
- error_setg(errp, QERR_UNSUPPORTED);
247
- return;
248
- }
249
if (block_job_apply_verb(job, BLOCK_JOB_VERB_SET_SPEED, errp)) {
250
return;
251
}
252
- job->driver->set_speed(job, speed, &local_err);
253
- if (local_err) {
254
- error_propagate(errp, local_err);
255
+ if (speed < 0) {
256
+ error_setg(errp, QERR_INVALID_PARAMETER, "speed");
257
return;
258
}
259
260
+ ratelimit_set_speed(&job->limit, speed, BLOCK_JOB_SLICE_TIME);
261
+
262
job->speed = speed;
263
if (speed && speed <= old_speed) {
264
return;
265
--
2.13.6
diff view generated by jsdifflib
1
From: Eric Blake <eblake@redhat.com>
1
The existing test is for bdrv_drain_all_begin/end() only. Generalise the
2
test case so that it can be run for the other variants as well. At the
3
moment this is only bdrv_drain_begin/end(), but in a while, we'll add
4
another one.
2
5
3
We are gradually moving away from sector-based interfaces, towards
6
Also, add a backing file to the test node to test whether the operations
4
byte-based. Make the change for the last few sector-based callbacks
7
work recursively.
5
in the rbd driver.
6
8
7
Note that the driver was already using byte-based calls for
8
performing actual I/O, so this just gets rid of a round trip
9
of scaling; however, as I don't know if RBD is tolerant of
10
non-sector AIO operations, I went with the conservative approach
11
of adding .bdrv_refresh_limits to override the block layer
12
defaults back to the pre-patch value of 512.
13
14
Signed-off-by: Eric Blake <eblake@redhat.com>
15
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
16
---
10
---
17
block/rbd.c | 40 ++++++++++++++++++++++------------------
11
tests/test-bdrv-drain.c | 69 ++++++++++++++++++++++++++++++++++++++++++++-----
18
1 file changed, 22 insertions(+), 18 deletions(-)
12
1 file changed, 62 insertions(+), 7 deletions(-)
19
13
20
diff --git a/block/rbd.c b/block/rbd.c
14
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
21
index XXXXXXX..XXXXXXX 100644
15
index XXXXXXX..XXXXXXX 100644
22
--- a/block/rbd.c
16
--- a/tests/test-bdrv-drain.c
23
+++ b/block/rbd.c
17
+++ b/tests/test-bdrv-drain.c
24
@@ -XXX,XX +XXX,XX @@ done:
18
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_test = {
19
20
.bdrv_co_drain_begin = bdrv_test_co_drain_begin,
21
.bdrv_co_drain_end = bdrv_test_co_drain_end,
22
+
23
+ .bdrv_child_perm = bdrv_format_default_perms,
24
};
25
26
static void aio_ret_cb(void *opaque, int ret)
27
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
28
*aio_ret = ret;
25
}
29
}
26
30
27
31
-static void test_drv_cb_drain_all(void)
28
+static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
32
+enum drain_type {
33
+ BDRV_DRAIN_ALL,
34
+ BDRV_DRAIN,
35
+};
36
+
37
+static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
29
+{
38
+{
30
+ /* XXX Does RBD support AIO on less than 512-byte alignment? */
39
+ switch (drain_type) {
31
+ bs->bl.request_alignment = 512;
40
+ case BDRV_DRAIN_ALL: bdrv_drain_all_begin(); break;
41
+ case BDRV_DRAIN: bdrv_drained_begin(bs); break;
42
+ default: g_assert_not_reached();
43
+ }
32
+}
44
+}
33
+
45
+
46
+static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
47
+{
48
+ switch (drain_type) {
49
+ case BDRV_DRAIN_ALL: bdrv_drain_all_end(); break;
50
+ case BDRV_DRAIN: bdrv_drained_end(bs); break;
51
+ default: g_assert_not_reached();
52
+ }
53
+}
34
+
54
+
35
static int qemu_rbd_set_auth(rados_t cluster, const char *secretid,
55
+static void test_drv_cb_common(enum drain_type drain_type, bool recursive)
36
Error **errp)
37
{
56
{
38
@@ -XXX,XX +XXX,XX @@ failed:
57
BlockBackend *blk;
39
return NULL;
58
- BlockDriverState *bs;
59
- BDRVTestState *s;
60
+ BlockDriverState *bs, *backing;
61
+ BDRVTestState *s, *backing_s;
62
BlockAIOCB *acb;
63
int aio_ret;
64
65
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void)
66
s = bs->opaque;
67
blk_insert_bs(blk, bs, &error_abort);
68
69
+ backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
70
+ backing_s = backing->opaque;
71
+ bdrv_set_backing_hd(bs, backing, &error_abort);
72
+
73
/* Simple bdrv_drain_all_begin/end pair, check that CBs are called */
74
g_assert_cmpint(s->drain_count, ==, 0);
75
- bdrv_drain_all_begin();
76
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
77
+
78
+ do_drain_begin(drain_type, bs);
79
+
80
g_assert_cmpint(s->drain_count, ==, 1);
81
- bdrv_drain_all_end();
82
+ g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
83
+
84
+ do_drain_end(drain_type, bs);
85
+
86
g_assert_cmpint(s->drain_count, ==, 0);
87
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
88
89
/* Now do the same while a request is pending */
90
aio_ret = -EINPROGRESS;
91
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void)
92
g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
93
94
g_assert_cmpint(s->drain_count, ==, 0);
95
- bdrv_drain_all_begin();
96
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
97
+
98
+ do_drain_begin(drain_type, bs);
99
+
100
g_assert_cmpint(aio_ret, ==, 0);
101
g_assert_cmpint(s->drain_count, ==, 1);
102
- bdrv_drain_all_end();
103
+ g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
104
+
105
+ do_drain_end(drain_type, bs);
106
+
107
g_assert_cmpint(s->drain_count, ==, 0);
108
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
109
110
+ bdrv_unref(backing);
111
bdrv_unref(bs);
112
blk_unref(blk);
40
}
113
}
41
114
42
-static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
115
+static void test_drv_cb_drain_all(void)
43
- int64_t sector_num,
116
+{
44
- QEMUIOVector *qiov,
117
+ test_drv_cb_common(BDRV_DRAIN_ALL, true);
45
- int nb_sectors,
118
+}
46
- BlockCompletionFunc *cb,
119
+
47
- void *opaque)
120
+static void test_drv_cb_drain(void)
48
+static BlockAIOCB *qemu_rbd_aio_preadv(BlockDriverState *bs,
121
+{
49
+ uint64_t offset, uint64_t bytes,
122
+ test_drv_cb_common(BDRV_DRAIN, false);
50
+ QEMUIOVector *qiov, int flags,
123
+}
51
+ BlockCompletionFunc *cb,
124
+
52
+ void *opaque)
125
int main(int argc, char **argv)
53
{
126
{
54
- return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov,
127
bdrv_init();
55
- (int64_t) nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
128
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
56
+ return rbd_start_aio(bs, offset, qiov, bytes, cb, opaque,
129
g_test_init(&argc, &argv, NULL);
57
RBD_AIO_READ);
130
131
g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
132
+ g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
133
134
return g_test_run();
58
}
135
}
59
60
-static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
61
- int64_t sector_num,
62
- QEMUIOVector *qiov,
63
- int nb_sectors,
64
- BlockCompletionFunc *cb,
65
- void *opaque)
66
+static BlockAIOCB *qemu_rbd_aio_pwritev(BlockDriverState *bs,
67
+ uint64_t offset, uint64_t bytes,
68
+ QEMUIOVector *qiov, int flags,
69
+ BlockCompletionFunc *cb,
70
+ void *opaque)
71
{
72
- return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov,
73
- (int64_t) nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
74
+ return rbd_start_aio(bs, offset, qiov, bytes, cb, opaque,
75
RBD_AIO_WRITE);
76
}
77
78
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_rbd = {
79
.format_name = "rbd",
80
.instance_size = sizeof(BDRVRBDState),
81
.bdrv_parse_filename = qemu_rbd_parse_filename,
82
+ .bdrv_refresh_limits = qemu_rbd_refresh_limits,
83
.bdrv_file_open = qemu_rbd_open,
84
.bdrv_close = qemu_rbd_close,
85
.bdrv_reopen_prepare = qemu_rbd_reopen_prepare,
86
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_rbd = {
87
.bdrv_truncate = qemu_rbd_truncate,
88
.protocol_name = "rbd",
89
90
- .bdrv_aio_readv = qemu_rbd_aio_readv,
91
- .bdrv_aio_writev = qemu_rbd_aio_writev,
92
+ .bdrv_aio_preadv = qemu_rbd_aio_preadv,
93
+ .bdrv_aio_pwritev = qemu_rbd_aio_pwritev,
94
95
#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
96
.bdrv_aio_flush = qemu_rbd_aio_flush,
97
--
2.13.6
1
From: Max Reitz <mreitz@redhat.com>
1
This is currently only working correctly for bdrv_drain(), not for
2
bdrv_drain_all(). Leave a comment for the drain_all case, we'll address
3
it later.
2
4
3
iotest 197 tests copy-on-read using the (now old) copy-on-read flag.
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
4
Copy it to 215 and modify it to use the COR filter driver instead.
6
---
7
tests/test-bdrv-drain.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
8
1 file changed, 45 insertions(+)
5
9
6
Signed-off-by: Max Reitz <mreitz@redhat.com>
10
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
7
Message-id: 20180421132929.21610-9-mreitz@redhat.com
11
index XXXXXXX..XXXXXXX 100644
8
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
12
--- a/tests/test-bdrv-drain.c
9
Signed-off-by: Max Reitz <mreitz@redhat.com>
13
+++ b/tests/test-bdrv-drain.c
10
---
14
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain(void)
11
tests/qemu-iotests/215 | 120 +++++++++++++++++++++++++++++++++++++++++++++
15
test_drv_cb_common(BDRV_DRAIN, false);
12
tests/qemu-iotests/215.out | 26 ++++++++++
16
}
13
tests/qemu-iotests/group | 1 +
17
14
3 files changed, 147 insertions(+)
18
+static void test_quiesce_common(enum drain_type drain_type, bool recursive)
15
create mode 100755 tests/qemu-iotests/215
19
+{
16
create mode 100644 tests/qemu-iotests/215.out
20
+ BlockBackend *blk;
17
21
+ BlockDriverState *bs, *backing;
18
diff --git a/tests/qemu-iotests/215 b/tests/qemu-iotests/215
19
new file mode 100755
20
index XXXXXXX..XXXXXXX
21
--- /dev/null
22
+++ b/tests/qemu-iotests/215
23
@@ -XXX,XX +XXX,XX @@
24
+#!/bin/bash
25
+#
26
+# Test case for copy-on-read into qcow2, using the COR filter driver
27
+#
28
+# Copyright (C) 2018 Red Hat, Inc.
29
+#
30
+# This program is free software; you can redistribute it and/or modify
31
+# it under the terms of the GNU General Public License as published by
32
+# the Free Software Foundation; either version 2 of the License, or
33
+# (at your option) any later version.
34
+#
35
+# This program is distributed in the hope that it will be useful,
36
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
37
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
38
+# GNU General Public License for more details.
39
+#
40
+# You should have received a copy of the GNU General Public License
41
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
42
+#
43
+
22
+
44
+seq="$(basename $0)"
23
+ blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
45
+echo "QA output created by $seq"
24
+ bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
25
+ &error_abort);
26
+ blk_insert_bs(blk, bs, &error_abort);
46
+
27
+
47
+here="$PWD"
28
+ backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
48
+status=1 # failure is the default!
29
+ bdrv_set_backing_hd(bs, backing, &error_abort);
49
+
30
+
50
+# get standard environment, filters and checks
31
+ g_assert_cmpint(bs->quiesce_counter, ==, 0);
51
+. ./common.rc
32
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
52
+. ./common.filter
53
+
33
+
54
+TEST_WRAP="$TEST_DIR/t.wrap.qcow2"
34
+ do_drain_begin(drain_type, bs);
55
+BLKDBG_CONF="$TEST_DIR/blkdebug.conf"
56
+
35
+
57
+# Sanity check: our use of blkdebug fails if $TEST_DIR contains spaces
36
+ g_assert_cmpint(bs->quiesce_counter, ==, 1);
58
+# or other problems
37
+ g_assert_cmpint(backing->quiesce_counter, ==, !!recursive);
59
+case "$TEST_DIR" in
60
+ *[^-_a-zA-Z0-9/]*)
61
+ _notrun "Suspicious TEST_DIR='$TEST_DIR', cowardly refusing to run" ;;
62
+esac
63
+
38
+
64
+_cleanup()
39
+ do_drain_end(drain_type, bs);
40
+
41
+ g_assert_cmpint(bs->quiesce_counter, ==, 0);
42
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
43
+
44
+ bdrv_unref(backing);
45
+ bdrv_unref(bs);
46
+ blk_unref(blk);
47
+}
48
+
49
+static void test_quiesce_drain_all(void)
65
+{
50
+{
66
+ _cleanup_test_img
51
+ // XXX drain_all doesn't quiesce
67
+ rm -f "$TEST_WRAP"
52
+ //test_quiesce_common(BDRV_DRAIN_ALL, true);
68
+ rm -f "$BLKDBG_CONF"
69
+}
53
+}
70
+trap "_cleanup; exit \$status" 0 1 2 3 15
71
+
54
+
72
+# Test is supported for any backing file; but we force qcow2 for our wrapper.
55
+static void test_quiesce_drain(void)
73
+_supported_fmt generic
56
+{
74
+_supported_proto generic
57
+ test_quiesce_common(BDRV_DRAIN, false);
75
+_supported_os Linux
58
+}
76
+# LUKS support may be possible, but it complicates things.
77
+_unsupported_fmt luks
78
+
59
+
79
+echo
60
int main(int argc, char **argv)
80
+echo '=== Copy-on-read ==='
61
{
81
+echo
62
bdrv_init();
63
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
64
g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
65
g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
66
67
+ g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
68
+ g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
82
+
69
+
83
+# Prep the images
70
return g_test_run();
84
+# VPC rounds image sizes to a specific geometry, force a specific size.
71
}
85
+if [ "$IMGFMT" = "vpc" ]; then
86
+ IMGOPTS=$(_optstr_add "$IMGOPTS" "force_size")
87
+fi
88
+_make_test_img 4G
89
+$QEMU_IO -c "write -P 55 3G 1k" "$TEST_IMG" | _filter_qemu_io
90
+IMGPROTO=file IMGFMT=qcow2 IMGOPTS= TEST_IMG_FILE="$TEST_WRAP" \
91
+ _make_test_img -F "$IMGFMT" -b "$TEST_IMG" | _filter_img_create
92
+$QEMU_IO -f qcow2 -c "write -z -u 1M 64k" "$TEST_WRAP" | _filter_qemu_io
93
+
94
+# Ensure that a read of two clusters, but where one is already allocated,
95
+# does not re-write the allocated cluster
96
+cat > "$BLKDBG_CONF" <<EOF
97
+[inject-error]
98
+event = "cor_write"
99
+sector = "2048"
100
+EOF
101
+$QEMU_IO -c "open \
102
+ -o driver=copy-on-read,file.driver=blkdebug,file.config=$BLKDBG_CONF,file.image.driver=qcow2 $TEST_WRAP" \
103
+ -c "read -P 0 1M 128k" | _filter_qemu_io
104
+
105
+# Read the areas we want copied. A zero-length read should still be a
106
+# no-op. The next read is under 2G, but aligned so that rounding to
107
+# clusters copies more than 2G of zeroes. The final read will pick up
108
+# the non-zero data in the same cluster. Since a 2G read may exhaust
109
+# memory on some machines (particularly 32-bit), we skip the test if
110
+# that fails due to memory pressure.
111
+$QEMU_IO \
112
+ -c "open -o driver=copy-on-read,file.driver=qcow2 $TEST_WRAP" \
113
+ -c "read 0 0" \
114
+ | _filter_qemu_io
115
+output=$($QEMU_IO \
116
+ -c "open -o driver=copy-on-read,file.driver=qcow2 $TEST_WRAP" \
117
+ -c "read -P 0 1k $((2*1024*1024*1024 - 512))" \
118
+ 2>&1 | _filter_qemu_io)
119
+case $output in
120
+ *allocate*)
121
+ _notrun "Insufficient memory to run test" ;;
122
+ *) printf '%s\n' "$output" ;;
123
+esac
124
+$QEMU_IO \
125
+ -c "open -o driver=copy-on-read,file.driver=qcow2 $TEST_WRAP" \
126
+ -c "read -P 0 $((3*1024*1024*1024 + 1024)) 1k" \
127
+ | _filter_qemu_io
128
+
129
+# Copy-on-read is incompatible with read-only
130
+$QEMU_IO \
131
+ -c "open -r -o driver=copy-on-read,file.driver=qcow2 $TEST_WRAP" \
132
+ 2>&1 | _filter_testdir
133
+
134
+# Break the backing chain, and show that images are identical, and that
135
+# we properly copied over explicit zeros.
136
+$QEMU_IMG rebase -u -b "" -f qcow2 "$TEST_WRAP"
137
+$QEMU_IO -f qcow2 -c map "$TEST_WRAP"
138
+_check_test_img
139
+$QEMU_IMG compare -f $IMGFMT -F qcow2 "$TEST_IMG" "$TEST_WRAP"
140
+
141
+# success, all done
142
+echo '*** done'
143
+status=0
144
diff --git a/tests/qemu-iotests/215.out b/tests/qemu-iotests/215.out
145
new file mode 100644
146
index XXXXXXX..XXXXXXX
147
--- /dev/null
148
+++ b/tests/qemu-iotests/215.out
149
@@ -XXX,XX +XXX,XX @@
150
+QA output created by 215
151
+
152
+=== Copy-on-read ===
153
+
154
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=4294967296
155
+wrote 1024/1024 bytes at offset 3221225472
156
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
157
+Formatting 'TEST_DIR/t.wrap.IMGFMT', fmt=IMGFMT size=4294967296 backing_file=TEST_DIR/t.IMGFMT backing_fmt=IMGFMT
158
+wrote 65536/65536 bytes at offset 1048576
159
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
160
+read 131072/131072 bytes at offset 1048576
161
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
162
+read 0/0 bytes at offset 0
163
+0 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
164
+read 2147483136/2147483136 bytes at offset 1024
165
+2 GiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
166
+read 1024/1024 bytes at offset 3221226496
167
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
168
+can't open device TEST_DIR/t.wrap.qcow2: Block node is read-only
169
+2 GiB (0x80010000) bytes allocated at offset 0 bytes (0x0)
170
+1023.938 MiB (0x3fff0000) bytes not allocated at offset 2 GiB (0x80010000)
171
+64 KiB (0x10000) bytes allocated at offset 3 GiB (0xc0000000)
172
+1023.938 MiB (0x3fff0000) bytes not allocated at offset 3 GiB (0xc0010000)
173
+No errors were found on the image.
174
+Images are identical.
175
+*** done
176
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
177
index XXXXXXX..XXXXXXX 100644
178
--- a/tests/qemu-iotests/group
179
+++ b/tests/qemu-iotests/group
180
@@ -XXX,XX +XXX,XX @@
181
212 rw auto quick
182
213 rw auto quick
183
214 rw auto
184
+215 rw auto quick
185
218 rw auto quick
186
--
2.13.6
1
The backup block job directly accesses the driver field in BlockJob. Add
1
Block jobs already paused themselves when their main BlockBackend
2
a wrapper for getting it.
2
entered a drained section. This is not good enough: We also want to
3
pause a block job and may not submit new requests if, for example, the
4
mirror target node should be drained.
5
6
This implements .drained_begin/end callbacks in child_job in order to
7
consider all block nodes related to the job, and removes the
8
BlockBackend callbacks which are unnecessary now because the root of the
9
job's main BlockBackend is always referenced with a child_job, too.
3
10
4
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
11
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Reviewed-by: Eric Blake <eblake@redhat.com>
6
Reviewed-by: Max Reitz <mreitz@redhat.com>
7
Reviewed-by: John Snow <jsnow@redhat.com>
8
---
12
---
9
include/block/blockjob.h | 7 +++++++
13
blockjob.c | 22 +++++++++-------------
10
block/backup.c | 8 +++++---
14
1 file changed, 9 insertions(+), 13 deletions(-)
11
blockjob.c | 5 +++++
12
3 files changed, 17 insertions(+), 3 deletions(-)
13
15
14
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
15
index XXXXXXX..XXXXXXX 100644
16
--- a/include/block/blockjob.h
17
+++ b/include/block/blockjob.h
18
@@ -XXX,XX +XXX,XX @@ void block_job_txn_add_job(BlockJobTxn *txn, BlockJob *job);
19
*/
20
bool block_job_is_internal(BlockJob *job);
21
22
+/**
23
+ * block_job_driver:
24
+ *
25
+ * Returns the driver associated with a block job.
26
+ */
27
+const BlockJobDriver *block_job_driver(BlockJob *job);
28
+
29
#endif
30
diff --git a/block/backup.c b/block/backup.c
31
index XXXXXXX..XXXXXXX 100644
32
--- a/block/backup.c
33
+++ b/block/backup.c
34
@@ -XXX,XX +XXX,XX @@ typedef struct BackupBlockJob {
35
HBitmap *copy_bitmap;
36
} BackupBlockJob;
37
38
+static const BlockJobDriver backup_job_driver;
39
+
40
/* See if in-flight requests overlap and wait for them to complete */
41
static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
42
int64_t start,
43
@@ -XXX,XX +XXX,XX @@ void backup_do_checkpoint(BlockJob *job, Error **errp)
44
BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
45
int64_t len;
46
47
- assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);
48
+ assert(block_job_driver(job) == &backup_job_driver);
49
50
if (backup_job->sync_mode != MIRROR_SYNC_MODE_NONE) {
51
error_setg(errp, "The backup job only supports block checkpoint in"
52
@@ -XXX,XX +XXX,XX @@ void backup_wait_for_overlapping_requests(BlockJob *job, int64_t offset,
53
BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
54
int64_t start, end;
55
56
- assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);
57
+ assert(block_job_driver(job) == &backup_job_driver);
58
59
start = QEMU_ALIGN_DOWN(offset, backup_job->cluster_size);
60
end = QEMU_ALIGN_UP(offset + bytes, backup_job->cluster_size);
61
@@ -XXX,XX +XXX,XX @@ void backup_cow_request_begin(CowRequest *req, BlockJob *job,
62
BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
63
int64_t start, end;
64
65
- assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);
66
+ assert(block_job_driver(job) == &backup_job_driver);
67
68
start = QEMU_ALIGN_DOWN(offset, backup_job->cluster_size);
69
end = QEMU_ALIGN_UP(offset + bytes, backup_job->cluster_size);
70
diff --git a/blockjob.c b/blockjob.c
16
diff --git a/blockjob.c b/blockjob.c
71
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
72
--- a/blockjob.c
18
--- a/blockjob.c
73
+++ b/blockjob.c
19
+++ b/blockjob.c
74
@@ -XXX,XX +XXX,XX @@ static bool block_job_started(BlockJob *job)
20
@@ -XXX,XX +XXX,XX @@ static char *child_job_get_parent_desc(BdrvChild *c)
75
return job->co;
21
job->id);
76
}
22
}
77
23
78
+const BlockJobDriver *block_job_driver(BlockJob *job)
24
-static const BdrvChildRole child_job = {
79
+{
25
- .get_parent_desc = child_job_get_parent_desc,
80
+ return job->driver;
26
- .stay_at_node = true,
81
+}
27
-};
82
+
28
-
83
/**
29
-static void block_job_drained_begin(void *opaque)
84
* All jobs must allow a pause point before entering their job proper. This
30
+static void child_job_drained_begin(BdrvChild *c)
85
* ensures that jobs can be paused prior to being started, then resumed later.
31
{
32
- BlockJob *job = opaque;
33
+ BlockJob *job = c->opaque;
34
block_job_pause(job);
35
}
36
37
-static void block_job_drained_end(void *opaque)
38
+static void child_job_drained_end(BdrvChild *c)
39
{
40
- BlockJob *job = opaque;
41
+ BlockJob *job = c->opaque;
42
block_job_resume(job);
43
}
44
45
-static const BlockDevOps block_job_dev_ops = {
46
- .drained_begin = block_job_drained_begin,
47
- .drained_end = block_job_drained_end,
48
+static const BdrvChildRole child_job = {
49
+ .get_parent_desc = child_job_get_parent_desc,
50
+ .drained_begin = child_job_drained_begin,
51
+ .drained_end = child_job_drained_end,
52
+ .stay_at_node = true,
53
};
54
55
void block_job_remove_all_bdrv(BlockJob *job)
56
@@ -XXX,XX +XXX,XX @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
57
block_job_add_bdrv(job, "main node", bs, 0, BLK_PERM_ALL, &error_abort);
58
bs->job = job;
59
60
- blk_set_dev_ops(blk, &block_job_dev_ops, job);
61
bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);
62
63
QLIST_INSERT_HEAD(&block_jobs, job, job_list);
86
--
64
--
87
2.13.6
65
2.13.6
88
66
89
67
1
From: Max Reitz <mreitz@redhat.com>
1
Block jobs must be paused if any of the involved nodes are drained.
2
2
3
Some block drivers (iscsi and file-posix when dealing with device files)
3
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
4
do not actually support truncation, even though they provide a
4
---
5
.bdrv_truncate() method and will happily return success when providing a
5
tests/test-bdrv-drain.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++
6
new size that does not exceed the current size. This is because these
6
1 file changed, 121 insertions(+)
7
drivers expect the user to resize the image outside of qemu and then
8
provide qemu with that information through the block_resize command
9
(compare cb1b83e740384b4e0d950f3d7c81c02b8ce86c2e).
10
7
11
Of course, anyone using qemu-img resize will find that behavior useless.
8
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
12
So we should check the actual size of the image after the supposedly
13
successful truncation took place, emit an error if nothing changed and
14
emit a warning if the target size was not met.
15
16
Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1523065
17
Signed-off-by: Max Reitz <mreitz@redhat.com>
18
Message-id: 20180421163957.29872-1-mreitz@redhat.com
19
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
20
Signed-off-by: Max Reitz <mreitz@redhat.com>
21
---
22
qemu-img.c | 39 +++++++++++++++++++++++++++++++++++----
23
1 file changed, 35 insertions(+), 4 deletions(-)
24
25
diff --git a/qemu-img.c b/qemu-img.c
26
index XXXXXXX..XXXXXXX 100644
9
index XXXXXXX..XXXXXXX 100644
27
--- a/qemu-img.c
10
--- a/tests/test-bdrv-drain.c
28
+++ b/qemu-img.c
11
+++ b/tests/test-bdrv-drain.c
29
@@ -XXX,XX +XXX,XX @@ static int img_resize(int argc, char **argv)
12
@@ -XXX,XX +XXX,XX @@
30
Error *err = NULL;
13
31
int c, ret, relative;
14
#include "qemu/osdep.h"
32
const char *filename, *fmt, *size;
15
#include "block/block.h"
33
- int64_t n, total_size, current_size;
16
+#include "block/blockjob_int.h"
34
+ int64_t n, total_size, current_size, new_size;
17
#include "sysemu/block-backend.h"
35
bool quiet = false;
18
#include "qapi/error.h"
36
BlockBackend *blk = NULL;
19
37
PreallocMode prealloc = PREALLOC_MODE_OFF;
20
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
38
@@ -XXX,XX +XXX,XX @@ static int img_resize(int argc, char **argv)
21
test_quiesce_common(BDRV_DRAIN, false);
39
}
22
}
40
23
41
ret = blk_truncate(blk, total_size, prealloc, &err);
24
+
42
- if (!ret) {
25
+typedef struct TestBlockJob {
43
- qprintf(quiet, "Image resized.\n");
26
+ BlockJob common;
44
- } else {
27
+ bool should_complete;
45
+ if (ret < 0) {
28
+} TestBlockJob;
46
error_report_err(err);
29
+
47
+ goto out;
30
+static void test_job_completed(BlockJob *job, void *opaque)
31
+{
32
+ block_job_completed(job, 0);
33
+}
34
+
35
+static void coroutine_fn test_job_start(void *opaque)
36
+{
37
+ TestBlockJob *s = opaque;
38
+
39
+ while (!s->should_complete) {
40
+ block_job_sleep_ns(&s->common, 100000);
48
+ }
41
+ }
49
+
42
+
50
+ new_size = blk_getlength(blk);
43
+ block_job_defer_to_main_loop(&s->common, test_job_completed, NULL);
51
+ if (new_size < 0) {
44
+}
52
+ error_report("Failed to verify truncated image length: %s",
53
+ strerror(-new_size));
54
+ ret = -1;
55
+ goto out;
56
}
57
+
45
+
58
+ /* Some block drivers implement a truncation method, but only so
46
+static void test_job_complete(BlockJob *job, Error **errp)
59
+ * the user can cause qemu to refresh the image's size from disk.
47
+{
60
+     * The idea is that the user resizes the image outside of qemu and
+     * then invokes block_resize to inform qemu about it.
+     * (This includes iscsi and file-posix for device files.)
+     * Of course, that is not the behavior someone invoking
+     * qemu-img resize would find useful, so we catch that behavior
+     * here and tell the user. */
+    if (new_size != total_size && new_size == current_size) {
+        error_report("Image was not resized; resizing may not be supported "
+                     "for this image");
+        ret = -1;
+        goto out;
+    }
+
+    if (new_size != total_size) {
+        warn_report("Image should have been resized to %" PRIi64
+                    " bytes, but was resized to %" PRIi64 " bytes",
+                    total_size, new_size);
+    }
+
+    qprintf(quiet, "Image resized.\n");
+
 out:
     blk_unref(blk);
     if (ret) {
--
2.13.6

+    TestBlockJob *s = container_of(job, TestBlockJob, common);
+    s->should_complete = true;
+}
+
+BlockJobDriver test_job_driver = {
+    .instance_size  = sizeof(TestBlockJob),
+    .start          = test_job_start,
+    .complete       = test_job_complete,
+};
+
+static void test_blockjob_common(enum drain_type drain_type)
+{
+    BlockBackend *blk_src, *blk_target;
+    BlockDriverState *src, *target;
+    BlockJob *job;
+    int ret;
+
+    src = bdrv_new_open_driver(&bdrv_test, "source", BDRV_O_RDWR,
+                               &error_abort);
+    blk_src = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_insert_bs(blk_src, src, &error_abort);
+
+    target = bdrv_new_open_driver(&bdrv_test, "target", BDRV_O_RDWR,
+                                  &error_abort);
+    blk_target = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_insert_bs(blk_target, target, &error_abort);
+
+    job = block_job_create("job0", &test_job_driver, src, 0, BLK_PERM_ALL, 0,
+                           0, NULL, NULL, &error_abort);
+    block_job_add_bdrv(job, "target", target, 0, BLK_PERM_ALL, &error_abort);
+    block_job_start(job);
+
+    g_assert_cmpint(job->pause_count, ==, 0);
+    g_assert_false(job->paused);
+    g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
+
+    do_drain_begin(drain_type, src);
+
+    if (drain_type == BDRV_DRAIN_ALL) {
+        /* bdrv_drain_all() drains both src and target, and involves an
+         * additional block_job_pause_all() */
+        g_assert_cmpint(job->pause_count, ==, 3);
+    } else {
+        g_assert_cmpint(job->pause_count, ==, 1);
+    }
+    /* XXX We don't wait until the job is actually paused. Is this okay? */
+    /* g_assert_true(job->paused); */
+    g_assert_false(job->busy); /* The job is paused */
+
+    do_drain_end(drain_type, src);
+
+    g_assert_cmpint(job->pause_count, ==, 0);
+    g_assert_false(job->paused);
+    g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
+
+    do_drain_begin(drain_type, target);
+
+    if (drain_type == BDRV_DRAIN_ALL) {
+        /* bdrv_drain_all() drains both src and target, and involves an
+         * additional block_job_pause_all() */
+        g_assert_cmpint(job->pause_count, ==, 3);
+    } else {
+        g_assert_cmpint(job->pause_count, ==, 1);
+    }
+    /* XXX We don't wait until the job is actually paused. Is this okay? */
+    /* g_assert_true(job->paused); */
+    g_assert_false(job->busy); /* The job is paused */
+
+    do_drain_end(drain_type, target);
+
+    g_assert_cmpint(job->pause_count, ==, 0);
+    g_assert_false(job->paused);
+    g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
+
+    ret = block_job_complete_sync(job, &error_abort);
+    g_assert_cmpint(ret, ==, 0);
+
+    blk_unref(blk_src);
+    blk_unref(blk_target);
+    bdrv_unref(src);
+    bdrv_unref(target);
+}
+
+static void test_blockjob_drain_all(void)
+{
+    test_blockjob_common(BDRV_DRAIN_ALL);
+}
+
+static void test_blockjob_drain(void)
+{
+    test_blockjob_common(BDRV_DRAIN);
+}
+
 int main(int argc, char **argv)
 {
     bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
 
+    g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
+    g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
+
     return g_test_run();
 }
--
2.13.6
diff view generated by jsdifflib
From: Max Reitz <mreitz@redhat.com>

Currently, common.qemu only allows to match for results indicating
success. The only way to fail is by provoking a timeout. However,
sometimes we do have a defined failure output and can match for that,
which saves us from having to wait for the timeout in case of failure.
Because failure can sometimes just result in a _notrun in the test, it
is actually important to care about being able to fail quickly.

Also, sometimes we simply do not get any specific output in case of
success. The only way to handle this currently would be to define an
error message as the string to look for, which means that actual success
results in a timeout. This is really bad because it unnecessarily slows
down a succeeding test.

Therefore, this patch adds a new parameter $success_or_failure to
_timed_wait_for and _send_qemu_cmd. Setting this to a non-empty string
makes both commands expect two match parameters: If the first matches,
the function succeeds. If the second matches, the function fails.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180406151731.4285-2-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/common.qemu | 58 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 51 insertions(+), 7 deletions(-)

diff --git a/tests/qemu-iotests/common.qemu b/tests/qemu-iotests/common.qemu
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/common.qemu
+++ b/tests/qemu-iotests/common.qemu
@@ -XXX,XX +XXX,XX @@ _in_fd=4
 # response is not echoed out.
 # If $mismatch_only is set, only non-matching responses will
 # be echoed.
+#
+# If $success_or_failure is set, the meaning of the arguments is
+# changed as follows:
+# $2: A string to search for in the response; if found, this indicates
+#     success and ${QEMU_STATUS[$1]} is set to 0.
+# $3: A string to search for in the response; if found, this indicates
+#     failure and the test is either aborted (if $qemu_error_no_exit
+#     is not set) or ${QEMU_STATUS[$1]} is set to -1 (otherwise).
 function _timed_wait_for()
 {
     local h=${1}
     shift
 
+    if [ -z "${success_or_failure}" ]; then
+        success_match=${*}
+        failure_match=
+    else
+        success_match=${1}
+        failure_match=${2}
+    fi
+
+    timeout=yes
+
     QEMU_STATUS[$h]=0
     while IFS= read -t ${QEMU_COMM_TIMEOUT} resp <&${QEMU_OUT[$h]}
     do
@@ -XXX,XX +XXX,XX @@ function _timed_wait_for()
             echo "${resp}" | _filter_testdir | _filter_qemu \
                            | _filter_qemu_io | _filter_qmp | _filter_hmp
         fi
-        grep -q "${*}" < <(echo "${resp}")
+        if [ -n "${failure_match}" ]; then
+            grep -q "${failure_match}" < <(echo "${resp}")
+            if [ $? -eq 0 ]; then
+                timeout=
+                break
+            fi
+        fi
+        grep -q "${success_match}" < <(echo "${resp}")
         if [ $? -eq 0 ]; then
             return
-        elif [ -z "${silent}" ] && [ -n "${mismatch_only}" ]; then
+        fi
+        if [ -z "${silent}" ] && [ -n "${mismatch_only}" ]; then
             echo "${resp}" | _filter_testdir | _filter_qemu \
                            | _filter_qemu_io | _filter_qmp | _filter_hmp
         fi
@@ -XXX,XX +XXX,XX @@ function _timed_wait_for()
     done
     QEMU_STATUS[$h]=-1
     if [ -z "${qemu_error_no_exit}" ]; then
-        echo "Timeout waiting for ${*} on handle ${h}"
-        exit 1 # Timeout means the test failed
+        if [ -n "${timeout}" ]; then
+            echo "Timeout waiting for ${success_match} on handle ${h}"
+        else
+            echo "Wrong response matching ${failure_match} on handle ${h}"
+        fi
+        exit 1 # Timeout or wrong match mean the test failed
     fi
 }
 
@@ -XXX,XX +XXX,XX @@ function _timed_wait_for()
 # If $qemu_error_no_exit is set, then even if the expected response
 # is not seen, we will not exit. $QEMU_STATUS[$1] will be set it -1 in
 # that case.
+#
+# If $success_or_failure is set, then the last two strings are the
+# strings the response will be scanned for. The first of the two
+# indicates success, the latter indicates failure. Failure is handled
+# like a timeout.
 function _send_qemu_cmd()
 {
     local h=${1}
@@ -XXX,XX +XXX,XX @@ function _send_qemu_cmd()
         use_error="no"
     fi
     # This array element extraction is done to accommodate pathnames with spaces
-    cmd=${@: 1:${#@}-1}
-    shift $(($# - 1))
+    if [ -z "${success_or_failure}" ]; then
+        cmd=${@: 1:${#@}-1}
+        shift $(($# - 1))
+    else
+        cmd=${@: 1:${#@}-2}
+        shift $(($# - 2))
+    fi
 
     while [ ${count} -gt 0 ]
     do
         echo "${cmd}" >&${QEMU_IN[${h}]}
         if [ -n "${1}" ]; then
-            qemu_error_no_exit=${use_error} _timed_wait_for ${h} "${1}"
+            if [ -z "${success_or_failure}" ]; then
+                qemu_error_no_exit=${use_error} _timed_wait_for ${h} "${1}"
+            else
+                qemu_error_no_exit=${use_error} _timed_wait_for ${h} "${1}" "${2}"
+            fi
             if [ ${QEMU_STATUS[$h]} -eq 0 ]; then
                 return
             fi
--
2.13.6

Block jobs are already paused using the BdrvChildRole drain callbacks,
so we don't need an additional block_job_pause_all() call.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/io.c              |  4 ----
 tests/test-bdrv-drain.c | 10 ++++------
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
      * context. */
     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 
-    block_job_pause_all();
-
     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         aio_enable_external(aio_context);
         aio_context_release(aio_context);
     }
-
-    block_job_resume_all();
 }
 
 void bdrv_drain_all(void)
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type)
     do_drain_begin(drain_type, src);
 
     if (drain_type == BDRV_DRAIN_ALL) {
-        /* bdrv_drain_all() drains both src and target, and involves an
-         * additional block_job_pause_all() */
-        g_assert_cmpint(job->pause_count, ==, 3);
+        /* bdrv_drain_all() drains both src and target */
+        g_assert_cmpint(job->pause_count, ==, 2);
     } else {
         g_assert_cmpint(job->pause_count, ==, 1);
     }
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type)
     do_drain_begin(drain_type, target);
 
     if (drain_type == BDRV_DRAIN_ALL) {
-        /* bdrv_drain_all() drains both src and target, and involves an
-         * additional block_job_pause_all() */
-        g_assert_cmpint(job->pause_count, ==, 3);
+        /* bdrv_drain_all() drains both src and target */
+        g_assert_cmpint(job->pause_count, ==, 2);
     } else {
         g_assert_cmpint(job->pause_count, ==, 1);
     }
--
2.13.6
Every block job has a RateLimit, and they all do the exact same thing
with it, so it should be common infrastructure. Move the struct field
for a start.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
---
 include/block/blockjob.h | 4 ++++
 block/backup.c           | 5 ++---
 block/commit.c           | 5 ++---
 block/mirror.c           | 6 +++---
 block/stream.c           | 5 ++---
 5 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -XXX,XX +XXX,XX @@
 #define BLOCKJOB_H
 
 #include "block/block.h"
+#include "qemu/ratelimit.h"
 
 typedef struct BlockJobDriver BlockJobDriver;
 typedef struct BlockJobTxn BlockJobTxn;
@@ -XXX,XX +XXX,XX @@ typedef struct BlockJob {
     /** Speed that was set with @block_job_set_speed.  */
     int64_t speed;
 
+    /** Rate limiting data structure for implementing @speed. */
+    RateLimit limit;
+
     /** The completion function that will be called when the job completes.  */
     BlockCompletionFunc *cb;
 
diff --git a/block/backup.c b/block/backup.c
index XXXXXXX..XXXXXXX 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -XXX,XX +XXX,XX @@ typedef struct BackupBlockJob {
     /* bitmap for sync=incremental */
     BdrvDirtyBitmap *sync_bitmap;
     MirrorSyncMode sync_mode;
-    RateLimit limit;
     BlockdevOnError on_source_error;
     BlockdevOnError on_target_error;
     CoRwlock flush_rwlock;
@@ -XXX,XX +XXX,XX @@ static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
         error_setg(errp, QERR_INVALID_PARAMETER, "speed");
         return;
     }
-    ratelimit_set_speed(&s->limit, speed, SLICE_TIME);
+    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
 }
 
 static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
@@ -XXX,XX +XXX,XX @@ static bool coroutine_fn yield_and_check(BackupBlockJob *job)
      * (without, VM does not reboot)
      */
     if (job->common.speed) {
-        uint64_t delay_ns = ratelimit_calculate_delay(&job->limit,
+        uint64_t delay_ns = ratelimit_calculate_delay(&job->common.limit,
                                                       job->bytes_read);
         job->bytes_read = 0;
         block_job_sleep_ns(&job->common, delay_ns);
diff --git a/block/commit.c b/block/commit.c
index XXXXXXX..XXXXXXX 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -XXX,XX +XXX,XX @@ enum {
 
 typedef struct CommitBlockJob {
     BlockJob common;
-    RateLimit limit;
     BlockDriverState *commit_top_bs;
     BlockBackend *top;
     BlockBackend *base;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn commit_run(void *opaque)
         block_job_progress_update(&s->common, n);
 
         if (copy && s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->limit, n);
+            delay_ns = ratelimit_calculate_delay(&s->common.limit, n);
         } else {
             delay_ns = 0;
         }
@@ -XXX,XX +XXX,XX @@ static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
         error_setg(errp, QERR_INVALID_PARAMETER, "speed");
         return;
     }
-    ratelimit_set_speed(&s->limit, speed, SLICE_TIME);
+    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
 }
 
 static const BlockJobDriver commit_job_driver = {
diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ typedef struct MirrorBuffer {
 
 typedef struct MirrorBlockJob {
     BlockJob common;
-    RateLimit limit;
     BlockBackend *target;
     BlockDriverState *mirror_top_bs;
     BlockDriverState *source;
@@ -XXX,XX +XXX,XX @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
         offset += io_bytes;
         nb_chunks -= DIV_ROUND_UP(io_bytes, s->granularity);
         if (s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->limit, io_bytes_acct);
+            delay_ns = ratelimit_calculate_delay(&s->common.limit,
+                                                 io_bytes_acct);
         }
     }
     return delay_ns;
@@ -XXX,XX +XXX,XX @@ static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
         error_setg(errp, QERR_INVALID_PARAMETER, "speed");
         return;
     }
-    ratelimit_set_speed(&s->limit, speed, SLICE_TIME);
+    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
 }
 
 static void mirror_complete(BlockJob *job, Error **errp)
diff --git a/block/stream.c b/block/stream.c
index XXXXXXX..XXXXXXX 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -XXX,XX +XXX,XX @@ enum {
 
 typedef struct StreamBlockJob {
     BlockJob common;
-    RateLimit limit;
     BlockDriverState *base;
     BlockdevOnError on_error;
     char *backing_file_str;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
         /* Publish progress */
         block_job_progress_update(&s->common, n);
         if (copy && s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->limit, n);
+            delay_ns = ratelimit_calculate_delay(&s->common.limit, n);
         } else {
             delay_ns = 0;
         }
@@ -XXX,XX +XXX,XX @@ static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)
         error_setg(errp, QERR_INVALID_PARAMETER, "speed");
         return;
     }
-    ratelimit_set_speed(&s->limit, speed, SLICE_TIME);
+    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
 }
 
 static const BlockJobDriver stream_job_driver = {
--
2.13.6

bdrv_do_drained_begin() restricts the call of parent callbacks and
aio_disable_external() to the outermost drain section, but the block
driver callbacks are always called. bdrv_do_drained_end() must match
this behaviour, otherwise nodes stay drained even if begin/end calls
were balanced.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/io.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
 
 void bdrv_drained_end(BlockDriverState *bs)
 {
+    int old_quiesce_counter;
+
     if (qemu_in_coroutine()) {
         bdrv_co_yield_to_drain(bs, false);
         return;
     }
     assert(bs->quiesce_counter > 0);
-    if (atomic_fetch_dec(&bs->quiesce_counter) > 1) {
-        return;
-    }
+    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
 
     /* Re-enable things in child-to-parent order */
     bdrv_drain_invoke(bs, false, false);
-    bdrv_parent_drained_end(bs);
-    aio_enable_external(bdrv_get_aio_context(bs));
+    if (old_quiesce_counter == 1) {
+        bdrv_parent_drained_end(bs);
+        aio_enable_external(bdrv_get_aio_context(bs));
+    }
 }
 
 /*
--
2.13.6
From: Max Reitz <mreitz@redhat.com>

userfaultfd support depends on the host kernel, so it may not be
available. If so, 181 and 201 should be skipped.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180406151731.4285-3-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/181 | 13 +++++++++++++
 tests/qemu-iotests/201 | 13 +++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/tests/qemu-iotests/181 b/tests/qemu-iotests/181
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/181
+++ b/tests/qemu-iotests/181
@@ -XXX,XX +XXX,XX @@ echo
 # Enable postcopy-ram capability both on source and destination
 silent=yes
 _send_qemu_cmd $dest 'migrate_set_capability postcopy-ram on' "(qemu)"
+
+qemu_error_no_exit=yes success_or_failure=yes \
+    _send_qemu_cmd $dest '' "(qemu)" "Postcopy is not supported"
+if [ ${QEMU_STATUS[$dest]} -lt 0 ]; then
+    _send_qemu_cmd $dest '' "(qemu)"
+
+    _send_qemu_cmd $src 'quit' ""
+    _send_qemu_cmd $dest 'quit' ""
+    wait=1 _cleanup_qemu
+
+    _notrun 'Postcopy is not supported'
+fi
+
 _send_qemu_cmd $src 'migrate_set_speed 4k' "(qemu)"
 _send_qemu_cmd $src 'migrate_set_capability postcopy-ram on' "(qemu)"
 _send_qemu_cmd $src "migrate -d unix:${MIG_SOCKET}" "(qemu)"
diff --git a/tests/qemu-iotests/201 b/tests/qemu-iotests/201
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/201
+++ b/tests/qemu-iotests/201
@@ -XXX,XX +XXX,XX @@ echo
 
 silent=yes
 _send_qemu_cmd $dest 'migrate_set_capability postcopy-ram on' "(qemu)"
+
+qemu_error_no_exit=yes success_or_failure=yes \
+    _send_qemu_cmd $dest '' "(qemu)" "Postcopy is not supported"
+if [ ${QEMU_STATUS[$dest]} -lt 0 ]; then
+    _send_qemu_cmd $dest '' "(qemu)"
+
+    _send_qemu_cmd $src 'quit' ""
+    _send_qemu_cmd $dest 'quit' ""
+    wait=1 _cleanup_qemu
+
+    _notrun 'Postcopy is not supported'
+fi
+
 _send_qemu_cmd $src 'migrate_set_capability postcopy-ram on' "(qemu)"
 _send_qemu_cmd $src "migrate -d unix:${MIG_SOCKET}" "(qemu)"
--
2.13.6

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
 enum drain_type {
     BDRV_DRAIN_ALL,
     BDRV_DRAIN,
+    DRAIN_TYPE_MAX,
 };
 
 static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
     test_quiesce_common(BDRV_DRAIN, false);
 }
 
+static void test_nested(void)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs, *backing;
+    BDRVTestState *s, *backing_s;
+    enum drain_type outer, inner;
+
+    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
+                              &error_abort);
+    s = bs->opaque;
+    blk_insert_bs(blk, bs, &error_abort);
+
+    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+    backing_s = backing->opaque;
+    bdrv_set_backing_hd(bs, backing, &error_abort);
+
+    for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) {
+        for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) {
+            /* XXX bdrv_drain_all() doesn't increase the quiesce_counter */
+            int bs_quiesce      = (outer != BDRV_DRAIN_ALL) +
+                                  (inner != BDRV_DRAIN_ALL);
+            int backing_quiesce = 0;
+            int backing_cb_cnt  = (outer != BDRV_DRAIN) +
+                                  (inner != BDRV_DRAIN);
+
+            g_assert_cmpint(bs->quiesce_counter, ==, 0);
+            g_assert_cmpint(backing->quiesce_counter, ==, 0);
+            g_assert_cmpint(s->drain_count, ==, 0);
+            g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+            do_drain_begin(outer, bs);
+            do_drain_begin(inner, bs);
+
+            g_assert_cmpint(bs->quiesce_counter, ==, bs_quiesce);
+            g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce);
+            g_assert_cmpint(s->drain_count, ==, 2);
+            g_assert_cmpint(backing_s->drain_count, ==, backing_cb_cnt);
+
+            do_drain_end(inner, bs);
+            do_drain_end(outer, bs);
+
+            g_assert_cmpint(bs->quiesce_counter, ==, 0);
+            g_assert_cmpint(backing->quiesce_counter, ==, 0);
+            g_assert_cmpint(s->drain_count, ==, 0);
+            g_assert_cmpint(backing_s->drain_count, ==, 0);
+        }
+    }
+
+    bdrv_unref(backing);
+    bdrv_unref(bs);
+    blk_unref(blk);
+}
+
 
 typedef struct TestBlockJob {
     BlockJob common;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
 
+    g_test_add_func("/bdrv-drain/nested", test_nested);
+
     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
 
--
2.13.6
From: Max Reitz <mreitz@redhat.com>

Currently we never actually check whether the WRITE_UNCHANGED
permission has been taken for unchanging writes. But the one check that
is commented out checks both WRITE and WRITE_UNCHANGED; and considering
that WRITE_UNCHANGED is already documented as being weaker than WRITE,
we should probably explicitly document WRITE to include WRITE_UNCHANGED.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 20180421132929.21610-3-mreitz@redhat.com
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 include/block/block.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ enum {
      * This permission (which is weaker than BLK_PERM_WRITE) is both enough and
      * required for writes to the block node when the caller promises that
      * the visible disk content doesn't change.
+     *
+     * As the BLK_PERM_WRITE permission is strictly stronger, either is
+     * sufficient to perform an unchanging write.
      */
     BLK_PERM_WRITE_UNCHANGED    = 0x04,
 
--
2.13.6

This is in preparation for subtree drains, i.e. drained sections that
affect not only a single node, but recursively all child nodes, too.

Calling the parent callbacks for drain is pointless when we just came
from that parent node recursively and leads to multiple increases of
bs->quiesce_counter in a single drain call. Don't do it.

In order for this to work correctly, the parent callback must be called
for every bdrv_drain_begin/end() call, not only for the outermost one:

If we have a node N with two parents A and B, recursive draining of A
should cause the quiesce_counter of B to increase because its child N is
drained independently of B. If now B is recursively drained, too, A must
increase its quiesce_counter because N is drained independently of A
only now, even if N is going from quiesce_counter 1 to 2.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block.h |  4 ++--
 block.c               | 13 +++++++++----
 block/io.c            | 47 ++++++++++++++++++++++++++++++-------------
 3 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ void bdrv_io_unplug(BlockDriverState *bs);
  * Begin a quiesced section of all users of @bs. This is part of
  * bdrv_drained_begin.
  */
-void bdrv_parent_drained_begin(BlockDriverState *bs);
+void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore);
 
 /**
  * bdrv_parent_drained_end:
@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_begin(BlockDriverState *bs);
  * End a quiesced section of all users of @bs. This is part of
  * bdrv_drained_end.
  */
-void bdrv_parent_drained_end(BlockDriverState *bs);
+void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
 
 /**
  * bdrv_drained_begin:
diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
                                       BlockDriverState *new_bs)
 {
     BlockDriverState *old_bs = child->bs;
+    int i;
 
     if (old_bs && new_bs) {
         assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
     }
     if (old_bs) {
         if (old_bs->quiesce_counter && child->role->drained_end) {
-            child->role->drained_end(child);
+            for (i = 0; i < old_bs->quiesce_counter; i++) {
+                child->role->drained_end(child);
+            }
         }
         if (child->role->detach) {
             child->role->detach(child);
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
     if (new_bs) {
         QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
         if (new_bs->quiesce_counter && child->role->drained_begin) {
-            child->role->drained_begin(child);
+            for (i = 0; i < new_bs->quiesce_counter; i++) {
+                child->role->drained_begin(child);
+            }
         }
 
         if (child->role->attach) {
@@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
     AioContext *ctx = bdrv_get_aio_context(bs);
 
     aio_disable_external(ctx);
-    bdrv_parent_drained_begin(bs);
+    bdrv_parent_drained_begin(bs, NULL);
     bdrv_drain(bs); /* ensure there are no in-flight requests */
 
     while (aio_poll(ctx, false)) {
@@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
      */
     aio_context_acquire(new_context);
     bdrv_attach_aio_context(bs, new_context);
-    bdrv_parent_drained_end(bs);
+    bdrv_parent_drained_end(bs, NULL);
     aio_enable_external(ctx);
     aio_context_release(new_context);
 }
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
     int64_t offset, int bytes, BdrvRequestFlags flags);
 
-void bdrv_parent_drained_begin(BlockDriverState *bs)
+void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
 {
     BdrvChild *c, *next;
 
     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
+        if (c == ignore) {
+            continue;
+        }
         if (c->role->drained_begin) {
             c->role->drained_begin(c);
         }
     }
 }
 
-void bdrv_parent_drained_end(BlockDriverState *bs)
+void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
 {
     BdrvChild *c, *next;
 
     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
+        if (c == ignore) {
+            continue;
+        }
         if (c->role->drained_end) {
             c->role->drained_end(c);
         }
@@ -XXX,XX +XXX,XX @@ typedef struct {
     BlockDriverState *bs;
     bool done;
     bool begin;
+    BdrvChild *parent;
 } BdrvCoDrainData;
 
 static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
     return waited;
 }
 
+static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent);
+static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);
+
 static void bdrv_co_drain_bh_cb(void *opaque)
 {
     BdrvCoDrainData *data = opaque;
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
 
     bdrv_dec_in_flight(bs);
     if (data->begin) {
-        bdrv_drained_begin(bs);
+        bdrv_do_drained_begin(bs, data->parent);
     } else {
-        bdrv_drained_end(bs);
+        bdrv_do_drained_end(bs, data->parent);
     }
 
     data->done = true;
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
 }
 
 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
-                                                bool begin)
+                                                bool begin, BdrvChild *parent)
 {
     BdrvCoDrainData data;
 
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
         .bs = bs,
         .done = false,
         .begin = begin,
+        .parent = parent,
     };
     bdrv_inc_in_flight(bs);
     aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
     assert(data.done);
 }
 
-void bdrv_drained_begin(BlockDriverState *bs)
+static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
 {
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, true);
+        bdrv_co_yield_to_drain(bs, true, parent);
         return;
     }
 
     /* Stop things in parent-to-child order */
     if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
         aio_disable_external(bdrv_get_aio_context(bs));
-        bdrv_parent_drained_begin(bs);
     }
 
+    bdrv_parent_drained_begin(bs, parent);
     bdrv_drain_invoke(bs, true, false);
     bdrv_drain_recurse(bs);
 }
 
-void bdrv_drained_end(BlockDriverState *bs)
+void bdrv_drained_begin(BlockDriverState *bs)
+{
+    bdrv_do_drained_begin(bs, NULL);
+}
+
+static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
 {
     int old_quiesce_counter;
 
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, false);
+        bdrv_co_yield_to_drain(bs, false, parent);
         return;
     }
     assert(bs->quiesce_counter > 0);
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
 
     /* Re-enable things in child-to-parent order */
     bdrv_drain_invoke(bs, false, false);
+    bdrv_parent_drained_end(bs, parent);
     if (old_quiesce_counter == 1) {
-        bdrv_parent_drained_end(bs);
         aio_enable_external(bdrv_get_aio_context(bs));
     }
 }
 
+void bdrv_drained_end(BlockDriverState *bs)
+{
+    bdrv_do_drained_end(bs, NULL);
+}
+
 /*
  * Wait for pending requests to complete on a single BlockDriverState subtree,
  * and suspend block driver's internal I/O until next request arrives.
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         /* Stop things in parent-to-child order */
         aio_context_acquire(aio_context);
         aio_disable_external(aio_context);
-        bdrv_parent_drained_begin(bs);
+        bdrv_parent_drained_begin(bs, NULL);
         bdrv_drain_invoke(bs, true, true);
         aio_context_release(aio_context);
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         /* Re-enable things in child-to-parent order */
         aio_context_acquire(aio_context);
         bdrv_drain_invoke(bs, false, true);
-        bdrv_parent_drained_end(bs);
+        bdrv_parent_drained_end(bs, NULL);
         aio_enable_external(aio_context);
         aio_context_release(aio_context);
     }
--
2.13.6
Block job drivers are not expected to mess with the internals of the
BlockJob object, so provide wrapper functions for one of the cases where
they still do it: Updating the progress counter.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
---
 include/block/blockjob.h | 19 +++++++++++++++++++
 block/backup.c           | 22 +++++++++++++---------
 block/commit.c           | 16 ++++++++--------
 block/mirror.c           | 11 +++++------
 block/stream.c           | 14 ++++++++------
 blockjob.c               | 10 ++++++++++
 6 files changed, 63 insertions(+), 29 deletions(-)

diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -XXX,XX +XXX,XX @@ void block_job_finalize(BlockJob *job, Error **errp);
 void block_job_dismiss(BlockJob **job, Error **errp);
 
 /**
+ * block_job_progress_update:
+ * @job: The job that has made progress
+ * @done: How much progress the job made
+ *
+ * Updates the progress counter of the job.
+ */
+void block_job_progress_update(BlockJob *job, uint64_t done);

bdrv_drained_begin() waits for the completion of requests in the whole
subtree, but it only actually keeps its immediate bs parameter quiesced
until bdrv_drained_end().

Add a version that keeps the whole subtree drained. As of this commit,
graph changes cannot be allowed during a subtree drained section, but
this will be fixed soon.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block.h | 13 +++++++++++++
 block/io.c            | 54 ++++++++++++++++++++++++++++++++++-----------
 2 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
 void bdrv_drained_begin(BlockDriverState *bs);
 
 /**
+ * Like bdrv_drained_begin, but recursively begins a quiesced section for
+ * exclusive access to all child nodes as well.
+ *
+ * Graph changes are not allowed during a subtree drain section.
+ */
+void bdrv_subtree_drained_begin(BlockDriverState *bs);
33
+
29
+
34
+/**
30
+/**
35
+ * block_job_progress_set_remaining:
31
* bdrv_drained_end:
36
+ * @job: The job whose expected progress end value is set
32
*
37
+ * @remaining: Expected end value of the progress counter of the job
33
* End a quiescent section started by bdrv_drained_begin().
38
+ *
34
*/
39
+ * Sets the expected end value of the progress counter of a job so that a
35
void bdrv_drained_end(BlockDriverState *bs);
40
+ * completion percentage can be calculated when the progress is updated.
36
37
+/**
38
+ * End a quiescent section started by bdrv_subtree_drained_begin().
41
+ */
39
+ */
42
+void block_job_progress_set_remaining(BlockJob *job, uint64_t remaining);
40
+void bdrv_subtree_drained_end(BlockDriverState *bs);
43
+
41
+
44
+/**
42
void bdrv_add_child(BlockDriverState *parent, BlockDriverState *child,
45
* block_job_query:
43
Error **errp);
46
* @job: The job to get information about.
44
void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp);
47
*
45
diff --git a/block/io.c b/block/io.c
48
diff --git a/block/backup.c b/block/backup.c
49
index XXXXXXX..XXXXXXX 100644
46
index XXXXXXX..XXXXXXX 100644
50
--- a/block/backup.c
47
--- a/block/io.c
51
+++ b/block/backup.c
48
+++ b/block/io.c
52
@@ -XXX,XX +XXX,XX @@ typedef struct BackupBlockJob {
49
@@ -XXX,XX +XXX,XX @@ typedef struct {
53
BlockdevOnError on_source_error;
50
BlockDriverState *bs;
54
BlockdevOnError on_target_error;
51
bool done;
55
CoRwlock flush_rwlock;
52
bool begin;
56
+ uint64_t len;
53
+ bool recursive;
57
uint64_t bytes_read;
54
BdrvChild *parent;
58
int64_t cluster_size;
55
} BdrvCoDrainData;
59
bool compress;
56
60
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
57
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
61
58
return waited;
62
trace_backup_do_cow_process(job, start);
59
}
63
60
64
- n = MIN(job->cluster_size, job->common.len - start);
61
-static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent);
65
+ n = MIN(job->cluster_size, job->len - start);
62
-static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);
66
63
+static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
67
if (!bounce_buffer) {
64
+ BdrvChild *parent);
68
bounce_buffer = blk_blockalign(blk, job->cluster_size);
65
+static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
69
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
66
+ BdrvChild *parent);
70
* offset field is an opaque progress value, it is not a disk offset.
67
71
*/
68
static void bdrv_co_drain_bh_cb(void *opaque)
72
job->bytes_read += n;
69
{
73
- job->common.offset += n;
70
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
74
+ block_job_progress_update(&job->common, n);
71
72
bdrv_dec_in_flight(bs);
73
if (data->begin) {
74
- bdrv_do_drained_begin(bs, data->parent);
75
+ bdrv_do_drained_begin(bs, data->recursive, data->parent);
76
} else {
77
- bdrv_do_drained_end(bs, data->parent);
78
+ bdrv_do_drained_end(bs, data->recursive, data->parent);
75
}
79
}
76
80
77
out:
81
data->done = true;
78
@@ -XXX,XX +XXX,XX @@ void backup_do_checkpoint(BlockJob *job, Error **errp)
82
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
83
}
84
85
static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
86
- bool begin, BdrvChild *parent)
87
+ bool begin, bool recursive,
88
+ BdrvChild *parent)
89
{
90
BdrvCoDrainData data;
91
92
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
93
.bs = bs,
94
.done = false,
95
.begin = begin,
96
+ .recursive = recursive,
97
.parent = parent,
98
};
99
bdrv_inc_in_flight(bs);
100
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
101
assert(data.done);
102
}
103
104
-static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
105
+static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
106
+ BdrvChild *parent)
107
{
108
+ BdrvChild *child, *next;
109
+
110
if (qemu_in_coroutine()) {
111
- bdrv_co_yield_to_drain(bs, true, parent);
112
+ bdrv_co_yield_to_drain(bs, true, recursive, parent);
79
return;
113
return;
80
}
114
}
81
115
82
- len = DIV_ROUND_UP(backup_job->common.len, backup_job->cluster_size);
116
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
83
+ len = DIV_ROUND_UP(backup_job->len, backup_job->cluster_size);
117
bdrv_parent_drained_begin(bs, parent);
84
hbitmap_set(backup_job->copy_bitmap, 0, len);
118
bdrv_drain_invoke(bs, true, false);
119
bdrv_drain_recurse(bs);
120
+
121
+ if (recursive) {
122
+ QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
123
+ bdrv_do_drained_begin(child->bs, true, child);
124
+ }
125
+ }
85
}
126
}
86
127
87
@@ -XXX,XX +XXX,XX @@ static void backup_incremental_init_copy_bitmap(BackupBlockJob *job)
128
void bdrv_drained_begin(BlockDriverState *bs)
88
bdrv_set_dirty_iter(dbi, next_cluster * job->cluster_size);
129
{
89
}
130
- bdrv_do_drained_begin(bs, NULL);
90
131
+ bdrv_do_drained_begin(bs, false, NULL);
91
- job->common.offset = job->common.len -
92
- hbitmap_count(job->copy_bitmap) * job->cluster_size;
93
+ /* TODO block_job_progress_set_remaining() would make more sense */
94
+ block_job_progress_update(&job->common,
95
+ job->len - hbitmap_count(job->copy_bitmap) * job->cluster_size);
96
97
bdrv_dirty_iter_free(dbi);
98
}
99
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn backup_run(void *opaque)
100
QLIST_INIT(&job->inflight_reqs);
101
qemu_co_rwlock_init(&job->flush_rwlock);
102
103
- nb_clusters = DIV_ROUND_UP(job->common.len, job->cluster_size);
104
+ nb_clusters = DIV_ROUND_UP(job->len, job->cluster_size);
105
+ block_job_progress_set_remaining(&job->common, job->len);
106
+
107
job->copy_bitmap = hbitmap_alloc(nb_clusters, 0);
108
if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
109
backup_incremental_init_copy_bitmap(job);
110
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn backup_run(void *opaque)
111
ret = backup_run_incremental(job);
112
} else {
113
/* Both FULL and TOP SYNC_MODE's require copying.. */
114
- for (offset = 0; offset < job->common.len;
115
+ for (offset = 0; offset < job->len;
116
offset += job->cluster_size) {
117
bool error_is_read;
118
int alloced = 0;
119
@@ -XXX,XX +XXX,XX @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
120
goto error;
121
}
122
123
- /* job->common.len is fixed, so we can't allow resize */
124
+ /* job->len is fixed, so we can't allow resize */
125
job = block_job_create(job_id, &backup_job_driver, txn, bs,
126
BLK_PERM_CONSISTENT_READ,
127
BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
128
@@ -XXX,XX +XXX,XX @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
129
/* Required permissions are already taken with target's blk_new() */
130
block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL,
131
&error_abort);
132
- job->common.len = len;
133
+ job->len = len;
134
135
return &job->common;
136
137
diff --git a/block/commit.c b/block/commit.c
138
index XXXXXXX..XXXXXXX 100644
139
--- a/block/commit.c
140
+++ b/block/commit.c
141
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn commit_run(void *opaque)
142
int64_t n = 0; /* bytes */
143
void *buf = NULL;
144
int bytes_written = 0;
145
- int64_t base_len;
146
+ int64_t len, base_len;
147
148
- ret = s->common.len = blk_getlength(s->top);
149
-
150
- if (s->common.len < 0) {
151
+ ret = len = blk_getlength(s->top);
152
+ if (len < 0) {
153
goto out;
154
}
155
+ block_job_progress_set_remaining(&s->common, len);
156
157
ret = base_len = blk_getlength(s->base);
158
if (base_len < 0) {
159
goto out;
160
}
161
162
- if (base_len < s->common.len) {
163
- ret = blk_truncate(s->base, s->common.len, PREALLOC_MODE_OFF, NULL);
164
+ if (base_len < len) {
165
+ ret = blk_truncate(s->base, len, PREALLOC_MODE_OFF, NULL);
166
if (ret) {
167
goto out;
168
}
169
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn commit_run(void *opaque)
170
171
buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);
172
173
- for (offset = 0; offset < s->common.len; offset += n) {
174
+ for (offset = 0; offset < len; offset += n) {
175
bool copy;
176
177
/* Note that even when no rate limit is applied we need to yield
178
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn commit_run(void *opaque)
179
}
180
}
181
/* Publish progress */
182
- s->common.offset += n;
183
+ block_job_progress_update(&s->common, n);
184
185
if (copy && s->common.speed) {
186
delay_ns = ratelimit_calculate_delay(&s->limit, n);
187
diff --git a/block/mirror.c b/block/mirror.c
188
index XXXXXXX..XXXXXXX 100644
189
--- a/block/mirror.c
190
+++ b/block/mirror.c
191
@@ -XXX,XX +XXX,XX @@ static void mirror_iteration_done(MirrorOp *op, int ret)
192
bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
193
}
194
if (!s->initial_zeroing_ongoing) {
195
- s->common.offset += op->bytes;
196
+ block_job_progress_update(&s->common, op->bytes);
197
}
198
}
199
qemu_iovec_destroy(&op->qiov);
200
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_run(void *opaque)
201
block_job_pause_point(&s->common);
202
203
cnt = bdrv_get_dirty_count(s->dirty_bitmap);
204
- /* s->common.offset contains the number of bytes already processed so
205
- * far, cnt is the number of dirty bytes remaining and
206
- * s->bytes_in_flight is the number of bytes currently being
207
- * processed; together those are the current total operation length */
208
- s->common.len = s->common.offset + s->bytes_in_flight + cnt;
209
+ /* cnt is the number of dirty bytes remaining and s->bytes_in_flight is
210
+ * the number of bytes currently being processed; together those are
211
+ * the current remaining operation length */
212
+ block_job_progress_set_remaining(&s->common, s->bytes_in_flight + cnt);
213
214
/* Note that even when no rate limit is applied we need to yield
215
* periodically with no pending I/O so that bdrv_drain_all() returns.
216
diff --git a/block/stream.c b/block/stream.c
217
index XXXXXXX..XXXXXXX 100644
218
--- a/block/stream.c
219
+++ b/block/stream.c
220
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
221
BlockBackend *blk = s->common.blk;
222
BlockDriverState *bs = blk_bs(blk);
223
BlockDriverState *base = s->base;
224
+ int64_t len;
225
int64_t offset = 0;
226
uint64_t delay_ns = 0;
227
int error = 0;
228
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
229
goto out;
230
}
231
232
- s->common.len = bdrv_getlength(bs);
233
- if (s->common.len < 0) {
234
- ret = s->common.len;
235
+ len = bdrv_getlength(bs);
236
+ if (len < 0) {
237
+ ret = len;
238
goto out;
239
}
240
+ block_job_progress_set_remaining(&s->common, len);
241
242
buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE);
243
244
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
245
bdrv_enable_copy_on_read(bs);
246
}
247
248
- for ( ; offset < s->common.len; offset += n) {
249
+ for ( ; offset < len; offset += n) {
250
bool copy;
251
252
/* Note that even when no rate limit is applied we need to yield
253
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
254
255
/* Finish early if end of backing file has been reached */
256
if (ret == 0 && n == 0) {
257
- n = s->common.len - offset;
258
+ n = len - offset;
259
}
260
261
copy = (ret == 1);
262
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
263
ret = 0;
264
265
/* Publish progress */
266
- s->common.offset += n;
267
+ block_job_progress_update(&s->common, n);
268
if (copy && s->common.speed) {
269
delay_ns = ratelimit_calculate_delay(&s->limit, n);
270
} else {
271
diff --git a/blockjob.c b/blockjob.c
272
index XXXXXXX..XXXXXXX 100644
273
--- a/blockjob.c
274
+++ b/blockjob.c
275
@@ -XXX,XX +XXX,XX @@ int block_job_complete_sync(BlockJob *job, Error **errp)
276
return block_job_finish_sync(job, &block_job_complete, errp);
277
}
278
279
+void block_job_progress_update(BlockJob *job, uint64_t done)
280
+{
281
+ job->offset += done;
282
+}
132
+}
283
+
133
+
284
+void block_job_progress_set_remaining(BlockJob *job, uint64_t remaining)
134
+void bdrv_subtree_drained_begin(BlockDriverState *bs)
285
+{
135
+{
286
+ job->len = job->offset + remaining;
136
+ bdrv_do_drained_begin(bs, true, NULL);
137
}
138
139
-static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
140
+static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
141
+ BdrvChild *parent)
142
{
143
+ BdrvChild *child, *next;
144
int old_quiesce_counter;
145
146
if (qemu_in_coroutine()) {
147
- bdrv_co_yield_to_drain(bs, false, parent);
148
+ bdrv_co_yield_to_drain(bs, false, recursive, parent);
149
return;
150
}
151
assert(bs->quiesce_counter > 0);
152
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
153
if (old_quiesce_counter == 1) {
154
aio_enable_external(bdrv_get_aio_context(bs));
155
}
156
+
157
+ if (recursive) {
158
+ QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
159
+ bdrv_do_drained_end(child->bs, true, child);
160
+ }
161
+ }
162
}
163
164
void bdrv_drained_end(BlockDriverState *bs)
165
{
166
- bdrv_do_drained_end(bs, NULL);
167
+ bdrv_do_drained_end(bs, false, NULL);
287
+}
168
+}
288
+
169
+
289
BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
170
+void bdrv_subtree_drained_end(BlockDriverState *bs)
290
{
171
+{
291
BlockJobInfo *info;
172
+ bdrv_do_drained_end(bs, true, NULL);
173
}
174
175
/*
292
--
176
--
293
2.13.6
177
2.13.6
294
178
295
179
From: Eric Blake <eblake@redhat.com>

We are gradually moving away from sector-based interfaces, towards
byte-based. Make the change for the last few sector-based callbacks
in the file-win32 driver.

Note that the driver was already using byte-based calls for
performing actual I/O, so this just gets rid of a round trip
of scaling; however, as I don't know if Windows is tolerant of
non-sector AIO operations, I went with the conservative approach
of modifying .bdrv_refresh_limits to override the block layer
defaults back to the pre-patch value of 512.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
include/block/raw-aio.h | 2 +-
block/file-win32.c | 47 +++++++++++++++++++++++++++++------------------
block/win32-aio.c | 5 ++---
3 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -XXX,XX +XXX,XX @@ void win32_aio_cleanup(QEMUWin32AIOState *aio);
int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile);
BlockAIOCB *win32_aio_submit(BlockDriverState *bs,
QEMUWin32AIOState *aio, HANDLE hfile,
- int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ uint64_t offset, uint64_t bytes, QEMUIOVector *qiov,
BlockCompletionFunc *cb, void *opaque, int type);
void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
AioContext *old_context);
diff --git a/block/file-win32.c b/block/file-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/block/file-win32.c
+++ b/block/file-win32.c
@@ -XXX,XX +XXX,XX @@ static void raw_probe_alignment(BlockDriverState *bs, Error **errp)
&dg.Geometry.BytesPerSector,
&freeClusters, &totalClusters);
bs->bl.request_alignment = dg.Geometry.BytesPerSector;
+ return;
}
+
+ /* XXX Does Windows support AIO on less than 512-byte alignment? */
+ bs->bl.request_alignment = 512;
}

static void raw_parse_flags(int flags, bool use_aio, int *access_flags,
@@ -XXX,XX +XXX,XX @@ fail:
return ret;
}

-static BlockAIOCB *raw_aio_readv(BlockDriverState *bs,
- int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockCompletionFunc *cb, void *opaque)
+static BlockAIOCB *raw_aio_preadv(BlockDriverState *bs,
+ uint64_t offset, uint64_t bytes,
+ QEMUIOVector *qiov, int flags,
+ BlockCompletionFunc *cb, void *opaque)
{
BDRVRawState *s = bs->opaque;
if (s->aio) {
- return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
- nb_sectors, cb, opaque, QEMU_AIO_READ);
+ return win32_aio_submit(bs, s->aio, s->hfile, offset, bytes, qiov,
+ cb, opaque, QEMU_AIO_READ);
} else {
- return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov,
- nb_sectors << BDRV_SECTOR_BITS,
+ return paio_submit(bs, s->hfile, offset, qiov, bytes,
cb, opaque, QEMU_AIO_READ);
}
}

-static BlockAIOCB *raw_aio_writev(BlockDriverState *bs,
- int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockCompletionFunc *cb, void *opaque)
+static BlockAIOCB *raw_aio_pwritev(BlockDriverState *bs,
+ uint64_t offset, uint64_t bytes,
+ QEMUIOVector *qiov, int flags,
+ BlockCompletionFunc *cb, void *opaque)
{
BDRVRawState *s = bs->opaque;
if (s->aio) {
- return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
- nb_sectors, cb, opaque, QEMU_AIO_WRITE);
+ return win32_aio_submit(bs, s->aio, s->hfile, offset, bytes, qiov,
+ cb, opaque, QEMU_AIO_WRITE);
} else {
- return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov,
- nb_sectors << BDRV_SECTOR_BITS,
+ return paio_submit(bs, s->hfile, offset, qiov, bytes,
cb, opaque, QEMU_AIO_WRITE);
}
}
@@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_file = {
.bdrv_co_create_opts = raw_co_create_opts,
.bdrv_has_zero_init = bdrv_has_zero_init_1,

- .bdrv_aio_readv = raw_aio_readv,
- .bdrv_aio_writev = raw_aio_writev,
+ .bdrv_aio_preadv = raw_aio_preadv,
+ .bdrv_aio_pwritev = raw_aio_pwritev,
.bdrv_aio_flush = raw_aio_flush,

.bdrv_truncate    = raw_truncate,
@@ -XXX,XX +XXX,XX @@ static void hdev_parse_filename(const char *filename, QDict *options,
bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
}

+static void hdev_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+ /* XXX Does Windows support AIO on less than 512-byte alignment? */
+ bs->bl.request_alignment = 512;
+}
+
static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
Error **errp)
{
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_device = {
.bdrv_probe_device    = hdev_probe_device,
.bdrv_file_open    = hdev_open,
.bdrv_close        = raw_close,
+ .bdrv_refresh_limits = hdev_refresh_limits,

- .bdrv_aio_readv = raw_aio_readv,
- .bdrv_aio_writev = raw_aio_writev,
+ .bdrv_aio_preadv = raw_aio_preadv,
+ .bdrv_aio_pwritev = raw_aio_pwritev,
.bdrv_aio_flush = raw_aio_flush,

.bdrv_detach_aio_context = raw_detach_aio_context,
diff --git a/block/win32-aio.c b/block/win32-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo win32_aiocb_info = {

BlockAIOCB *win32_aio_submit(BlockDriverState *bs,
QEMUWin32AIOState *aio, HANDLE hfile,
- int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ uint64_t offset, uint64_t bytes, QEMUIOVector *qiov,
BlockCompletionFunc *cb, void *opaque, int type)
{
struct QEMUWin32AIOCB *waiocb;
- uint64_t offset = sector_num * 512;
DWORD rc;

waiocb = qemu_aio_get(&win32_aiocb_info, bs, cb, opaque);
- waiocb->nbytes = nb_sectors * 512;
+ waiocb->nbytes = bytes;
waiocb->qiov = qiov;
waiocb->is_read = (type == QEMU_AIO_READ);

--
2.13.6

Add a subtree drain version to the existing test cases.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
tests/test-bdrv-drain.c | 27 ++++++++++++++++++++++++++-
1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
enum drain_type {
BDRV_DRAIN_ALL,
BDRV_DRAIN,
+ BDRV_SUBTREE_DRAIN,
DRAIN_TYPE_MAX,
};

@@ -XXX,XX +XXX,XX @@ static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
switch (drain_type) {
case BDRV_DRAIN_ALL: bdrv_drain_all_begin(); break;
case BDRV_DRAIN: bdrv_drained_begin(bs); break;
+ case BDRV_SUBTREE_DRAIN: bdrv_subtree_drained_begin(bs); break;
default: g_assert_not_reached();
}
}
@@ -XXX,XX +XXX,XX @@ static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
switch (drain_type) {
case BDRV_DRAIN_ALL: bdrv_drain_all_end(); break;
case BDRV_DRAIN: bdrv_drained_end(bs); break;
+ case BDRV_SUBTREE_DRAIN: bdrv_subtree_drained_end(bs); break;
default: g_assert_not_reached();
}
}
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain(void)
test_drv_cb_common(BDRV_DRAIN, false);
}

+static void test_drv_cb_drain_subtree(void)
+{
+ test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
+}
+
static void test_quiesce_common(enum drain_type drain_type, bool recursive)
{
BlockBackend *blk;
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
test_quiesce_common(BDRV_DRAIN, false);
}

+static void test_quiesce_drain_subtree(void)
+{
+ test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
+}
+
static void test_nested(void)
{
BlockBackend *blk;
@@ -XXX,XX +XXX,XX @@ static void test_nested(void)
/* XXX bdrv_drain_all() doesn't increase the quiesce_counter */
int bs_quiesce = (outer != BDRV_DRAIN_ALL) +
(inner != BDRV_DRAIN_ALL);
- int backing_quiesce = 0;
+ int backing_quiesce = (outer == BDRV_SUBTREE_DRAIN) +
+ (inner == BDRV_SUBTREE_DRAIN);
int backing_cb_cnt = (outer != BDRV_DRAIN) +
(inner != BDRV_DRAIN);

@@ -XXX,XX +XXX,XX @@ static void test_blockjob_drain(void)
test_blockjob_common(BDRV_DRAIN);
}

+static void test_blockjob_drain_subtree(void)
+{
+ test_blockjob_common(BDRV_SUBTREE_DRAIN);
+}
+
int main(int argc, char **argv)
{
bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)

g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
+ g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
+ test_drv_cb_drain_subtree);

g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
+ g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
+ test_quiesce_drain_subtree);

g_test_add_func("/bdrv-drain/nested", test_nested);

g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
+ g_test_add_func("/bdrv-drain/blockjob/drain_subtree",
+ test_blockjob_drain_subtree);

return g_test_run();
}
--
2.13.6
From: Max Reitz <mreitz@redhat.com>

Commit abd3622cc03cf41ed542126a540385f30a4c0175 added a case to 122
regarding how the qcow2 driver handles an incorrect compressed data
length value. This does not really fit into 122, as that file is
supposed to contain qemu-img convert test cases, which this case is not.
So this patch splits it off into its own file; maybe we will even get
more qcow2-only compression tests in the future.

Also, that test case does not work with refcount_bits=1, so mark that
option as unsupported.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180406164108.26118-1-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
tests/qemu-iotests/122 | 47 ----------------------
tests/qemu-iotests/122.out | 33 ----------------
tests/qemu-iotests/214 | 97 ++++++++++++++++++++++++++++++++++++++++++++++
tests/qemu-iotests/214.out | 35 +++++++++++++++++
tests/qemu-iotests/group | 1 +
5 files changed, 133 insertions(+), 80 deletions(-)
create mode 100755 tests/qemu-iotests/214
create mode 100644 tests/qemu-iotests/214.out

diff --git a/tests/qemu-iotests/122 b/tests/qemu-iotests/122
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/122
+++ b/tests/qemu-iotests/122
@@ -XXX,XX +XXX,XX @@ $QEMU_IO -c "read -P 0 1024k 1022k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _fil


echo
-echo "=== Corrupted size field in compressed cluster descriptor ==="
-echo
-# Create an empty image and fill half of it with compressed data.
-# The L2 entries of the two compressed clusters are located at
-# 0x800000 and 0x800008, their original values are 0x4008000000a00000
-# and 0x4008000000a00802 (5 sectors for compressed data each).
-_make_test_img 8M -o cluster_size=2M
-$QEMU_IO -c "write -c -P 0x11 0 2M" -c "write -c -P 0x11 2M 2M" "$TEST_IMG" \
- 2>&1 | _filter_qemu_io | _filter_testdir
-
-# Reduce size of compressed data to 4 sectors: this corrupts the image.
-poke_file "$TEST_IMG" $((0x800000)) "\x40\x06"
-$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-
-# 'qemu-img check' however doesn't see anything wrong because it
-# doesn't try to decompress the data and the refcounts are consistent.
-# TODO: update qemu-img so this can be detected.
-_check_test_img
-
-# Increase size of compressed data to the maximum (8192 sectors).
-# This makes QEMU read more data (8192 sectors instead of 5, host
-# addresses [0xa00000, 0xdfffff]), but the decompression algorithm
-# stops once we have enough to restore the uncompressed cluster, so
-# the rest of the data is ignored.
-poke_file "$TEST_IMG" $((0x800000)) "\x7f\xfe"
-# Do it also for the second compressed cluster (L2 entry at 0x800008).
-# In this case the compressed data would span 3 host clusters
-# (host addresses: [0xa00802, 0xe00801])
-poke_file "$TEST_IMG" $((0x800008)) "\x7f\xfe"
-
-# Here the image is too small so we're asking QEMU to read beyond the
-# end of the image.
-$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-# But if we grow the image we won't be reading beyond its end anymore.
-$QEMU_IO -c "write -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-
-# The refcount data is however wrong because due to the increased size
-# of the compressed data it now reaches the following host clusters.
-# This can be repaired by qemu-img check by increasing the refcount of
-# those clusters.
-# TODO: update qemu-img to correct the compressed cluster size instead.
-_check_test_img -r all
-$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-$QEMU_IO -c "read -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-
-echo
echo "=== Full allocation with -S 0 ==="
echo

diff --git a/tests/qemu-iotests/122.out b/tests/qemu-iotests/122.out
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/122.out
+++ b/tests/qemu-iotests/122.out
@@ -XXX,XX +XXX,XX @@ read 1024/1024 bytes at offset 1047552
read 1046528/1046528 bytes at offset 1048576
1022 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)

-=== Corrupted size field in compressed cluster descriptor ===
-
-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=8388608
-wrote 2097152/2097152 bytes at offset 0
-2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 2097152/2097152 bytes at offset 2097152
-2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-read failed: Input/output error
-No errors were found on the image.
-read 4194304/4194304 bytes at offset 0
-4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 4194304/4194304 bytes at offset 4194304
-4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-read 4194304/4194304 bytes at offset 0
-4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-ERROR cluster 6 refcount=1 reference=3
-ERROR cluster 7 refcount=1 reference=2
-Repairing cluster 6 refcount=1 reference=3
-Repairing cluster 7 refcount=1 reference=2
-Repairing OFLAG_COPIED data cluster: l2_entry=8000000000c00000 refcount=3
-Repairing OFLAG_COPIED data cluster: l2_entry=8000000000e00000 refcount=2
-The following inconsistencies were found and repaired:
-
- 0 leaked clusters
- 4 corruptions
-
-Double checking the fixed image now...
-No errors were found on the image.
-read 4194304/4194304 bytes at offset 0
-4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-read 4194304/4194304 bytes at offset 4194304
-4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-
=== Full allocation with -S 0 ===

Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
diff --git a/tests/qemu-iotests/214 b/tests/qemu-iotests/214
new file mode 100755
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/qemu-iotests/214
@@ -XXX,XX +XXX,XX @@
+#!/bin/bash
+#
+# Test qcow2 image compression
+#
+# Copyright (C) 2018 Igalia, S.L.
+# Author: Alberto Garcia <berto@igalia.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+seq=$(basename "$0")
+echo "QA output created by $seq"
+
+here=$PWD
+status=1    # failure is the default!
+
+_cleanup()
+{
+ _cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc

If bdrv_do_drained_begin/end() are called in coroutine context, they
first use a BH to get out of the coroutine context. Call some existing
tests again from a coroutine to cover this code path.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
tests/test-bdrv-drain.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 59 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
*aio_ret = ret;
}

+typedef struct CallInCoroutineData {
+ void (*entry)(void);
+ bool done;
+} CallInCoroutineData;
+
+static coroutine_fn void call_in_coroutine_entry(void *opaque)
+{
+ CallInCoroutineData *data = opaque;
+
+ data->entry();
+ data->done = true;
+}
+
+static void call_in_coroutine(void (*entry)(void))
+{
+ Coroutine *co;
+ CallInCoroutineData data = {
+ .entry = entry,
+ .done = false,
+ };
+
+ co = qemu_coroutine_create(call_in_coroutine_entry, &data);
+ qemu_coroutine_enter(co);
+ while (!data.done) {
+ aio_poll(qemu_get_aio_context(), true);
+ }
+}
+
enum drain_type {
BDRV_DRAIN_ALL,
171
+. ./common.filter
48
BDRV_DRAIN,
49
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_subtree(void)
50
test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
51
}
52
53
+static void test_drv_cb_co_drain(void)
54
+{
55
+ call_in_coroutine(test_drv_cb_drain);
56
+}
172
+
57
+
173
+_supported_fmt qcow2
58
+static void test_drv_cb_co_drain_subtree(void)
174
+_supported_proto file
59
+{
175
+_supported_os Linux
60
+ call_in_coroutine(test_drv_cb_drain_subtree);
61
+}
176
+
62
+
177
+# Repairing the corrupted image requires qemu-img check to store a
63
static void test_quiesce_common(enum drain_type drain_type, bool recursive)
178
+# refcount up to 3, which requires at least two refcount bits.
64
{
179
+_unsupported_imgopts 'refcount_bits=1[^0-9]'
65
BlockBackend *blk;
66
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain_subtree(void)
67
test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
68
}
69
70
+static void test_quiesce_co_drain(void)
71
+{
72
+ call_in_coroutine(test_quiesce_drain);
73
+}
74
+
75
+static void test_quiesce_co_drain_subtree(void)
76
+{
77
+ call_in_coroutine(test_quiesce_drain_subtree);
78
+}
79
+
80
static void test_nested(void)
81
{
82
BlockBackend *blk;
83
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
84
g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
85
test_drv_cb_drain_subtree);
86
87
+ // XXX bdrv_drain_all() doesn't work in coroutine context
88
+ g_test_add_func("/bdrv-drain/driver-cb/co/drain", test_drv_cb_co_drain);
89
+ g_test_add_func("/bdrv-drain/driver-cb/co/drain_subtree",
90
+ test_drv_cb_co_drain_subtree);
180
+
91
+
181
+
92
+
182
+echo
93
g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
183
+echo "=== Corrupted size field in compressed cluster descriptor ==="
94
g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
184
+echo
95
g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
185
+# Create an empty image and fill half of it with compressed data.
96
test_quiesce_drain_subtree);
186
+# The L2 entries of the two compressed clusters are located at
97
187
+# 0x800000 and 0x800008, their original values are 0x4008000000a00000
98
+ // XXX bdrv_drain_all() doesn't work in coroutine context
188
+# and 0x4008000000a00802 (5 sectors for compressed data each).
99
+ g_test_add_func("/bdrv-drain/quiesce/co/drain", test_quiesce_co_drain);
189
+_make_test_img 8M -o cluster_size=2M
100
+ g_test_add_func("/bdrv-drain/quiesce/co/drain_subtree",
190
+$QEMU_IO -c "write -c -P 0x11 0 2M" -c "write -c -P 0x11 2M 2M" "$TEST_IMG" \
101
+ test_quiesce_co_drain_subtree);
191
+ 2>&1 | _filter_qemu_io | _filter_testdir
192
+
102
+
193
+# Reduce size of compressed data to 4 sectors: this corrupts the image.
103
g_test_add_func("/bdrv-drain/nested", test_nested);
194
+poke_file "$TEST_IMG" $((0x800000)) "\x40\x06"
104
195
+$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
105
g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
196
+
197
+# 'qemu-img check' however doesn't see anything wrong because it
198
+# doesn't try to decompress the data and the refcounts are consistent.
199
+# TODO: update qemu-img so this can be detected.
200
+_check_test_img
201
+
202
+# Increase size of compressed data to the maximum (8192 sectors).
203
+# This makes QEMU read more data (8192 sectors instead of 5, host
204
+# addresses [0xa00000, 0xdfffff]), but the decompression algorithm
205
+# stops once we have enough to restore the uncompressed cluster, so
206
+# the rest of the data is ignored.
207
+poke_file "$TEST_IMG" $((0x800000)) "\x7f\xfe"
208
+# Do it also for the second compressed cluster (L2 entry at 0x800008).
209
+# In this case the compressed data would span 3 host clusters
210
+# (host addresses: [0xa00802, 0xe00801])
211
+poke_file "$TEST_IMG" $((0x800008)) "\x7f\xfe"
212
+
213
+# Here the image is too small so we're asking QEMU to read beyond the
214
+# end of the image.
215
+$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
216
+# But if we grow the image we won't be reading beyond its end anymore.
217
+$QEMU_IO -c "write -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
218
+$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
219
+
220
+# The refcount data is however wrong because, due to the increased size
221
+# of the compressed data, it now reaches the following host clusters.
222
+# This can be repaired by qemu-img check by increasing the refcount of
223
+# those clusters.
224
+# TODO: update qemu-img to correct the compressed cluster size instead.
225
+_check_test_img -r all
226
+$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
227
+$QEMU_IO -c "read -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
228
+
229
+# success, all done
230
+echo '*** done'
231
+rm -f $seq.full
232
+status=0
233
diff --git a/tests/qemu-iotests/214.out b/tests/qemu-iotests/214.out
234
new file mode 100644
235
index XXXXXXX..XXXXXXX
236
--- /dev/null
237
+++ b/tests/qemu-iotests/214.out
238
@@ -XXX,XX +XXX,XX @@
239
+QA output created by 214
240
+
241
+=== Corrupted size field in compressed cluster descriptor ===
242
+
243
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=8388608
244
+wrote 2097152/2097152 bytes at offset 0
245
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
246
+wrote 2097152/2097152 bytes at offset 2097152
247
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
248
+read failed: Input/output error
249
+No errors were found on the image.
250
+read 4194304/4194304 bytes at offset 0
251
+4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
252
+wrote 4194304/4194304 bytes at offset 4194304
253
+4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
254
+read 4194304/4194304 bytes at offset 0
255
+4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
256
+ERROR cluster 6 refcount=1 reference=3
257
+ERROR cluster 7 refcount=1 reference=2
258
+Repairing cluster 6 refcount=1 reference=3
259
+Repairing cluster 7 refcount=1 reference=2
260
+Repairing OFLAG_COPIED data cluster: l2_entry=8000000000c00000 refcount=3
261
+Repairing OFLAG_COPIED data cluster: l2_entry=8000000000e00000 refcount=2
262
+The following inconsistencies were found and repaired:
263
+
264
+ 0 leaked clusters
265
+ 4 corruptions
266
+
267
+Double checking the fixed image now...
268
+No errors were found on the image.
269
+read 4194304/4194304 bytes at offset 0
270
+4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
271
+read 4194304/4194304 bytes at offset 4194304
272
+4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
273
+*** done
274
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
275
index XXXXXXX..XXXXXXX 100644
276
--- a/tests/qemu-iotests/group
277
+++ b/tests/qemu-iotests/group
278
@@ -XXX,XX +XXX,XX @@
279
211 rw auto quick
280
212 rw auto quick
281
213 rw auto quick
282
+214 rw auto
283
218 rw auto quick
284
--
106
--
285
2.13.6
107
2.13.6
286
108
287
109
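The size-field pokes in test 214 above (0x4008 originally, 0x4006 to shrink, 0x7ffe to maximize) can be decoded with a small sketch following the qcow2 specification; the helper name is ours, not QEMU's. For cluster_bits = 21 (2 MiB clusters) the size field occupies bits 49..61 of the compressed L2 entry and stores the number of 512-byte sectors of compressed data, minus one:

```c
#include <assert.h>
#include <stdint.h>

/* Sketch (helper name is ours): decode the size field of a qcow2
 * compressed cluster descriptor into a sector count. */
static unsigned nb_csectors(uint64_t l2_entry, unsigned cluster_bits)
{
    unsigned csize_shift = 62 - (cluster_bits - 8);
    uint64_t csize_mask = (1ULL << (cluster_bits - 8)) - 1;

    return (unsigned)((l2_entry >> csize_shift) & csize_mask) + 1;
}
```

With 2 MiB clusters, the original entry 0x4008000000a00000 decodes to 5 sectors; overwriting the first two bytes with 0x4006 yields 4, and 0x7ffe yields the maximum of 8192, matching the values the test relies on.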
1
From: Eric Blake <eblake@redhat.com>
1
Test that drain sections are correctly propagated through the graph.
2
2
3
We are gradually moving away from sector-based interfaces, towards
4
byte-based. Make the change for the last few sector-based callbacks
5
in the vxhs driver.
6
7
Note that the driver was already using byte-based calls for
8
performing actual I/O, so this just gets rid of a round trip
9
of scaling; however, as I don't know if VxHS is tolerant of
10
non-sector AIO operations, I went with the conservative approach
11
of adding .bdrv_refresh_limits to override the block layer
12
defaults back to the pre-patch value of 512.
13
14
Signed-off-by: Eric Blake <eblake@redhat.com>
15
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
3
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
16
---
4
---
17
block/vxhs.c | 43 ++++++++++++++++++++++---------------------
5
tests/test-bdrv-drain.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++
18
1 file changed, 22 insertions(+), 21 deletions(-)
6
1 file changed, 74 insertions(+)
19
7
20
diff --git a/block/vxhs.c b/block/vxhs.c
8
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
21
index XXXXXXX..XXXXXXX 100644
9
index XXXXXXX..XXXXXXX 100644
22
--- a/block/vxhs.c
10
--- a/tests/test-bdrv-drain.c
23
+++ b/block/vxhs.c
11
+++ b/tests/test-bdrv-drain.c
24
@@ -XXX,XX +XXX,XX @@ static void vxhs_parse_filename(const char *filename, QDict *options,
12
@@ -XXX,XX +XXX,XX @@ static void test_nested(void)
25
}
13
blk_unref(blk);
26
}
14
}
27
15
28
+static void vxhs_refresh_limits(BlockDriverState *bs, Error **errp)
16
+static void test_multiparent(void)
29
+{
17
+{
30
+ /* XXX Does VXHS support AIO on less than 512-byte alignment? */
18
+ BlockBackend *blk_a, *blk_b;
31
+ bs->bl.request_alignment = 512;
19
+ BlockDriverState *bs_a, *bs_b, *backing;
20
+ BDRVTestState *a_s, *b_s, *backing_s;
21
+
22
+ blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
23
+ bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
24
+ &error_abort);
25
+ a_s = bs_a->opaque;
26
+ blk_insert_bs(blk_a, bs_a, &error_abort);
27
+
28
+ blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
29
+ bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
30
+ &error_abort);
31
+ b_s = bs_b->opaque;
32
+ blk_insert_bs(blk_b, bs_b, &error_abort);
33
+
34
+ backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
35
+ backing_s = backing->opaque;
36
+ bdrv_set_backing_hd(bs_a, backing, &error_abort);
37
+ bdrv_set_backing_hd(bs_b, backing, &error_abort);
38
+
39
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
40
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
41
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
42
+ g_assert_cmpint(a_s->drain_count, ==, 0);
43
+ g_assert_cmpint(b_s->drain_count, ==, 0);
44
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
45
+
46
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
47
+
48
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
49
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
50
+ g_assert_cmpint(backing->quiesce_counter, ==, 1);
51
+ g_assert_cmpint(a_s->drain_count, ==, 1);
52
+ g_assert_cmpint(b_s->drain_count, ==, 1);
53
+ g_assert_cmpint(backing_s->drain_count, ==, 1);
54
+
55
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
56
+
57
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 2);
58
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
59
+ g_assert_cmpint(backing->quiesce_counter, ==, 2);
60
+ g_assert_cmpint(a_s->drain_count, ==, 2);
61
+ g_assert_cmpint(b_s->drain_count, ==, 2);
62
+ g_assert_cmpint(backing_s->drain_count, ==, 2);
63
+
64
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
65
+
66
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
67
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
68
+ g_assert_cmpint(backing->quiesce_counter, ==, 1);
69
+ g_assert_cmpint(a_s->drain_count, ==, 1);
70
+ g_assert_cmpint(b_s->drain_count, ==, 1);
71
+ g_assert_cmpint(backing_s->drain_count, ==, 1);
72
+
73
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
74
+
75
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
76
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
77
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
78
+ g_assert_cmpint(a_s->drain_count, ==, 0);
79
+ g_assert_cmpint(b_s->drain_count, ==, 0);
80
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
81
+
82
+ bdrv_unref(backing);
83
+ bdrv_unref(bs_a);
84
+ bdrv_unref(bs_b);
85
+ blk_unref(blk_a);
86
+ blk_unref(blk_b);
32
+}
87
+}
33
+
88
+
34
static int vxhs_init_and_ref(void)
89
35
{
90
typedef struct TestBlockJob {
36
if (vxhs_ref++ == 0) {
91
BlockJob common;
37
@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo vxhs_aiocb_info = {
92
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
38
* and is passed to QNIO. When QNIO completes the work,
93
test_quiesce_co_drain_subtree);
39
* it will be passed back through the callback.
94
40
*/
95
g_test_add_func("/bdrv-drain/nested", test_nested);
41
-static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num,
96
+ g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
42
- QEMUIOVector *qiov, int nb_sectors,
97
43
+static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, uint64_t offset,
98
g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
44
+ QEMUIOVector *qiov, uint64_t size,
99
g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
45
BlockCompletionFunc *cb, void *opaque,
46
VDISKAIOCmd iodir)
47
{
48
VXHSAIOCB *acb = NULL;
49
BDRVVXHSState *s = bs->opaque;
50
- size_t size;
51
- uint64_t offset;
52
int iio_flags = 0;
53
int ret = 0;
54
void *dev_handle = s->vdisk_hostinfo.dev_handle;
55
56
- offset = sector_num * BDRV_SECTOR_SIZE;
57
- size = nb_sectors * BDRV_SECTOR_SIZE;
58
acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque);
59
60
/*
61
@@ -XXX,XX +XXX,XX @@ static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num,
62
switch (iodir) {
63
case VDISK_AIO_WRITE:
64
ret = iio_writev(dev_handle, acb, qiov->iov, qiov->niov,
65
- offset, (uint64_t)size, iio_flags);
66
+ offset, size, iio_flags);
67
break;
68
case VDISK_AIO_READ:
69
ret = iio_readv(dev_handle, acb, qiov->iov, qiov->niov,
70
- offset, (uint64_t)size, iio_flags);
71
+ offset, size, iio_flags);
72
break;
73
default:
74
trace_vxhs_aio_rw_invalid(iodir);
75
@@ -XXX,XX +XXX,XX @@ errout:
76
return NULL;
77
}
78
79
-static BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs,
80
- int64_t sector_num, QEMUIOVector *qiov,
81
- int nb_sectors,
82
+static BlockAIOCB *vxhs_aio_preadv(BlockDriverState *bs,
83
+ uint64_t offset, uint64_t bytes,
84
+ QEMUIOVector *qiov, int flags,
85
BlockCompletionFunc *cb, void *opaque)
86
{
87
- return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, cb,
88
- opaque, VDISK_AIO_READ);
89
+ return vxhs_aio_rw(bs, offset, qiov, bytes, cb, opaque, VDISK_AIO_READ);
90
}
91
92
-static BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs,
93
- int64_t sector_num, QEMUIOVector *qiov,
94
- int nb_sectors,
95
- BlockCompletionFunc *cb, void *opaque)
96
+static BlockAIOCB *vxhs_aio_pwritev(BlockDriverState *bs,
97
+ uint64_t offset, uint64_t bytes,
98
+ QEMUIOVector *qiov, int flags,
99
+ BlockCompletionFunc *cb, void *opaque)
100
{
101
- return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors,
102
- cb, opaque, VDISK_AIO_WRITE);
103
+ return vxhs_aio_rw(bs, offset, qiov, bytes, cb, opaque, VDISK_AIO_WRITE);
104
}
105
106
static void vxhs_close(BlockDriverState *bs)
107
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_vxhs = {
108
.instance_size = sizeof(BDRVVXHSState),
109
.bdrv_file_open = vxhs_open,
110
.bdrv_parse_filename = vxhs_parse_filename,
111
+ .bdrv_refresh_limits = vxhs_refresh_limits,
112
.bdrv_close = vxhs_close,
113
.bdrv_getlength = vxhs_getlength,
114
- .bdrv_aio_readv = vxhs_aio_readv,
115
- .bdrv_aio_writev = vxhs_aio_writev,
116
+ .bdrv_aio_preadv = vxhs_aio_preadv,
117
+ .bdrv_aio_pwritev = vxhs_aio_pwritev,
118
};
119
120
static void bdrv_vxhs_init(void)
121
--
100
--
122
2.13.6
101
2.13.6
123
102
124
103
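The .bdrv_refresh_limits hook in the vxhs patch above pins request_alignment back to 512 bytes, which means the block layer, not the driver, widens unaligned byte requests to sector boundaries before they reach the driver. A minimal sketch of that rounding (ours, not the block layer's actual code; align must be a power of two):

```c
#include <assert.h>
#include <stdint.h>

/* Illustrative only: widen a byte-granularity request to the driver's
 * request_alignment, the way the block layer does before calling into
 * a driver that cannot handle sub-sector requests. */
static void align_request(uint64_t offset, uint64_t bytes, uint64_t align,
                          uint64_t *aligned_offset, uint64_t *aligned_bytes)
{
    uint64_t end = offset + bytes;

    *aligned_offset = offset & ~(align - 1);                  /* round down */
    *aligned_bytes = ((end + align - 1) & ~(align - 1)) - *aligned_offset;
}
```

For example, a 100-byte read at offset 700 becomes a 512-byte read at offset 512, from which the requested range is then extracted.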
1
From: Max Reitz <mreitz@redhat.com>
1
We need to remember how many of the drain sections in which a node is
2
2
were recursive (i.e. subtree drain rather than node drain), so that they
3
This flag signifies that a write request will not change the visible
3
can be correctly applied when children are added or removed during the
4
disk content. With this flag set, it is sufficient to have the
4
drained section.
5
BLK_PERM_WRITE_UNCHANGED permission instead of BLK_PERM_WRITE.
5
6
6
With this change, it is safe to modify the graph even inside a
7
Signed-off-by: Max Reitz <mreitz@redhat.com>
7
bdrv_subtree_drained_begin/end() section.
8
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
8
9
Reviewed-by: Alberto Garcia <berto@igalia.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
10
Message-id: 20180421132929.21610-4-mreitz@redhat.com
11
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Max Reitz <mreitz@redhat.com>
13
---
10
---
14
include/block/block.h | 6 +++++-
11
include/block/block.h | 2 --
15
block/io.c | 6 +++++-
12
include/block/block_int.h | 5 +++++
16
2 files changed, 10 insertions(+), 2 deletions(-)
13
block.c | 32 +++++++++++++++++++++++++++++---
14
block/io.c | 28 ++++++++++++++++++++++++----
15
4 files changed, 58 insertions(+), 9 deletions(-)
17
16
18
diff --git a/include/block/block.h b/include/block/block.h
17
diff --git a/include/block/block.h b/include/block/block.h
19
index XXXXXXX..XXXXXXX 100644
18
index XXXXXXX..XXXXXXX 100644
20
--- a/include/block/block.h
19
--- a/include/block/block.h
21
+++ b/include/block/block.h
20
+++ b/include/block/block.h
22
@@ -XXX,XX +XXX,XX @@ typedef enum {
21
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs);
23
BDRV_REQ_FUA = 0x10,
22
/**
24
BDRV_REQ_WRITE_COMPRESSED = 0x20,
23
* Like bdrv_drained_begin, but recursively begins a quiesced section for
25
24
* exclusive access to all child nodes as well.
26
+ /* Signifies that this write request will not change the visible disk
25
- *
27
+ * content. */
26
- * Graph changes are not allowed during a subtree drain section.
28
+ BDRV_REQ_WRITE_UNCHANGED = 0x40,
27
*/
29
+
28
void bdrv_subtree_drained_begin(BlockDriverState *bs);
30
/* Mask of valid flags */
29
31
- BDRV_REQ_MASK = 0x3f,
30
diff --git a/include/block/block_int.h b/include/block/block_int.h
32
+ BDRV_REQ_MASK = 0x7f,
31
index XXXXXXX..XXXXXXX 100644
33
} BdrvRequestFlags;
32
--- a/include/block/block_int.h
34
33
+++ b/include/block/block_int.h
35
typedef struct BlockSizes {
34
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
35
36
/* Accessed with atomic ops. */
37
int quiesce_counter;
38
+ int recursive_quiesce_counter;
39
+
40
unsigned int write_gen; /* Current data generation */
41
42
/* Protected by reqs_lock. */
43
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
44
int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
45
BdrvRequestFlags flags);
46
47
+void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
48
+void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
49
+
50
int get_tmp_filename(char *filename, int size);
51
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
52
const char *filename);
53
diff --git a/block.c b/block.c
54
index XXXXXXX..XXXXXXX 100644
55
--- a/block.c
56
+++ b/block.c
57
@@ -XXX,XX +XXX,XX @@ static void bdrv_child_cb_drained_end(BdrvChild *child)
58
bdrv_drained_end(bs);
59
}
60
61
+static void bdrv_child_cb_attach(BdrvChild *child)
62
+{
63
+ BlockDriverState *bs = child->opaque;
64
+ bdrv_apply_subtree_drain(child, bs);
65
+}
66
+
67
+static void bdrv_child_cb_detach(BdrvChild *child)
68
+{
69
+ BlockDriverState *bs = child->opaque;
70
+ bdrv_unapply_subtree_drain(child, bs);
71
+}
72
+
73
static int bdrv_child_cb_inactivate(BdrvChild *child)
74
{
75
BlockDriverState *bs = child->opaque;
76
@@ -XXX,XX +XXX,XX @@ const BdrvChildRole child_file = {
77
.inherit_options = bdrv_inherited_options,
78
.drained_begin = bdrv_child_cb_drained_begin,
79
.drained_end = bdrv_child_cb_drained_end,
80
+ .attach = bdrv_child_cb_attach,
81
+ .detach = bdrv_child_cb_detach,
82
.inactivate = bdrv_child_cb_inactivate,
83
};
84
85
@@ -XXX,XX +XXX,XX @@ const BdrvChildRole child_format = {
86
.inherit_options = bdrv_inherited_fmt_options,
87
.drained_begin = bdrv_child_cb_drained_begin,
88
.drained_end = bdrv_child_cb_drained_end,
89
+ .attach = bdrv_child_cb_attach,
90
+ .detach = bdrv_child_cb_detach,
91
.inactivate = bdrv_child_cb_inactivate,
92
};
93
94
@@ -XXX,XX +XXX,XX @@ static void bdrv_backing_attach(BdrvChild *c)
95
parent->backing_blocker);
96
bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
97
parent->backing_blocker);
98
+
99
+ bdrv_child_cb_attach(c);
100
}
101
102
static void bdrv_backing_detach(BdrvChild *c)
103
@@ -XXX,XX +XXX,XX @@ static void bdrv_backing_detach(BdrvChild *c)
104
bdrv_op_unblock_all(c->bs, parent->backing_blocker);
105
error_free(parent->backing_blocker);
106
parent->backing_blocker = NULL;
107
+
108
+ bdrv_child_cb_detach(c);
109
}
110
111
/*
112
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
113
assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
114
}
115
if (old_bs) {
116
+ /* Detach first so that the recursive drain sections coming from @child
117
+ * are already gone and we only end the drain sections that came from
118
+ * elsewhere. */
119
+ if (child->role->detach) {
120
+ child->role->detach(child);
121
+ }
122
if (old_bs->quiesce_counter && child->role->drained_end) {
123
for (i = 0; i < old_bs->quiesce_counter; i++) {
124
child->role->drained_end(child);
125
}
126
}
127
- if (child->role->detach) {
128
- child->role->detach(child);
129
- }
130
QLIST_REMOVE(child, next_parent);
131
}
132
133
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
134
}
135
}
136
137
+ /* Attach only after starting new drained sections, so that recursive
138
+ * drain sections coming from @child don't get an extra .drained_begin
139
+ * callback. */
140
if (child->role->attach) {
141
child->role->attach(child);
142
}
36
diff --git a/block/io.c b/block/io.c
143
diff --git a/block/io.c b/block/io.c
37
index XXXXXXX..XXXXXXX 100644
144
index XXXXXXX..XXXXXXX 100644
38
--- a/block/io.c
145
--- a/block/io.c
39
+++ b/block/io.c
146
+++ b/block/io.c
40
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
147
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
41
assert(!waited || !req->serialising);
148
assert(data.done);
42
assert(req->overlap_offset <= offset);
149
}
43
assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
150
44
- assert(child->perm & BLK_PERM_WRITE);
151
-static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
45
+ if (flags & BDRV_REQ_WRITE_UNCHANGED) {
152
- BdrvChild *parent)
46
+ assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
153
+void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
47
+ } else {
154
+ BdrvChild *parent)
48
+ assert(child->perm & BLK_PERM_WRITE);
155
{
156
BdrvChild *child, *next;
157
158
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
159
bdrv_drain_recurse(bs);
160
161
if (recursive) {
162
+ bs->recursive_quiesce_counter++;
163
QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
164
bdrv_do_drained_begin(child->bs, true, child);
165
}
166
@@ -XXX,XX +XXX,XX @@ void bdrv_subtree_drained_begin(BlockDriverState *bs)
167
bdrv_do_drained_begin(bs, true, NULL);
168
}
169
170
-static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
171
- BdrvChild *parent)
172
+void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
173
+ BdrvChild *parent)
174
{
175
BdrvChild *child, *next;
176
int old_quiesce_counter;
177
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
178
}
179
180
if (recursive) {
181
+ bs->recursive_quiesce_counter--;
182
QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
183
bdrv_do_drained_end(child->bs, true, child);
184
}
185
@@ -XXX,XX +XXX,XX @@ void bdrv_subtree_drained_end(BlockDriverState *bs)
186
bdrv_do_drained_end(bs, true, NULL);
187
}
188
189
+void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
190
+{
191
+ int i;
192
+
193
+ for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
194
+ bdrv_do_drained_begin(child->bs, true, child);
49
+ }
195
+ }
50
assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
196
+}
51
197
+
52
ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
198
+void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
199
+{
200
+ int i;
201
+
202
+ for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
203
+ bdrv_do_drained_end(child->bs, true, child);
204
+ }
205
+}
206
+
207
/*
208
* Wait for pending requests to complete on a single BlockDriverState subtree,
209
* and suspend block driver's internal I/O until next request arrives.
53
--
210
--
54
2.13.6
211
2.13.6
55
212
56
213
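The bookkeeping introduced above — counting how many of a node's active drain sections were recursive so they can be replayed when a child is attached (cf. bdrv_apply_subtree_drain) — can be sketched in miniature. The structures and names below are illustrative, not QEMU's:

```c
#include <assert.h>
#include <stddef.h>

#define MAX_CHILDREN 4

typedef struct Node {
    int quiesce_counter;           /* how many drain sections cover us */
    int recursive_quiesce_counter; /* how many of those were recursive */
    struct Node *children[MAX_CHILDREN];
    int nchildren;
} Node;

static void drained_begin(Node *n, int recursive)
{
    n->quiesce_counter++;
    if (recursive) {
        n->recursive_quiesce_counter++;
        for (int i = 0; i < n->nchildren; i++) {
            drained_begin(n->children[i], 1);
        }
    }
}

static void drained_end(Node *n, int recursive)
{
    n->quiesce_counter--;
    if (recursive) {
        n->recursive_quiesce_counter--;
        for (int i = 0; i < n->nchildren; i++) {
            drained_end(n->children[i], 1);
        }
    }
}

/* When a child is attached to a new parent, replay the parent's active
 * recursive drain sections onto the child, so the child ends up as
 * quiesced as if it had been attached all along. */
static void attach_child(Node *parent, Node *child)
{
    assert(parent->nchildren < MAX_CHILDREN);
    parent->children[parent->nchildren++] = child;
    for (int i = 0; i < parent->recursive_quiesce_counter; i++) {
        drained_begin(child, 1);
    }
}
```

A child attached inside two nested subtree drains immediately reaches quiesce_counter == 2, and the matching drained_end calls on the parent bring both counters back to zero.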
1
From: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>
2
3
blk_get_aio_context checks whether BlockDriverState bs is non-NULL,
4
returning bdrv_get_aio_context(bs) if so or qemu_get_aio_context()
5
otherwise. However, bdrv_get_aio_context from block.c already does
6
this verification itself, also returning qemu_get_aio_context()
7
if bs is NULL:
8
9
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
10
{
11
return bs ? bs->aio_context : qemu_get_aio_context();
12
}
13
14
This patch simplifies blk_get_aio_context to simply call
15
bdrv_get_aio_context instead of replicating the same logic.
16
17
Signed-off-by: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>
18
Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
19
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
1
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
20
---
2
---
21
block/block-backend.c | 8 +-------
3
tests/test-bdrv-drain.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
22
1 file changed, 1 insertion(+), 7 deletions(-)
4
1 file changed, 80 insertions(+)
23
5
24
diff --git a/block/block-backend.c b/block/block-backend.c
6
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
25
index XXXXXXX..XXXXXXX 100644
7
index XXXXXXX..XXXXXXX 100644
26
--- a/block/block-backend.c
8
--- a/tests/test-bdrv-drain.c
27
+++ b/block/block-backend.c
9
+++ b/tests/test-bdrv-drain.c
28
@@ -XXX,XX +XXX,XX @@ void blk_op_unblock_all(BlockBackend *blk, Error *reason)
10
@@ -XXX,XX +XXX,XX @@ static void test_multiparent(void)
29
11
blk_unref(blk_b);
30
AioContext *blk_get_aio_context(BlockBackend *blk)
31
{
32
- BlockDriverState *bs = blk_bs(blk);
33
-
34
- if (bs) {
35
- return bdrv_get_aio_context(bs);
36
- } else {
37
- return qemu_get_aio_context();
38
- }
39
+ return bdrv_get_aio_context(blk_bs(blk));
40
}
12
}
41
13
42
static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
14
+static void test_graph_change(void)
15
+{
16
+ BlockBackend *blk_a, *blk_b;
17
+ BlockDriverState *bs_a, *bs_b, *backing;
18
+ BDRVTestState *a_s, *b_s, *backing_s;
19
+
20
+ blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
21
+ bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
22
+ &error_abort);
23
+ a_s = bs_a->opaque;
24
+ blk_insert_bs(blk_a, bs_a, &error_abort);
25
+
26
+ blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
27
+ bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
28
+ &error_abort);
29
+ b_s = bs_b->opaque;
30
+ blk_insert_bs(blk_b, bs_b, &error_abort);
31
+
32
+ backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
33
+ backing_s = backing->opaque;
34
+ bdrv_set_backing_hd(bs_a, backing, &error_abort);
35
+
36
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
37
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
38
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
39
+ g_assert_cmpint(a_s->drain_count, ==, 0);
40
+ g_assert_cmpint(b_s->drain_count, ==, 0);
41
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
42
+
43
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
44
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
45
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
46
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
47
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
48
+
49
+ bdrv_set_backing_hd(bs_b, backing, &error_abort);
50
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
51
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
52
+ g_assert_cmpint(backing->quiesce_counter, ==, 5);
53
+ g_assert_cmpint(a_s->drain_count, ==, 5);
54
+ g_assert_cmpint(b_s->drain_count, ==, 5);
55
+ g_assert_cmpint(backing_s->drain_count, ==, 5);
56
+
57
+ bdrv_set_backing_hd(bs_b, NULL, &error_abort);
58
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 3);
59
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
60
+ g_assert_cmpint(backing->quiesce_counter, ==, 3);
61
+ g_assert_cmpint(a_s->drain_count, ==, 3);
62
+ g_assert_cmpint(b_s->drain_count, ==, 2);
63
+ g_assert_cmpint(backing_s->drain_count, ==, 3);
64
+
65
+ bdrv_set_backing_hd(bs_b, backing, &error_abort);
66
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
67
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
68
+ g_assert_cmpint(backing->quiesce_counter, ==, 5);
69
+ g_assert_cmpint(a_s->drain_count, ==, 5);
70
+ g_assert_cmpint(b_s->drain_count, ==, 5);
71
+ g_assert_cmpint(backing_s->drain_count, ==, 5);
72
+
73
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
74
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
75
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
76
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
77
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
78
+
79
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
80
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
81
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
82
+ g_assert_cmpint(a_s->drain_count, ==, 0);
83
+ g_assert_cmpint(b_s->drain_count, ==, 0);
84
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
85
+
86
+ bdrv_unref(backing);
87
+ bdrv_unref(bs_a);
88
+ bdrv_unref(bs_b);
89
+ blk_unref(blk_a);
90
+ blk_unref(blk_b);
91
+}
92
+
93
94
typedef struct TestBlockJob {
95
BlockJob common;
96
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
97
98
g_test_add_func("/bdrv-drain/nested", test_nested);
99
g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
100
+ g_test_add_func("/bdrv-drain/graph-change", test_graph_change);
101
102
g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
103
g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
43
--
104
--
44
2.13.6
105
2.13.6
45
106
46
107
1
This gets rid of more direct accesses to BlockJob fields from the
1
Since commit bde70715, base is the only node that is reopened in
2
job drivers.
2
commit_start(). This means that the code, which still involves an
3
explicit BlockReopenQueue, can now be simplified by using bdrv_reopen().
3
4
4
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Reviewed-by: Eric Blake <eblake@redhat.com>
6
Reviewed-by: Fam Zheng <famz@redhat.com>
6
Reviewed-by: Max Reitz <mreitz@redhat.com>
7
Reviewed-by: John Snow <jsnow@redhat.com>
8
---
7
---
9
include/block/blockjob_int.h | 8 ++++++++
8
block/commit.c | 8 +-------
10
block/backup.c | 18 +++++++-----------
9
1 file changed, 1 insertion(+), 7 deletions(-)
11
block/commit.c | 4 ++--
12
block/mirror.c | 5 +----
13
block/stream.c | 4 ++--
14
blockjob.c | 9 +++++++++
15
6 files changed, 29 insertions(+), 19 deletions(-)
16
10
17
diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
18
index XXXXXXX..XXXXXXX 100644
19
--- a/include/block/blockjob_int.h
20
+++ b/include/block/blockjob_int.h
21
@@ -XXX,XX +XXX,XX @@ void block_job_sleep_ns(BlockJob *job, int64_t ns);
22
void block_job_yield(BlockJob *job);
23
24
/**
25
+ * block_job_ratelimit_get_delay:
26
+ *
27
+ * Calculate and return delay for the next request in ns. See the documentation
28
+ * of ratelimit_calculate_delay() for details.
29
+ */
30
+int64_t block_job_ratelimit_get_delay(BlockJob *job, uint64_t n);
31
+
32
+/**
33
* block_job_early_fail:
34
* @bs: The block device.
35
*
36
diff --git a/block/backup.c b/block/backup.c
37
index XXXXXXX..XXXXXXX 100644
38
--- a/block/backup.c
39
+++ b/block/backup.c
40
@@ -XXX,XX +XXX,XX @@ static void backup_complete(BlockJob *job, void *opaque)
41
42
static bool coroutine_fn yield_and_check(BackupBlockJob *job)
43
{
44
+ uint64_t delay_ns;
45
+
46
if (block_job_is_cancelled(&job->common)) {
47
return true;
48
}
49
50
- /* we need to yield so that bdrv_drain_all() returns.
51
- * (without, VM does not reboot)
52
- */
53
- if (job->common.speed) {
54
- uint64_t delay_ns = ratelimit_calculate_delay(&job->common.limit,
55
- job->bytes_read);
56
- job->bytes_read = 0;
57
- block_job_sleep_ns(&job->common, delay_ns);
58
- } else {
59
- block_job_sleep_ns(&job->common, 0);
60
- }
61
+ /* We need to yield even for delay_ns = 0 so that bdrv_drain_all() can
62
+ * return. Without a yield, the VM would not reboot. */
63
+ delay_ns = block_job_ratelimit_get_delay(&job->common, job->bytes_read);
64
+ job->bytes_read = 0;
65
+ block_job_sleep_ns(&job->common, delay_ns);
66
67
if (block_job_is_cancelled(&job->common)) {
68
return true;
69
diff --git a/block/commit.c b/block/commit.c
11
diff --git a/block/commit.c b/block/commit.c
70
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
71
--- a/block/commit.c
13
--- a/block/commit.c
72
+++ b/block/commit.c
14
+++ b/block/commit.c
73
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn commit_run(void *opaque)
15
@@ -XXX,XX +XXX,XX @@ void commit_start(const char *job_id, BlockDriverState *bs,
74
/* Publish progress */
16
const char *filter_node_name, Error **errp)
75
block_job_progress_update(&s->common, n);
76
77
- if (copy && s->common.speed) {
78
- delay_ns = ratelimit_calculate_delay(&s->common.limit, n);
79
+ if (copy) {
80
+ delay_ns = block_job_ratelimit_get_delay(&s->common, n);
81
} else {
82
delay_ns = 0;
83
}
84
diff --git a/block/mirror.c b/block/mirror.c
85
index XXXXXXX..XXXXXXX 100644
86
--- a/block/mirror.c
87
+++ b/block/mirror.c
88
@@ -XXX,XX +XXX,XX @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
89
assert(io_bytes);
90
offset += io_bytes;
91
nb_chunks -= DIV_ROUND_UP(io_bytes, s->granularity);
92
- if (s->common.speed) {
93
- delay_ns = ratelimit_calculate_delay(&s->common.limit,
94
- io_bytes_acct);
95
- }
96
+ delay_ns = block_job_ratelimit_get_delay(&s->common, io_bytes_acct);
97
}
98
return delay_ns;
99
}
100
diff --git a/block/stream.c b/block/stream.c
101
index XXXXXXX..XXXXXXX 100644
102
--- a/block/stream.c
103
+++ b/block/stream.c
104
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
105
106
/* Publish progress */
107
block_job_progress_update(&s->common, n);
108
- if (copy && s->common.speed) {
109
- delay_ns = ratelimit_calculate_delay(&s->common.limit, n);
110
+ if (copy) {
111
+ delay_ns = block_job_ratelimit_get_delay(&s->common, n);
112
} else {
113
delay_ns = 0;
114
}
115
diff --git a/blockjob.c b/blockjob.c
116
index XXXXXXX..XXXXXXX 100644
117
--- a/blockjob.c
118
+++ b/blockjob.c
119
@@ -XXX,XX +XXX,XX @@ void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
120
block_job_enter_cond(job, block_job_timer_pending);
121
}
122
123
+int64_t block_job_ratelimit_get_delay(BlockJob *job, uint64_t n)
124
+{
125
+ if (!job->speed) {
126
+ return 0;
127
+ }
128
+
129
+ return ratelimit_calculate_delay(&job->limit, n);
130
+}
131
+
132
void block_job_complete(BlockJob *job, Error **errp)
133
{
17
{
134
/* Should not be reachable via external interface for internal jobs */
18
CommitBlockJob *s;
19
- BlockReopenQueue *reopen_queue = NULL;
20
int orig_base_flags;
21
BlockDriverState *iter;
22
BlockDriverState *commit_top_bs = NULL;
23
@@ -XXX,XX +XXX,XX @@ void commit_start(const char *job_id, BlockDriverState *bs,
24
/* convert base to r/w, if necessary */
25
orig_base_flags = bdrv_get_flags(base);
26
if (!(orig_base_flags & BDRV_O_RDWR)) {
27
- reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
28
- orig_base_flags | BDRV_O_RDWR);
29
- }
30
-
31
- if (reopen_queue) {
32
- bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err);
33
+ bdrv_reopen(base, orig_base_flags | BDRV_O_RDWR, &local_err);
34
if (local_err != NULL) {
35
error_propagate(errp, local_err);
36
goto fail;
135
--
37
--
136
2.13.6
38
2.13.6
137
39
138
40
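For illustration, the slice-based accounting that block_job_ratelimit_get_delay() delegates to ratelimit_calculate_delay() can be sketched as follows. This is a simplified standalone model, not QEMU's actual ratelimit.h; the DemoRateLimit type, field names, and function name are all invented for this sketch. The idea is the same: account the bytes processed in the current time slice, and once the per-slice quota is exceeded, tell the job to sleep until the slice ends.

```c
#include <stdint.h>

/* Hypothetical, simplified model of a slice-based rate limiter. */
typedef struct DemoRateLimit {
    int64_t slice_start_ns;   /* beginning of the current accounting slice */
    int64_t slice_len_ns;     /* slice length, e.g. 100 ms */
    uint64_t slice_quota;     /* bytes allowed per slice */
    uint64_t dispatched;      /* bytes accounted in the current slice */
} DemoRateLimit;

/* Returns how long the caller should sleep (in ns); 0 means no delay.
 * The caller passes the current time instead of reading a clock so the
 * sketch stays deterministic. */
int64_t demo_ratelimit_calculate_delay(DemoRateLimit *limit,
                                       int64_t now_ns, uint64_t n)
{
    if (now_ns - limit->slice_start_ns >= limit->slice_len_ns) {
        /* A new slice has started: reset the accounting. */
        limit->slice_start_ns = now_ns;
        limit->dispatched = 0;
    }
    limit->dispatched += n;
    if (limit->dispatched <= limit->slice_quota) {
        return 0;
    }
    /* Quota exceeded: sleep until the current slice is over. */
    return limit->slice_start_ns + limit->slice_len_ns - now_ns;
}
```

With a wrapper like the new block_job_ratelimit_get_delay(), the `!job->speed` check keeps unthrottled jobs on the fast path: they always get a delay of 0 without touching the limiter state.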
From: Eric Blake <eblake@redhat.com>

We are gradually moving away from sector-based interfaces, towards
byte-based. Make the change for the last few sector-based callbacks
in the null-co and null-aio drivers.

Note that since the null driver does nothing on writes, it trivially
supports the BDRV_REQ_FUA flag (all writes have already landed to
the same bit-bucket without needing an extra flush call). Also, since
the null driver does just as well with byte-based requests, we can
now avoid cycles wasted on read-modify-write by taking advantage of
the block layer now defaulting the alignment to 1 instead of 512.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/null.c | 45 +++++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/block/null.c b/block/null.c
index XXXXXXX..XXXXXXX 100644
--- a/block/null.c
+++ b/block/null.c
@@ -XXX,XX +XXX,XX @@ static int null_file_open(BlockDriverState *bs, QDict *options, int flags,
     }
     s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, false);
     qemu_opts_del(opts);
+    bs->supported_write_flags = BDRV_REQ_FUA;
     return ret;
 }
 
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int null_co_common(BlockDriverState *bs)
     return 0;
 }
 
-static coroutine_fn int null_co_readv(BlockDriverState *bs,
-                                      int64_t sector_num, int nb_sectors,
-                                      QEMUIOVector *qiov)
+static coroutine_fn int null_co_preadv(BlockDriverState *bs,
+                                       uint64_t offset, uint64_t bytes,
+                                       QEMUIOVector *qiov, int flags)
 {
     BDRVNullState *s = bs->opaque;
 
     if (s->read_zeroes) {
-        qemu_iovec_memset(qiov, 0, 0, nb_sectors * BDRV_SECTOR_SIZE);
+        qemu_iovec_memset(qiov, 0, 0, bytes);
     }
 
     return null_co_common(bs);
 }
 
-static coroutine_fn int null_co_writev(BlockDriverState *bs,
-                                       int64_t sector_num, int nb_sectors,
-                                       QEMUIOVector *qiov)
+static coroutine_fn int null_co_pwritev(BlockDriverState *bs,
+                                        uint64_t offset, uint64_t bytes,
+                                        QEMUIOVector *qiov, int flags)
 {
     return null_co_common(bs);
 }
 
@@ -XXX,XX +XXX,XX @@ static inline BlockAIOCB *null_aio_common(BlockDriverState *bs,
     return &acb->common;
 }
 
-static BlockAIOCB *null_aio_readv(BlockDriverState *bs,
-                                  int64_t sector_num, QEMUIOVector *qiov,
-                                  int nb_sectors,
-                                  BlockCompletionFunc *cb,
-                                  void *opaque)
+static BlockAIOCB *null_aio_preadv(BlockDriverState *bs,
+                                   uint64_t offset, uint64_t bytes,
+                                   QEMUIOVector *qiov, int flags,
+                                   BlockCompletionFunc *cb,
+                                   void *opaque)
 {
     BDRVNullState *s = bs->opaque;
 
     if (s->read_zeroes) {
-        qemu_iovec_memset(qiov, 0, 0, nb_sectors * BDRV_SECTOR_SIZE);
+        qemu_iovec_memset(qiov, 0, 0, bytes);
     }
 
     return null_aio_common(bs, cb, opaque);
 }
 
-static BlockAIOCB *null_aio_writev(BlockDriverState *bs,
-                                   int64_t sector_num, QEMUIOVector *qiov,
-                                   int nb_sectors,
-                                   BlockCompletionFunc *cb,
-                                   void *opaque)
+static BlockAIOCB *null_aio_pwritev(BlockDriverState *bs,
+                                    uint64_t offset, uint64_t bytes,
+                                    QEMUIOVector *qiov, int flags,
+                                    BlockCompletionFunc *cb,
+                                    void *opaque)
 {
     return null_aio_common(bs, cb, opaque);
 }
 
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_null_co = {
     .bdrv_close             = null_close,
     .bdrv_getlength         = null_getlength,
 
-    .bdrv_co_readv          = null_co_readv,
-    .bdrv_co_writev         = null_co_writev,
+    .bdrv_co_preadv         = null_co_preadv,
+    .bdrv_co_pwritev        = null_co_pwritev,
     .bdrv_co_flush_to_disk  = null_co_flush,
     .bdrv_reopen_prepare    = null_reopen_prepare,
 
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_null_aio = {
     .bdrv_close             = null_close,
     .bdrv_getlength         = null_getlength,
 
-    .bdrv_aio_readv         = null_aio_readv,
-    .bdrv_aio_writev        = null_aio_writev,
+    .bdrv_aio_preadv        = null_aio_preadv,
+    .bdrv_aio_pwritev       = null_aio_pwritev,
     .bdrv_aio_flush         = null_aio_flush,
     .bdrv_reopen_prepare    = null_reopen_prepare,
 
--
2.13.6

The bdrv_reopen*() implementation doesn't like it if the graph is
changed between queuing nodes for reopen and actually reopening them
(one of the reasons is that queuing can be recursive).

So instead of draining the device only in bdrv_reopen_multiple(),
require that callers already drained all affected nodes, and assert this
in bdrv_reopen_queue().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
---
 block.c             | 23 ++++++++++++++++-------
 block/replication.c |  6 ++++++
 qemu-io-cmds.c      |  3 +++
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ BlockDriverState *bdrv_open(const char *filename, const char *reference,
  * returns a pointer to bs_queue, which is either the newly allocated
  * bs_queue, or the existing bs_queue being used.
  *
+ * bs must be drained between bdrv_reopen_queue() and bdrv_reopen_multiple().
  */
 static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
                                                  BlockDriverState *bs,
@@ -XXX,XX +XXX,XX @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
     BdrvChild *child;
     QDict *old_options, *explicit_options;
 
+    /* Make sure that the caller remembered to use a drained section. This is
+     * important to avoid graph changes between the recursive queuing here and
+     * bdrv_reopen_multiple(). */
+    assert(bs->quiesce_counter > 0);
+
     if (bs_queue == NULL) {
         bs_queue = g_new0(BlockReopenQueue, 1);
         QSIMPLEQ_INIT(bs_queue);
@@ -XXX,XX +XXX,XX @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
  * If all devices prepare successfully, then the changes are committed
  * to all devices.
  *
+ * All affected nodes must be drained between bdrv_reopen_queue() and
+ * bdrv_reopen_multiple().
  */
 int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp)
 {
@@ -XXX,XX +XXX,XX @@ int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **er
 
     assert(bs_queue != NULL);
 
-    aio_context_release(ctx);
-    bdrv_drain_all_begin();
-    aio_context_acquire(ctx);
-
     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
+        assert(bs_entry->state.bs->quiesce_counter > 0);
         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
             error_propagate(errp, local_err);
             goto cleanup;
@@ -XXX,XX +XXX,XX @@ cleanup:
     }
     g_free(bs_queue);
 
-    bdrv_drain_all_end();
-
     return ret;
 }
 
@@ -XXX,XX +XXX,XX @@ int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
 {
     int ret = -1;
     Error *local_err = NULL;
-    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
+    BlockReopenQueue *queue;
 
+    bdrv_subtree_drained_begin(bs);
+
+    queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
     ret = bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, &local_err);
     if (local_err != NULL) {
         error_propagate(errp, local_err);
     }
+
+    bdrv_subtree_drained_end(bs);
+
     return ret;
 }
 
diff --git a/block/replication.c b/block/replication.c
index XXXXXXX..XXXXXXX 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -XXX,XX +XXX,XX @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
         new_secondary_flags = s->orig_secondary_flags;
     }
 
+    bdrv_subtree_drained_begin(s->hidden_disk->bs);
+    bdrv_subtree_drained_begin(s->secondary_disk->bs);
+
     if (orig_hidden_flags != new_hidden_flags) {
         reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs, NULL,
                                          new_hidden_flags);
@@ -XXX,XX +XXX,XX @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
                              reopen_queue, &local_err);
         error_propagate(errp, local_err);
     }
+
+    bdrv_subtree_drained_end(s->hidden_disk->bs);
+    bdrv_subtree_drained_end(s->secondary_disk->bs);
 }
 
 static void backup_job_cleanup(BlockDriverState *bs)
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -XXX,XX +XXX,XX @@ static int reopen_f(BlockBackend *blk, int argc, char **argv)
     opts = qopts ? qemu_opts_to_qdict(qopts, NULL) : NULL;
     qemu_opts_reset(&reopen_opts);
 
+    bdrv_subtree_drained_begin(bs);
     brq = bdrv_reopen_queue(NULL, bs, opts, flags);
     bdrv_reopen_multiple(bdrv_get_aio_context(bs), brq, &local_err);
+    bdrv_subtree_drained_end(bs);
+
    if (local_err) {
        error_report_err(local_err);
    } else {
--
2.13.6
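The sector-to-byte interface conversion above is mostly mechanical: a legacy sector-based callback is the byte-based one with offset and length scaled by the 512-byte sector size. The following sketch illustrates that relationship; the demo_* names and the 1 GiB demo disk are invented for illustration, only the 512-byte sector size matches QEMU's BDRV_SECTOR_SIZE.

```c
#include <stdint.h>

#define DEMO_SECTOR_SIZE 512ULL
#define DEMO_DISK_BYTES  (1ULL << 30)   /* hypothetical 1 GiB device */

/* Byte-based interface: the granularity the block layer now prefers,
 * with alignment defaulting to 1 instead of 512. Returns 0 on success,
 * -1 if the request runs past the end of the demo disk. */
int demo_preadv(uint64_t offset, uint64_t bytes)
{
    /* a real driver would fill the request's iovec here */
    return (offset + bytes > DEMO_DISK_BYTES) ? -1 : 0;
}

/* Legacy sector-based wrapper: every request is implicitly a multiple
 * of 512 bytes, which is exactly what the byte-based interface no
 * longer forces on callers. */
int demo_readv(int64_t sector_num, int nb_sectors)
{
    return demo_preadv((uint64_t)sector_num * DEMO_SECTOR_SIZE,
                       (uint64_t)nb_sectors * DEMO_SECTOR_SIZE);
}
```

Going in this direction (byte-based core, sector-based shim) also shows why sub-sector requests on the old interface required read-modify-write: the shim simply cannot express an offset or length that is not a multiple of 512.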
Deleted patch
From: Alberto Garcia <berto@igalia.com>

Compressed clusters are not supposed to have the COPIED bit set, but
this is not made explicit in the specs, so let's document it.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: 74552e1d6e858d3159cb0c0e188e80bc9248e337.1523376013.git.berto@igalia.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 docs/interop/qcow2.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt
index XXXXXXX..XXXXXXX 100644
--- a/docs/interop/qcow2.txt
+++ b/docs/interop/qcow2.txt
@@ -XXX,XX +XXX,XX @@ L2 table entry:
                     62:    0 for standard clusters
                            1 for compressed clusters
 
-                    63:    0 for a cluster that is unused or requires COW, 1 if its
-                           refcount is exactly one. This information is only accurate
-                           in L2 tables that are reachable from the active L1
-                           table.
+                    63:    0 for clusters that are unused, compressed or require COW.
+                           1 for standard clusters whose refcount is exactly one.
+                           This information is only accurate in L2 tables
+                           that are reachable from the active L1 table.
 
 Standard Cluster Descriptor:
 
--
2.13.6
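The flag layout that this documentation patch clarifies can be checked mechanically. The sketch below decodes the two top bits of a qcow2 L2 table entry as described above: bit 62 marks a compressed cluster, and bit 63 (COPIED) may only be set on standard clusters whose refcount is exactly one. The bit positions follow the spec text; the demo_* helper names are invented for illustration.

```c
#include <stdint.h>
#include <stdbool.h>

/* Bit 62: 0 for standard clusters, 1 for compressed clusters. */
#define DEMO_L2E_COMPRESSED (1ULL << 62)
/* Bit 63 (COPIED): 1 only for standard clusters with refcount == 1. */
#define DEMO_L2E_COPIED     (1ULL << 63)

bool demo_l2_entry_is_compressed(uint64_t entry)
{
    return (entry & DEMO_L2E_COMPRESSED) != 0;
}

/* Per the clarified spec text, a well-formed entry never has COPIED set
 * on a compressed cluster: compressed clusters always have it reset. */
bool demo_l2_entry_is_valid(uint64_t entry)
{
    return !((entry & DEMO_L2E_COMPRESSED) && (entry & DEMO_L2E_COPIED));
}
```

A consistency checker built on such a predicate would flag exactly the malformed images that motivated the earlier "error message about compressed clusters with OFLAG_COPIED" fix in this series.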
Deleted patch
From: Alberto Garcia <berto@igalia.com>

We have just reduced the refcount cache size to the minimum unless
the user explicitly requests a larger one, so we have to update the
documentation to reflect this change.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: c5f0bde23558dd9d33b21fffc76ac9953cc19c56.1523968389.git.berto@igalia.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 docs/qcow2-cache.txt | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/docs/qcow2-cache.txt b/docs/qcow2-cache.txt
index XXXXXXX..XXXXXXX 100644
--- a/docs/qcow2-cache.txt
+++ b/docs/qcow2-cache.txt
@@ -XXX,XX +XXX,XX @@ There are three options available, and all of them take bytes:
 "refcount-cache-size":   maximum size of the refcount block cache
 "cache-size":            maximum size of both caches combined
 
-There are two things that need to be taken into account:
+There are a few things that need to be taken into account:
 
 - Both caches must have a size that is a multiple of the cluster size
   (or the cache entry size: see "Using smaller cache sizes" below).
 
-- If you only set one of the options above, QEMU will automatically
-  adjust the others so that the L2 cache is 4 times bigger than the
-  refcount cache.
+- The default L2 cache size is 8 clusters or 1MB (whichever is more),
+  and the minimum is 2 clusters (or 2 cache entries, see below).
 
-This means that these options are equivalent:
+- The default (and minimum) refcount cache size is 4 clusters.
 
-   -drive file=hd.qcow2,l2-cache-size=2097152
-   -drive file=hd.qcow2,refcount-cache-size=524288
-   -drive file=hd.qcow2,cache-size=2621440
+- If only "cache-size" is specified then QEMU will assign as much
+  memory as possible to the L2 cache before increasing the refcount
+  cache size.
 
-The reason for this 1/4 ratio is to ensure that both caches cover the
-same amount of disk space. Note however that this is only valid with
-the default value of refcount_bits (16). If you are using a different
-value you might want to calculate both cache sizes yourself since QEMU
-will always use the same 1/4 ratio.
+Unlike L2 tables, refcount blocks are not used during normal I/O but
+only during allocations and internal snapshots. In most cases they are
+accessed sequentially (even during random guest I/O) so increasing the
+refcount cache size won't have any measurable effect in performance
+(this can change if you are using internal snapshots, so you may want
+to think about increasing the cache size if you use them heavily).
 
-It's also worth mentioning that there's no strict need for both caches
-to cover the same amount of disk space. The refcount cache is used
-much less often than the L2 cache, so it's perfectly reasonable to
-keep it small.
+Before QEMU 2.12 the refcount cache had a default size of 1/4 of the
+L2 cache size. This resulted in unnecessarily large caches, so now the
+refcount cache is as small as possible unless overridden by the user.
 
 
 Using smaller cache entries
--
2.13.6
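The new default sizing rules documented above are simple enough to state as code. The following sketch computes the QEMU 2.12 defaults exactly as the updated text describes them (L2 cache: 8 clusters or 1 MB, whichever is more; refcount cache: 4 clusters, now independent of the L2 cache size). The function names are invented for illustration and do not correspond to QEMU internals.

```c
#include <stdint.h>

#define DEMO_MIB (1024ULL * 1024)

/* "The default L2 cache size is 8 clusters or 1MB (whichever is more)" */
uint64_t demo_default_l2_cache_size(uint64_t cluster_size)
{
    uint64_t eight_clusters = 8 * cluster_size;
    return eight_clusters > DEMO_MIB ? eight_clusters : DEMO_MIB;
}

/* "The default (and minimum) refcount cache size is 4 clusters" --
 * no longer tied to the L2 cache by the pre-2.12 1/4 ratio. */
uint64_t demo_default_refcount_cache_size(uint64_t cluster_size)
{
    return 4 * cluster_size;
}
```

For the common 64 KB cluster size, 8 clusters is only 512 KB, so the 1 MB floor wins for the L2 cache, while the refcount cache drops to 256 KB instead of the 1/4-of-L2 value it would have had before.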